cpan/Encode/bin/enc2xs

   1 #!./perl
   2 BEGIN {
   3     # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
   4     # with $ENV{PERL_CORE} set
   5     # In case we need it in future...
   6     require Config; import Config;
   7 }
   8 use strict;
   9 use warnings;
  10 use Getopt::Std;
  11 use Config;
  12 my @orig_ARGV = @ARGV;
  13 our $VERSION  = do { my @r = (q$Revision: 2.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  14
  15 # These may get re-ordered.
  16 # RAW is a do_now as inserted by &enter
  17 # AGG is an aggreagated do_now, as built up by &process
  18
  19 use constant {
  20   RAW_NEXT => 0,
  21   RAW_IN_LEN => 1,
  22   RAW_OUT_BYTES => 2,
  23   RAW_FALLBACK => 3,
  24
  25   AGG_MIN_IN => 0,
  26   AGG_MAX_IN => 1,
  27   AGG_OUT_BYTES => 2,
  28   AGG_NEXT => 3,
  29   AGG_IN_LEN => 4,
  30   AGG_OUT_LEN => 5,
  31   AGG_FALLBACK => 6,
  32 };
  33
  34 # (See the algorithm in encengine.c - we're building structures for it)
  35
  36 # There are two sorts of structures.
  37 # "do_now" (an array, two variants of what needs storing) is whatever we need
  38 # to do now we've read an input byte.
  39 # It's housed in a "do_next" (which is how we got to it), and in turn points
  40 # to a "do_next" which contains all the "do_now"s for the next input byte.
  41
  42 # There will be a "do_next" which is the start state.
  43 # For a single byte encoding it's the only "do_next" - each "do_now" points
  44 # back to it, and each "do_now" will cause bytes. There is no state.
  45
  46 # For a multi-byte encoding where all characters in the input are the same
  47 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  48 # branching out from the start state, one step for each input byte.
  49 # The leaf "do_now"s will all be at the same distance from the start state,
  50 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  51 # the start state.
  52
  53 # For an encoding where there are varaible length input byte sequences, you
  54 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  55 # as before the leaves will point back to the start state.
  56
  57 # The system will cope with escape encodings (imagine them as a mostly
  58 # self-contained tree for each escape state, and cross links between trees
  59 # at the state-switching characters) but so far no input format defines these.
  60
  61 # The system will also cope with having output "leaves" in the middle of
  62 # the bifurcating branches, not just at the extremities, but again no
  63 # input format does this yet.
  64
  65 # There are two variants of the "do_now" structure. The first, smaller variant
  66 # is generated by &enter as the input file is read. There is one structure
  67 # for each input byte. Say we are mapping a single byte encoding to a
  68 # single byte encoding, with  "ABCD" going "abcd". There will be
  69 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  70
  71 # &process then walks the tree, building aggregate "do_now" structres for
  72 # adjacent bytes where possible. The aggregate is for a contiguous range of
  73 # bytes which each produce the same length of output, each move to the
  74 # same next state, and each have the same fallback flag.
  75 # So our 4 RAW "do_now"s above become replaced by a single structure
  76 # containing:
  77 # ["A", "D", "abcd", 1, ...]
  78 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  79 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  80 # which maps very nicely into pointer arithmetic in C for encengine.c
  81
  82 sub encode_U
  83 {
  84  # UTF-8 encode long hand - only covers part of perl's range
  85  ## my $uv = shift;
  86  # chr() works in native space so convert value from table
  87  # into that space before using chr().
  88  my $ch = chr(utf8::unicode_to_native($_[0]));
  89  # Now get core perl to encode that the way it likes.
  90  utf8::encode($ch);
  91  return $ch;
  92 }
  93
  94 sub encode_S
  95 {
  96  # encode single byte
  97  ## my ($ch,$page) = @_; return chr($ch);
  98  return chr $_[0];
  99 }
 100
 101 sub encode_D
 102 {
 103  # encode double byte MS byte first
 104  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 105  return chr ($_[1]) . chr $_[0];
 106 }
 107
 108 sub encode_M
 109 {
 110  # encode Multi-byte - single for 0..255 otherwise double
 111  ## my ($ch,$page) = @_;
 112  ## return &encode_D if $page;
 113  ## return &encode_S;
 114  return chr ($_[1]) . chr $_[0] if $_[1];
 115  return chr $_[0];
 116 }
 117
 118 my %encode_types = (U => \&encode_U,
 119                     S => \&encode_S,
 120                     D => \&encode_D,
 121                     M => \&encode_M,
 122                    );
 123
 124 # Win32 does not expand globs on command line
 125 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 126
 127 my %opt;
 128 # I think these are:
 129 # -Q to disable the duplicate codepoint test
 130 # -S make mapping errors fatal
 131 # -q to remove comments written to output files
 132 # -O to enable the (brute force) substring optimiser
 133 # -o <output> to specify the output file name (else it's the first arg)
 134 # -f <inlist> to give a file with a list of input files (else use the args)
 135 # -n <name> to name the encoding (else use the basename of the input file.
 136 getopts('CM:SQqOo:f:n:',\%opt);
 137
 138 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 139 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
 140
 141 # This really should go first, else the die here causes empty (non-erroneous)
 142 # output files to be written.
 143 my @encfiles;
 144 if (exists $opt{'f'}) {
 145     # -F is followed by name of file containing list of filenames
 146     my $flist = $opt{'f'};
 147     open(FLIST,$flist) || die "Cannot open $flist:$!";
 148     chomp(@encfiles = <FLIST>);
 149     close(FLIST);
 150 } else {
 151     @encfiles = @ARGV;
 152 }
 153
 154 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 155 chmod(0666,$cname) if -f $cname && !-w $cname;
 156 open(C,">$cname") || die "Cannot open $cname:$!";
 157
 158 my $dname = $cname;
 159 my $hname = $cname;
 160
 161 my ($doC,$doEnc,$doUcm,$doPet);
 162
 163 if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
 164  {
 165   $doC = 1;
 166   $dname =~ s/(\.[^\.]*)?$/.exh/;
 167   chmod(0666,$dname) if -f $cname && !-w $dname;
 168   open(D,">$dname") || die "Cannot open $dname:$!";
 169   $hname =~ s/(\.[^\.]*)?$/.h/;
 170   chmod(0666,$hname) if -f $cname && !-w $hname;
 171   open(H,">$hname") || die "Cannot open $hname:$!";
 172
 173   foreach my $fh (\*C,\*D,\*H)
 174   {
 175    print $fh <<"END" unless $opt{'q'};
 176 /*
 177  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 178  This file was autogenerated by:
 179  $^X $0 @orig_ARGV
 180  enc2xs VERSION $VERSION
 181 */
 182 END
 183   }
 184
 185   if ($cname =~ /(\w+)\.xs$/)
 186    {
 187     print C "#include <EXTERN.h>\n";
 188     print C "#include <perl.h>\n";
 189     print C "#include <XSUB.h>\n";
 190     print C "#define U8 U8\n";
 191    }
 192   print C "#include \"encode.h\"\n\n";
 193
 194  }
 195 elsif ($cname =~ /\.enc$/)
 196  {
 197   $doEnc = 1;
 198  }
 199 elsif ($cname =~ /\.ucm$/)
 200  {
 201   $doUcm = 1;
 202  }
 203 elsif ($cname =~ /\.pet$/)
 204  {
 205   $doPet = 1;
 206  }
 207
 208 my %encoding;
 209 my %strings;
 210 my $string_acc;
 211 my %strings_in_acc;
 212
 213 my $saved = 0;
 214 my $subsave = 0;
 215 my $strings = 0;
 216
 217 sub cmp_name
 218 {
 219  if ($a =~ /^.*-(\d+)/)
 220   {
 221    my $an = $1;
 222    if ($b =~ /^.*-(\d+)/)
 223     {
 224      my $r = $an <=> $1;
 225      return $r if $r;
 226     }
 227   }
 228  return $a cmp $b;
 229 }
 230
 231
 232 foreach my $enc (sort cmp_name @encfiles)
 233  {
 234   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 235   $name = $opt{'n'} if exists $opt{'n'};
 236   if (open(E,$enc))
 237    {
 238     if ($sfx eq 'enc')
 239      {
 240       compile_enc(\*E,lc($name));
 241      }
 242     else
 243      {
 244       compile_ucm(\*E,lc($name));
 245      }
 246    }
 247   else
 248    {
 249     warn "Cannot open $enc for $name:$!";
 250    }
 251  }
 252
 253 if ($doC)
 254  {
 255   print STDERR "Writing compiled form\n";
 256   foreach my $name (sort cmp_name keys %encoding)
 257    {
 258     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 259     process($name.'_utf8',$e2u);
 260     addstrings(\*C,$e2u);
 261
 262     process('utf8_'.$name,$u2e);
 263     addstrings(\*C,$u2e);
 264    }
 265   outbigstring(\*C,"enctable");
 266   foreach my $name (sort cmp_name keys %encoding)
 267    {
 268     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 269     outtable(\*C,$e2u, "enctable");
 270     outtable(\*C,$u2e, "enctable");
 271
 272     # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 273    }
 274   my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 275   my $exta = $cpp ? 'extern "C" ' : "static";
 276   my $extb = $cpp ? 'extern "C" ' : "";
 277   foreach my $enc (sort cmp_name keys %encoding)
 278    {
 279     # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 280     my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
 281     #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 282     my $replen = 0;
 283     $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
 284     my $sym = "${enc}_encoding";
 285     $sym =~ s/\W+/_/g;
 286     my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
 287         $min_el,$max_el);
 288     print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
 289     print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
 290     print C "${extb} const encode_t $sym = \n";
 291     # This is to make null encoding work -- dankogai
 292     for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
 293     $info[$i] ||= 1;
 294     }
 295     # end of null tweak -- dankogai
 296     print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
 297    }
 298
 299   foreach my $enc (sort cmp_name keys %encoding)
 300    {
 301     my $sym = "${enc}_encoding";
 302     $sym =~ s/\W+/_/g;
 303     print H "extern encode_t $sym;\n";
 304     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 305    }
 306
 307   if ($cname =~ /(\w+)\.xs$/)
 308    {
 309     my $mod = $1;
 310     print C <<'END';
 311
 312 static void
 313 Encode_XSEncoding(pTHX_ encode_t *enc)
 314 {
 315  dSP;
 316  HV *stash = gv_stashpv("Encode::XS", TRUE);
 317  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 318  int i = 0;
 319  PUSHMARK(sp);
 320  XPUSHs(sv);
 321  while (enc->name[i])
 322   {
 323    const char *name = enc->name[i++];
 324    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 325   }
 326  PUTBACK;
 327  call_pv("Encode::define_encoding",G_DISCARD);
 328  SvREFCNT_dec(sv);
 329 }
 330
 331 END
 332
 333     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 334     print C "BOOT:\n{\n";
 335     print C "#include \"$dname\"\n";
 336     print C "}\n";
 337    }
 338   # Close in void context is bad, m'kay
 339   close(D) or warn "Error closing '$dname': $!";
 340   close(H) or warn "Error closing '$hname': $!";
 341
 342   my $perc_saved    = $saved/($strings + $saved) * 100;
 343   my $perc_subsaved = $subsave/($strings + $subsave) * 100;
 344   printf STDERR "%d bytes in string tables\n",$strings;
 345   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 346     $saved, $perc_saved              if $saved;
 347   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 348     $subsave, $perc_subsaved         if $subsave;
 349  }
 350 elsif ($doEnc)
 351  {
 352   foreach my $name (sort cmp_name keys %encoding)
 353    {
 354     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 355     output_enc(\*C,$name,$e2u);
 356    }
 357  }
 358 elsif ($doUcm)
 359  {
 360   foreach my $name (sort cmp_name keys %encoding)
 361    {
 362     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 363     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 364    }
 365  }
 366
 367 # writing half meg files and then not checking to see if you just filled the
 368 # disk is bad, m'kay
 369 close(C) or die "Error closing '$cname': $!";
 370
 371 # End of the main program.
 372
 373 sub compile_ucm
 374 {
 375  my ($fh,$name) = @_;
 376  my $e2u = {};
 377  my $u2e = {};
 378  my $cs;
 379  my %attr;
 380  while (<$fh>)
 381   {
 382    s/#.*$//;
 383    last if /^\s*CHARMAP\s*$/i;
 384    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 385     {
 386      $attr{$1} = $2;
 387     }
 388   }
 389  if (!defined($cs =  $attr{'code_set_name'}))
 390   {
 391    warn "No <code_set_name> in $name\n";
 392   }
 393  else
 394   {
 395    $name = $cs unless exists $opt{'n'};
 396   }
 397  my $erep;
 398  my $urep;
 399  my $max_el;
 400  my $min_el;
 401  if (exists $attr{'subchar'})
 402   {
 403    #my @byte;
 404    #$attr{'subchar'} =~ /^\s*/cg;
 405    #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 406    #$erep = join('',map(chr(hex($_)),@byte));
 407    $erep = $attr{'subchar'};
 408    $erep =~ s/^\s+//; $erep =~ s/\s+$//;
 409   }
 410  print "Reading $name ($cs)\n";
 411  my $nfb = 0;
 412  my $hfb = 0;
 413  while (<$fh>)
 414   {
 415    s/#.*$//;
 416    last if /^\s*END\s+CHARMAP\s*$/i;
 417    next if /^\s*$/;
 418    my (@uni, @byte) = ();
 419    my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
 420        or die "Bad line: $_";
 421    while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
 422        push @uni, map { substr($_, 1) } split(/\+/, $1);
 423    }
 424    while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
 425        push @byte, $1;
 426    }
 427    if (@uni)
 428     {
 429      my $uch =  join('', map { encode_U(hex($_)) } @uni );
 430      my $ech = join('',map(chr(hex($_)),@byte));
 431      my $el  = length($ech);
 432      $max_el = $el if (!defined($max_el) || $el > $max_el);
 433      $min_el = $el if (!defined($min_el) || $el < $min_el);
 434      if (length($fb))
 435       {
 436        $fb = substr($fb,1);
 437        $hfb++;
 438       }
 439      else
 440       {
 441        $nfb++;
 442        $fb = '0';
 443       }
 444      # $fb is fallback flag
 445      # 0 - round trip safe
 446      # 1 - fallback for unicode -> enc
 447      # 2 - skip sub-char mapping
 448      # 3 - fallback enc -> unicode
 449      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 450      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 451     }
 452    else
 453     {
 454      warn $_;
 455     }
 456   }
 457  if ($nfb && $hfb)
 458   {
 459    die "$nfb entries without fallback, $hfb entries with\n";
 460   }
 461  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 462 }
 463
 464
 465
 466 sub compile_enc
 467 {
 468  my ($fh,$name) = @_;
 469  my $e2u = {};
 470  my $u2e = {};
 471
 472  my $type;
 473  while ($type = <$fh>)
 474   {
 475    last if $type !~ /^\s*#/;
 476   }
 477  chomp($type);
 478  return if $type eq 'E';
 479  # Do the hash lookup once, rather than once per function call. 4% speedup.
 480  my $type_func = $encode_types{$type};
 481  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 482  warn "$type encoded $name\n";
 483  my $rep = '';
 484  # Save a defined test by setting these to defined values.
 485  my $min_el = ~0; # A very big integer
 486  my $max_el = 0;  # Anything must be longer than 0
 487  {
 488   my $v = hex($def);
 489   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 490  }
 491  my $errors;
 492  my $seen;
 493  # use -Q to silence the seen test. Makefile.PL uses this by default.
 494  $seen = {} unless $opt{Q};
 495  do
 496   {
 497    my $line = <$fh>;
 498    chomp($line);
 499    my $page = hex($line);
 500    my $ch = 0;
 501    my $i = 16;
 502    do
 503     {
 504      # So why is it 1% faster to leave the my here?
 505      my $line = <$fh>;
 506      $line =~ s/\r\n$/\n/;
 507      die "$.:${line}Line should be exactly 65 characters long including
 508      newline (".length($line).")" unless length ($line) == 65;
 509      # Split line into groups of 4 hex digits, convert groups to ints
 510      # This takes 65.35
 511      # map {hex $_} $line =~ /(....)/g
 512      # This takes 63.75 (2.5% less time)
 513      # unpack "n*", pack "H*", $line
 514      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 515      # Doing it as while ($line =~ /(....)/g) took 74.63
 516      foreach my $val (unpack "n*", pack "H*", $line)
 517       {
 518        next if $val == 0xFFFD;
 519        my $ech = &$type_func($ch,$page);
 520        if ($val || (!$ch && !$page))
 521         {
 522          my $el  = length($ech);
 523          $max_el = $el if $el > $max_el;
 524          $min_el = $el if $el < $min_el;
 525          my $uch = encode_U($val);
 526          if ($seen) {
 527            # We're doing the test.
 528            # We don't need to read this quickly, so storing it as a scalar,
 529            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 530            # RAM and may make us faster on low RAM systems. [see __END__]
 531            if (exists $seen->{$uch})
 532              {
 533                warn sprintf("U%04X is %02X%02X and %04X\n",
 534                             $val,$page,$ch,$seen->{$uch});
 535                $errors++;
 536              }
 537            else
 538              {
 539                $seen->{$uch} = $page << 8 | $ch;
 540              }
 541          }
 542          # Passing 2 extra args each time is 3.6% slower!
 543          # Even with having to add $fallback ||= 0 later
 544          enter_fb0($e2u,$ech,$uch);
 545          enter_fb0($u2e,$uch,$ech);
 546         }
 547        else
 548         {
 549          # No character at this position
 550          # enter($e2u,$ech,undef,$e2u);
 551         }
 552        $ch++;
 553       }
 554     } while --$i;
 555   } while --$pages;
 556  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 557    if $min_el > $max_el;
 558  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 559  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 560 }
 561
 562 # my ($a,$s,$d,$t,$fb) = @_;
 563 sub enter {
 564   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 565   # state we shift to after this (multibyte) input character defaults to same
 566   # as current state.
 567   $next ||= $current;
 568   # Making sure it is defined seems to be faster than {no warnings;} in
 569   # &process, or passing it in as 0 explicity.
 570   # XXX $fallback ||= 0;
 571
 572   # Start at the beginning and work forwards through the string to zero.
 573   # effectively we are removing 1 character from the front each time
 574   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 575   # Hence -$pos is the length of the remaining string.
 576   my $pos = -length $inbytes;
 577   while (1) {
 578     my $byte = substr $inbytes, $pos, 1;
 579     #  RAW_NEXT => 0,
 580     #  RAW_IN_LEN => 1,
 581     #  RAW_OUT_BYTES => 2,
 582     #  RAW_FALLBACK => 3,
 583     # to unicode an array would seem to be better, because the pages are dense.
 584     # from unicode can be very sparse, favouring a hash.
 585     # hash using the bytes (all length 1) as keys rather than ord value,
 586     # as it's easier to sort these in &process.
 587
 588     # It's faster to always add $fallback even if it's undef, rather than
 589     # choosing between 3 and 4 element array. (hence why we set it defined
 590     # above)
 591     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 592     # When $pos was -1 we were at the last input character.
 593     unless (++$pos) {
 594       $do_now->[RAW_OUT_BYTES] = $outbytes;
 595       $do_now->[RAW_NEXT] = $next;
 596       return;
 597     }
 598     # Tail recursion. The intermdiate state may not have a name yet.
 599     $current = $do_now->[RAW_NEXT];
 600   }
 601 }
 602
 603 # This is purely for optimistation. It's just &enter hard coded for $fallback
 604 # of 0, using only a 3 entry array ref to save memory for every entry.
 605 sub enter_fb0 {
 606   my ($current,$inbytes,$outbytes,$next) = @_;
 607   $next ||= $current;
 608
 609   my $pos = -length $inbytes;
 610   while (1) {
 611     my $byte = substr $inbytes, $pos, 1;
 612     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 613     unless (++$pos) {
 614       $do_now->[RAW_OUT_BYTES] = $outbytes;
 615       $do_now->[RAW_NEXT] = $next;
 616       return;
 617     }
 618     $current = $do_now->[RAW_NEXT];
 619   }
 620 }
 621
 622 sub process
 623 {
 624   my ($name,$a) = @_;
 625   $name =~ s/\W+/_/g;
 626   $a->{Cname} = $name;
 627   my $raw = $a->{Raw};
 628   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 629   my @ent;
 630   $agg_max_in = 0;
 631   foreach my $key (sort keys %$raw) {
 632     #  RAW_NEXT => 0,
 633     #  RAW_IN_LEN => 1,
 634     #  RAW_OUT_BYTES => 2,
 635     #  RAW_FALLBACK => 3,
 636     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 637     # Now we are converting from raw to aggregate, switch from 1 byte strings
 638     # to numbers
 639     my $b = ord $key;
 640     $fallback ||= 0;
 641     if ($l &&
 642         # If this == fails, we're going to reset $agg_max_in below anyway.
 643         $b == ++$agg_max_in &&
 644         # References in numeric context give the pointer as an int.
 645         $agg_next == $next &&
 646         $agg_in_len == $in_len &&
 647         $agg_out_len == length $out_bytes &&
 648         $agg_fallback == $fallback
 649         # && length($l->[AGG_OUT_BYTES]) < 16
 650        ) {
 651       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 652       # we can aggregate this byte onto the end.
 653       $l->[AGG_MAX_IN] = $b;
 654       $l->[AGG_OUT_BYTES] .= $out_bytes;
 655     } else {
 656       # AGG_MIN_IN => 0,
 657       # AGG_MAX_IN => 1,
 658       # AGG_OUT_BYTES => 2,
 659       # AGG_NEXT => 3,
 660       # AGG_IN_LEN => 4,
 661       # AGG_OUT_LEN => 5,
 662       # AGG_FALLBACK => 6,
 663       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 664       # (only gains .6% on euc-jp  -- is it worth it?)
 665       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 666                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 667                        $agg_fallback = $fallback];
 668     }
 669     if (exists $next->{Cname}) {
 670       $next->{'Forward'} = 1 if $next != $a;
 671     } else {
 672       process(sprintf("%s_%02x",$name,$b),$next);
 673     }
 674   }
 675   # encengine.c rules say that last entry must be for 255
 676   if ($agg_max_in < 255) {
 677     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 678   }
 679   $a->{'Entries'} = \@ent;
 680 }
 681
 682
 683 sub addstrings
 684 {
 685  my ($fh,$a) = @_;
 686  my $name = $a->{'Cname'};
 687  # String tables
 688  foreach my $b (@{$a->{'Entries'}})
 689   {
 690    next unless $b->[AGG_OUT_LEN];
 691    $strings{$b->[AGG_OUT_BYTES]} = undef;
 692   }
 693  if ($a->{'Forward'})
 694   {
 695    my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 696    my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
 697    my $const = $cpp ? '' : 'const';
 698    print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 699   }
 700  $a->{'DoneStrings'} = 1;
 701  foreach my $b (@{$a->{'Entries'}})
 702   {
 703    my ($s,$e,$out,$t,$end,$l) = @$b;
 704    addstrings($fh,$t) unless $t->{'DoneStrings'};
 705   }
 706 }
 707
 708 sub outbigstring
 709 {
 710   my ($fh,$name) = @_;
 711
 712   $string_acc = '';
 713
 714   # Make the big string in the string accumulator. Longest first, on the hope
 715   # that this makes it more likely that we find the short strings later on.
 716   # Not sure if it helps sorting strings of the same length lexcically.
 717   foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
 718     my $index = index $string_acc, $s;
 719     if ($index >= 0) {
 720       $saved += length($s);
 721       $strings_in_acc{$s} = $index;
 722     } else {
 723     OPTIMISER: {
 724     if ($opt{'O'}) {
 725       my $sublength = length $s;
 726       while (--$sublength > 0) {
 727         # progressively lop characters off the end, to see if the start of
 728         # the new string overlaps the end of the accumulator.
 729         if (substr ($string_acc, -$sublength)
 730         eq substr ($s, 0, $sublength)) {
 731           $subsave += $sublength;
 732           $strings_in_acc{$s} = length ($string_acc) - $sublength;
 733           # append the last bit on the end.
 734           $string_acc .= substr ($s, $sublength);
 735           last OPTIMISER;
 736         }
 737         # or if the end of the new string overlaps the start of the
 738         # accumulator
 739         next unless substr ($string_acc, 0, $sublength)
 740           eq substr ($s, -$sublength);
 741         # well, the last $sublength characters of the accumulator match.
 742         # so as we're prepending to the accumulator, need to shift all our
 743         # existing offsets forwards
 744         $_ += $sublength foreach values %strings_in_acc;
 745         $subsave += $sublength;
 746         $strings_in_acc{$s} = 0;
 747         # append the first bit on the start.
 748         $string_acc = substr ($s, 0, -$sublength) . $string_acc;
 749         last OPTIMISER;
 750       }
 751     }
 752     # Optimiser (if it ran) found nothing, so just going have to tack the
 753     # whole thing on the end.
 754     $strings_in_acc{$s} = length $string_acc;
 755     $string_acc .= $s;
 756       };
 757     }
 758   }
 759
 760   $strings = length $string_acc;
 761   my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 762   my $var = $cpp ? '' : 'static';
 763   my $definition = "\n$var const U8 $name\[$strings] = { " .
 764     join(',',unpack "C*",$string_acc);
 765   # We have a single long line. Split it at convenient commas.
 766   print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
 767   print $fh substr ($definition, pos $definition), " };\n";
 768 }
 769
 770 sub findstring {
 771   my ($name,$s) = @_;
 772   my $offset = $strings_in_acc{$s};
 773   die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
 774     unless defined $offset;
 775   "$name + $offset";
 776 }
 777
 778 sub outtable
 779 {
 780  my ($fh,$a,$bigname) = @_;
 781  my $name = $a->{'Cname'};
 782  $a->{'Done'} = 1;
 783  foreach my $b (@{$a->{'Entries'}})
 784   {
 785    my ($s,$e,$out,$t,$end,$l) = @$b;
 786    outtable($fh,$t,$bigname) unless $t->{'Done'};
 787   }
 788  my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 789  my $var = $cpp ? '' : 'static';
 790  my $const = $cpp ? '' : 'const';
 791  print $fh "\n$var $const encpage_t $name\[",
 792    scalar(@{$a->{'Entries'}}), "] = {\n";
 793  foreach my $b (@{$a->{'Entries'}})
 794   {
 795    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 796    # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
 797    print  $fh "{";
 798    if ($l)
 799     {
 800      printf $fh findstring($bigname,$out);
 801     }
 802    else
 803     {
 804      print  $fh "0";
 805     }
 806    print  $fh ",",$t->{Cname};
 807    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 808   }
 809  print $fh "};\n";
 810 }
 811
 812 sub output_enc
 813 {
 814  my ($fh,$name,$a) = @_;
 815  die "Changed - fix me for new structure";
 816  foreach my $b (sort keys %$a)
 817   {
 818    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 819   }
 820 }
 821
 822 sub decode_U
 823 {
 824  my $s = shift;
 825 }
 826
 827 my @uname;
 828 sub char_names
 829 {
 830  my $s = do "unicore/Name.pl";
 831  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 832  pos($s) = 0;
 833  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 834   {
 835    my $name = $3;
 836    my $s = hex($1);
 837    last if $s >= 0x10000;
 838    my $e = length($2) ? hex($2) : $s;
 839    for (my $i = $s; $i <= $e; $i++)
 840     {
 841      $uname[$i] = $name;
 842 #    print sprintf("U%04X $name\n",$i);
 843     }
 844   }
 845 }
 846
 847 sub output_ucm_page
 848 {
 849   my ($cmap,$a,$t,$pre) = @_;
 850   # warn sprintf("Page %x\n",$pre);
 851   my $raw = $t->{Raw};
 852   foreach my $key (sort keys %$raw) {
 853     #  RAW_NEXT => 0,
 854     #  RAW_IN_LEN => 1,
 855     #  RAW_OUT_BYTES => 2,
 856     #  RAW_FALLBACK => 3,
 857     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 858     my $u = ord $key;
 859     $fallback ||= 0;
 860
 861     if ($next != $a && $next != $t) {
 862       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 863     } elsif (length $out_bytes) {
 864       if ($pre) {
 865         $u = $pre|($u &0x3f);
 866       }
 867       my $s = sprintf "<U%04X> ",$u;
 868       #foreach my $c (split(//,$out_bytes)) {
 869       #  $s .= sprintf "\\x%02X",ord($c);
 870       #}
 871       # 9.5% faster changing that loop to this:
 872       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 873       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 874       push(@$cmap,$s);
 875     } else {
 876       warn join(',',$u, @{$raw->{$key}},$a,$t);
 877     }
 878   }
 879 }
 880
 881 sub output_ucm
 882 {
 883  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 884  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 885  print $fh "<code_set_name> \"$name\"\n";
 886  char_names();
 887  if (defined $min_el)
 888   {
 889    print $fh "<mb_cur_min> $min_el\n";
 890   }
 891  if (defined $max_el)
 892   {
 893    print $fh "<mb_cur_max> $max_el\n";
 894   }
 895  if (defined $rep)
 896   {
 897    print $fh "<subchar> ";
 898    foreach my $c (split(//,$rep))
 899     {
 900      printf $fh "\\x%02X",ord($c);
 901     }
 902    print $fh "\n";
 903   }
 904  my @cmap;
 905  output_ucm_page(\@cmap,$h,$h,0);
 906  print $fh "#\nCHARMAP\n";
 907  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 908   {
 909    print $fh $line;
 910   }
 911  print $fh "END CHARMAP\n";
 912 }
 913
 914 use vars qw(
 915     $_Enc2xs
 916     $_Version
 917     $_Inc
 918     $_E2X
 919     $_Name
 920     $_TableFiles
 921     $_Now
 922 );
 923
 924 sub find_e2x{
 925     eval { require File::Find; };
 926     my (@inc, %e2x_dir);
 927     for my $inc (@INC){
 928     push @inc, $inc unless $inc eq '.'; #skip current dir
 929     }
 930     File::Find::find(
 931          sub {
 932          my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
 933              $atime,$mtime,$ctime,$blksize,$blocks)
 934              = lstat($_) or return;
 935          -f _ or return;
 936          if (/^.*\.e2x$/o){
 937              no warnings 'once';
 938              $e2x_dir{$File::Find::dir} ||= $mtime;
 939          }
 940          return;
 941          }, @inc);
 942     warn join("\n", keys %e2x_dir), "\n";
 943     for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
 944     $_E2X = $d;
 945     # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
 946     return $_E2X;
 947     }
 948 }
 949
 950 sub make_makefile_pl
 951 {
 952     eval { require Encode; };
 953     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 954     # our used for variable expanstion
 955     $_Enc2xs = $0;
 956     $_Version = $VERSION;
 957     $_E2X = find_e2x();
 958     $_Name = shift;
 959     $_TableFiles = join(",", map {qq('$_')} @_);
 960     $_Now = scalar localtime();
 961
 962     eval { require File::Spec; };
 963     _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
 964     _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
 965     _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
 966     _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
 967     _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
 968     exit;
 969 }
 970
 971 use vars qw(
 972         $_ModLines
 973         $_LocalVer
 974         );
 975
 976 sub make_configlocal_pm {
 977     eval { require Encode; };
 978     $@ and die "Unable to require Encode: $@\n";
 979     eval { require File::Spec; };
 980
 981     # our used for variable expanstion
 982     my %in_core = map { $_ => 1 } (
 983         'ascii',      'iso-8859-1', 'utf8',
 984         'ascii-ctrl', 'null',       'utf-8-strict'
 985     );
 986     my %LocalMod = ();
 987     # check @enc;
 988     use File::Find ();
 989     my $wanted = sub{
 990         -f $_ or return;
 991         $File::Find::name =~ /\A\./        and return;
 992         $File::Find::name =~ /\.pm\z/      or  return;
 993         $File::Find::name =~ m/\bEncode\b/ or  return;
 994         my $mod = $File::Find::name;
 995         $mod =~ s/.*\bEncode\b/Encode/o;
 996         $mod =~ s/\.pm\z//o;
 997         $mod =~ s,/,::,og;
 998         warn qq{ require $mod;\n};
 999         eval qq{ require $mod; };
1000         $@ and die "Can't require $mod: $@\n";
1001         for my $enc ( Encode->encodings() ) {
1002             no warnings;
1003             $in_core{$enc}                   and next;
1004             $Encode::Config::ExtModule{$enc} and next;
1005             $LocalMod{$enc} ||= $mod;
1006         }
1007     };
1008     File::Find::find({wanted => $wanted}, @INC);
1009     $_ModLines = "";
1010     for my $enc ( sort keys %LocalMod ) {
1011         $_ModLines .=
1012           qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
1013     }
1014     warn $_ModLines;
1015     $_LocalVer = _mkversion();
1016     $_E2X      = find_e2x();
1017     $_Inc      = $INC{"Encode.pm"};
1018     $_Inc =~ s/\.pm$//o;
1019     _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1020         File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
1021     exit;
1022 }
1023
1024 sub _mkversion{
1025     # v-string is now depreciated; use time() instead;
1026     #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1027     #$yyyy += 1900, $mo +=1;
1028     #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1029     return time();
1030 }
1031
1032 sub _print_expand{
1033     eval { require File::Basename; };
1034     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
1035     File::Basename->import();
1036     my ($src, $dst, $clobber) = @_;
1037     if (!$clobber and -e $dst){
1038     warn "$dst exists. skipping\n";
1039     return;
1040     }
1041     warn "Generating $dst...\n";
1042     open my $in, $src or die "$src : $!";
1043     if ((my $d = dirname($dst)) ne '.'){
1044     -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
1045     }
1046     open my $out, ">$dst" or die "$!";
1047     my $asis = 0;
1048     while (<$in>){
1049     if (/^#### END_OF_HEADER/){
1050         $asis = 1; next;
1051     }
1052     s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1053     print $out $_;
1054     }
1055 }
1056 __END__
1057
1058 =head1 NAME
1059
1060 enc2xs -- Perl Encode Module Generator
1061
1062 =head1 SYNOPSIS
1063
1064   enc2xs -[options]
1065   enc2xs -M ModName mapfiles...
1066   enc2xs -C
1067
1068 =head1 DESCRIPTION
1069
1070 F<enc2xs> builds a Perl extension for use by Encode from either
1071 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1072 Besides being used internally during the build process of the Encode
1073 module, you can use F<enc2xs> to add your own encoding to perl.
1074 No knowledge of XS is necessary.
1075
1076 =head1 Quick Guide
1077
1078 If you want to know as little about Perl as possible but need to
1079 add a new encoding, just read this chapter and forget the rest.
1080
1081 =over 4
1082
1083 =item 0.
1084
1085 Have a .ucm file ready.  You can get it from somewhere or you can write
1086 your own from scratch or you can grab one from the Encode distribution
1087 and customize it.  For the UCM format, see the next Chapter.  In the
1088 example below, I'll call my theoretical encoding myascii, defined
1089 in I<my.ucm>.  C<$> is a shell prompt.
1090
1091   $ ls -F
1092   my.ucm
1093
1094 =item 1.
1095
1096 Issue a command as follows;
1097
1098   $ enc2xs -M My my.ucm
1099   generating Makefile.PL
1100   generating My.pm
1101   generating README
1102   generating Changes
1103
1104 Now take a look at your current directory.  It should look like this.
1105
1106   $ ls -F
1107   Makefile.PL   My.pm         my.ucm        t/
1108
1109 The following files were created.
1110
1111   Makefile.PL - MakeMaker script
1112   My.pm       - Encode submodule
1113   t/My.t      - test file
1114
1115 =over 4
1116
1117 =item 1.1.
1118
1119 If you want *.ucm installed together with the modules, do as follows;
1120
1121   $ mkdir Encode
1122   $ mv *.ucm Encode
1123   $ enc2xs -M My Encode/*ucm
1124
1125 =back
1126
1127 =item 2.
1128
1129 Edit the files generated.  You don't have to if you have no time AND no
1130 intention to give it to someone else.  But it is a good idea to edit
1131 the pod and to add more tests.
1132
1133 =item 3.
1134
1135 Now issue a command all Perl Mongers love:
1136
1137   $ perl Makefile.PL
1138   Writing Makefile for Encode::My
1139
1140 =item 4.
1141
1142 Now all you have to do is make.
1143
1144   $ make
1145   cp My.pm blib/lib/Encode/My.pm
1146   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1147     -o encode_t.c -f encode_t.fnm
1148   Reading myascii (myascii)
1149   Writing compiled form
1150   128 bytes in string tables
1151   384 bytes (75%) saved spotting duplicates
1152   1 bytes (0.775%) saved using substrings
1153   ....
1154   chmod 644 blib/arch/auto/Encode/My/My.bs
1155   $
1156
1157 The time it takes varies depending on how fast your machine is and
1158 how large your encoding is.  Unless you are working on something big
1159 like euc-tw, it won't take too long.
1160
1161 =item 5.
1162
1163 You can "make install" already but you should test first.
1164
1165   $ make test
1166   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1167     -e 'use Test::Harness  qw(&runtests $verbose); \
1168     $verbose=0; runtests @ARGV;' t/*.t
1169   t/My....ok
1170   All tests successful.
1171   Files=1, Tests=2,  0 wallclock secs
1172    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1173
1174 =item 6.
1175
1176 If you are content with the test result, just "make install"
1177
1178 =item 7.
1179
1180 If you want to add your encoding to Encode's demand-loading list
1181 (so you don't have to "use Encode::YourEncoding"), run
1182
1183   enc2xs -C
1184
1185 to update Encode::ConfigLocal, a module that controls local settings.
1186 After that, "use Encode;" is enough to load your encodings on demand.
1187
1188 =back
1189
1190 =head1 The Unicode Character Map
1191
1192 Encode uses the Unicode Character Map (UCM) format for source character
1193 mappings.  This format is used by IBM's ICU package and was adopted
1194 by Nick Ing-Simmons for use with the Encode module.  Since UCM is
1195 more flexible than Tcl's Encoding Map and far more user-friendly,
1196 this is the recommended format for Encode now.
1197
1198 A UCM file looks like this.
1199
1200   #
1201   # Comments
1202   #
1203   <code_set_name> "US-ascii" # Required
1204   <code_set_alias> "ascii"   # Optional
1205   <mb_cur_min> 1             # Required; usually 1
1206   <mb_cur_max> 1             # Max. # of bytes/char
1207   <subchar> \x3F             # Substitution char
1208   #
1209   CHARMAP
1210   <U0000> \x00 |0 # <control>
1211   <U0001> \x01 |0 # <control>
1212   <U0002> \x02 |0 # <control>
1213   ....
1214   <U007C> \x7C |0 # VERTICAL LINE
1215   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1216   <U007E> \x7E |0 # TILDE
1217   <U007F> \x7F |0 # <control>
1218   END CHARMAP
1219
1220 =over 4
1221
1222 =item *
1223
1224 Anything that follows C<#> is treated as a comment.
1225
1226 =item *
1227
1228 The header section continues until a line containing the word
1229 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1230 pair per line.  Strings used as values must be quoted. Barewords are
1231 treated as numbers.  I<\xXX> represents a byte.
1232
1233 Most of the keywords are self-explanatory. I<subchar> means
1234 substitution character, not subcharacter.  When you decode a Unicode
1235 sequence to this encoding but no matching character is found, the byte
1236 sequence defined here will be used.  For most cases, the value here is
1237 \x3F; in ASCII, this is a question mark.
1238
1239 =item *
1240
1241 CHARMAP starts the character map section.  Each line has a form as
1242 follows:
1243
1244   <UXXXX> \xXX.. |0 # comment
1245     ^     ^      ^
1246     |     |      +- Fallback flag
1247     |     +-------- Encoded byte sequence
1248     +-------------- Unicode Character ID in hex
1249
1250 The format is roughly the same as a header section except for the
1251 fallback flag: | followed by 0..3.   The meaning of the possible
1252 values is as follows:
1253
1254 =over 4
1255
1256 =item |0
1257
1258 Round trip safe.  A character decoded to Unicode encodes back to the
1259 same byte sequence.  Most characters have this flag.
1260
1261 =item |1
1262
1263 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1264 character for the encode map only.
1265
1266 =item |2
1267
1268 Skip sub-char mapping should there be no code point.
1269
1270 =item |3
1271
1272 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1273 character for the decode map only.
1274
1275 =back
1276
1277 =item *
1278
1279 And finally, END OF CHARMAP ends the section.
1280
1281 =back
1282
1283 When you are manually creating a UCM file, you should copy ascii.ucm
1284 or an existing encoding which is close to yours, rather than write
1285 your own from scratch.
1286
1287 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1288 is, unless your environment is EBCDIC.
1289
1290 B<CAVEAT>: not all features in UCM are implemented.  For example,
1291 icu:state is not used.  Because of that, you need to write a perl
1292 module if you want to support algorithmical encodings, notably
1293 the ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1294 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1295
1296 =head2 Coping with duplicate mappings
1297
1298 When you create a map, you SHOULD make your mappings round-trip safe.
1299 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1300 $data> stands for all characters that are marked as C<|0>.  Here is
1301 how to make sure:
1302
1303 =over 4
1304
1305 =item *
1306
1307 Sort your map in Unicode order.
1308
1309 =item *
1310
1311 When you have a duplicate entry, mark either one with '|1' or '|3'.
1312
1313 =item *
1314
1315 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1316
1317 =back
1318
1319 Here is an example from big5-eten.
1320
1321   <U2550> \xF9\xF9 |0
1322   <U2550> \xA2\xA4 |3
1323
1324 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1325 this;
1326
1327   E to U               U to E
1328   --------------------------------------
1329   \xF9\xF9 => U2550    U2550 => \xF9\xF9
1330   \xA2\xA4 => U2550
1331
1332 So it is round-trip safe for \xF9\xF9.  But if the line above is upside
1333 down, here is what happens.
1334
1335   E to U               U to E
1336   --------------------------------------
1337   \xA2\xA4 => U2550    U2550 => \xF9\xF9
1338   (\xF9\xF9 => U2550 is now overwritten!)
1339
1340 The Encode package comes with F<ucmlint>, a crude but sufficient
1341 utility to check the integrity of a UCM file.  Check under the
1342 Encode/bin directory for this.
1343
1344 When in doubt, you can use F<ucmsort>, yet another utility under
1345 Encode/bin directory.
1346
1347 =head1 Bookmarks
1348
1349 =over 4
1350
1351 =item *
1352
1353 ICU Home Page
1354 L<http://www.icu-project.org/>
1355
1356 =item *
1357
1358 ICU Character Mapping Tables
1359 L<http://www.icu-project.org/charset/>
1360
1361 =item *
1362
1363 ICU:Conversion Data
1364 L<http://www.icu-project.org/userguide/conversion-data.html>
1365
1366 =back
1367
1368 =head1 SEE ALSO
1369
1370 L<Encode>,
1371 L<perlmod>,
1372 L<perlpod>
1373
1374 =cut
1375
1376 # -Q to disable the duplicate codepoint test
1377 # -S make mapping errors fatal
1378 # -q to remove comments written to output files
1379 # -O to enable the (brute force) substring optimiser
1380 # -o <output> to specify the output file name (else it's the first arg)
1381 # -f <inlist> to give a file with a list of input files (else use the args)
1382 # -n <name> to name the encoding (else use the basename of the input file.
1383
1384 With %seen holding array refs:
1385
1386       865.66 real        28.80 user         8.79 sys
1387       7904  maximum resident set size
1388       1356  average shared memory size
1389      18566  average unshared data size
1390        229  average unshared stack size
1391      46080  page reclaims
1392      33373  page faults
1393
1394 With %seen holding simple scalars:
1395
1396       342.16 real        27.11 user         3.54 sys
1397       8388  maximum resident set size
1398       1394  average shared memory size
1399      14969  average unshared data size
1400        236  average unshared stack size
1401      28159  page reclaims
1402       9839  page faults
1403
1404 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1405 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1406 RAM machine, but it's going to help even on modern machines.
1407 Swapping is bad, m'kay :-)