ext/Encode/bin/enc2xs

   1 #!./perl
   2 BEGIN {
   3     # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
   4     # with $ENV{PERL_CORE} set
   5     # In case we need it in future...
   6     require Config; import Config;
   7 }
   8 use strict;
   9 use warnings;
  10 use Getopt::Std;
  11 use Config;
  12 my @orig_ARGV = @ARGV;
  13 our $VERSION  = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
  14
  15 # These may get re-ordered.
  16 # RAW is a do_now as inserted by &enter
  17 # AGG is an aggreagated do_now, as built up by &process
  18
  19 use constant {
  20   RAW_NEXT => 0,
  21   RAW_IN_LEN => 1,
  22   RAW_OUT_BYTES => 2,
  23   RAW_FALLBACK => 3,
  24
  25   AGG_MIN_IN => 0,
  26   AGG_MAX_IN => 1,
  27   AGG_OUT_BYTES => 2,
  28   AGG_NEXT => 3,
  29   AGG_IN_LEN => 4,
  30   AGG_OUT_LEN => 5,
  31   AGG_FALLBACK => 6,
  32 };
  33
  34 # (See the algorithm in encengine.c - we're building structures for it)
  35
  36 # There are two sorts of structures.
  37 # "do_now" (an array, two variants of what needs storing) is whatever we need
  38 # to do now we've read an input byte.
  39 # It's housed in a "do_next" (which is how we got to it), and in turn points
  40 # to a "do_next" which contains all the "do_now"s for the next input byte.
  41
  42 # There will be a "do_next" which is the start state.
  43 # For a single byte encoding it's the only "do_next" - each "do_now" points
  44 # back to it, and each "do_now" will cause bytes. There is no state.
  45
  46 # For a multi-byte encoding where all characters in the input are the same
  47 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
  48 # branching out from the start state, one step for each input byte.
  49 # The leaf "do_now"s will all be at the same distance from the start state,
  50 # only the leaf "do_now"s cause output bytes, and they in turn point back to
  51 # the start state.
  52
  53 # For an encoding where there are varaible length input byte sequences, you
  54 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
  55 # as before the leaves will point back to the start state.
  56
  57 # The system will cope with escape encodings (imagine them as a mostly
  58 # self-contained tree for each escape state, and cross links between trees
  59 # at the state-switching characters) but so far no input format defines these.
  60
  61 # The system will also cope with having output "leaves" in the middle of
  62 # the bifurcating branches, not just at the extremities, but again no
  63 # input format does this yet.
  64
  65 # There are two variants of the "do_now" structure. The first, smaller variant
  66 # is generated by &enter as the input file is read. There is one structure
  67 # for each input byte. Say we are mapping a single byte encoding to a
  68 # single byte encoding, with  "ABCD" going "abcd". There will be
  69 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
  70
  71 # &process then walks the tree, building aggregate "do_now" structres for
  72 # adjacent bytes where possible. The aggregate is for a contiguous range of
  73 # bytes which each produce the same length of output, each move to the
  74 # same next state, and each have the same fallback flag.
  75 # So our 4 RAW "do_now"s above become replaced by a single structure
  76 # containing:
  77 # ["A", "D", "abcd", 1, ...]
  78 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
  79 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
  80 # which maps very nicely into pointer arithmetic in C for encengine.c
  81
  82 sub encode_U
  83 {
  84  # UTF-8 encode long hand - only covers part of perl's range
  85  ## my $uv = shift;
  86  # chr() works in native space so convert value from table
  87  # into that space before using chr().
  88  my $ch = chr(utf8::unicode_to_native($_[0]));
  89  # Now get core perl to encode that the way it likes.
  90  utf8::encode($ch);
  91  return $ch;
  92 }
  93
  94 sub encode_S
  95 {
  96  # encode single byte
  97  ## my ($ch,$page) = @_; return chr($ch);
  98  return chr $_[0];
  99 }
 100
 101 sub encode_D
 102 {
 103  # encode double byte MS byte first
 104  ## my ($ch,$page) = @_; return chr($page).chr($ch);
 105  return chr ($_[1]) . chr $_[0];
 106 }
 107
 108 sub encode_M
 109 {
 110  # encode Multi-byte - single for 0..255 otherwise double
 111  ## my ($ch,$page) = @_;
 112  ## return &encode_D if $page;
 113  ## return &encode_S;
 114  return chr ($_[1]) . chr $_[0] if $_[1];
 115  return chr $_[0];
 116 }
 117
 118 my %encode_types = (U => \&encode_U,
 119                     S => \&encode_S,
 120                     D => \&encode_D,
 121                     M => \&encode_M,
 122                    );
 123
 124 # Win32 does not expand globs on command line
 125 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
 126
 127 my %opt;
 128 # I think these are:
 129 # -Q to disable the duplicate codepoint test
 130 # -S make mapping errors fatal
 131 # -q to remove comments written to output files
 132 # -O to enable the (brute force) substring optimiser
 133 # -o <output> to specify the output file name (else it's the first arg)
 134 # -f <inlist> to give a file with a list of input files (else use the args)
 135 # -n <name> to name the encoding (else use the basename of the input file.
 136 getopts('CM:SQqOo:f:n:',\%opt);
 137
 138 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
 139 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
 140
 141 # This really should go first, else the die here causes empty (non-erroneous)
 142 # output files to be written.
 143 my @encfiles;
 144 if (exists $opt{'f'}) {
 145     # -F is followed by name of file containing list of filenames
 146     my $flist = $opt{'f'};
 147     open(FLIST,$flist) || die "Cannot open $flist:$!";
 148     chomp(@encfiles = <FLIST>);
 149     close(FLIST);
 150 } else {
 151     @encfiles = @ARGV;
 152 }
 153
 154 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
 155 chmod(0666,$cname) if -f $cname && !-w $cname;
 156 open(C,">$cname") || die "Cannot open $cname:$!";
 157
 158 my $dname = $cname;
 159 my $hname = $cname;
 160
 161 my ($doC,$doEnc,$doUcm,$doPet);
 162
 163 if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
 164  {
 165   $doC = 1;
 166   $dname =~ s/(\.[^\.]*)?$/.exh/;
 167   chmod(0666,$dname) if -f $cname && !-w $dname;
 168   open(D,">$dname") || die "Cannot open $dname:$!";
 169   $hname =~ s/(\.[^\.]*)?$/.h/;
 170   chmod(0666,$hname) if -f $cname && !-w $hname;
 171   open(H,">$hname") || die "Cannot open $hname:$!";
 172
 173   foreach my $fh (\*C,\*D,\*H)
 174   {
 175    print $fh <<"END" unless $opt{'q'};
 176 /*
 177  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 178  This file was autogenerated by:
 179  $^X $0 @orig_ARGV
 180  enc2xs VERSION $VERSION
 181 */
 182 END
 183   }
 184
 185   if ($cname =~ /(\w+)\.xs$/)
 186    {
 187     print C "#include <EXTERN.h>\n";
 188     print C "#include <perl.h>\n";
 189     print C "#include <XSUB.h>\n";
 190     print C "#define U8 U8\n";
 191    }
 192   print C "#include \"encode.h\"\n\n";
 193
 194  }
 195 elsif ($cname =~ /\.enc$/)
 196  {
 197   $doEnc = 1;
 198  }
 199 elsif ($cname =~ /\.ucm$/)
 200  {
 201   $doUcm = 1;
 202  }
 203 elsif ($cname =~ /\.pet$/)
 204  {
 205   $doPet = 1;
 206  }
 207
 208 my %encoding;
 209 my %strings;
 210 my $string_acc;
 211 my %strings_in_acc;
 212
 213 my $saved = 0;
 214 my $subsave = 0;
 215 my $strings = 0;
 216
 217 sub cmp_name
 218 {
 219  if ($a =~ /^.*-(\d+)/)
 220   {
 221    my $an = $1;
 222    if ($b =~ /^.*-(\d+)/)
 223     {
 224      my $r = $an <=> $1;
 225      return $r if $r;
 226     }
 227   }
 228  return $a cmp $b;
 229 }
 230
 231
 232 foreach my $enc (sort cmp_name @encfiles)
 233  {
 234   my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
 235   $name = $opt{'n'} if exists $opt{'n'};
 236   if (open(E,$enc))
 237    {
 238     if ($sfx eq 'enc')
 239      {
 240       compile_enc(\*E,lc($name));
 241      }
 242     else
 243      {
 244       compile_ucm(\*E,lc($name));
 245      }
 246    }
 247   else
 248    {
 249     warn "Cannot open $enc for $name:$!";
 250    }
 251  }
 252
 253 if ($doC)
 254  {
 255   print STDERR "Writing compiled form\n";
 256   foreach my $name (sort cmp_name keys %encoding)
 257    {
 258     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 259     process($name.'_utf8',$e2u);
 260     addstrings(\*C,$e2u);
 261
 262     process('utf8_'.$name,$u2e);
 263     addstrings(\*C,$u2e);
 264    }
 265   outbigstring(\*C,"enctable");
 266   foreach my $name (sort cmp_name keys %encoding)
 267    {
 268     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 269     outtable(\*C,$e2u, "enctable");
 270     outtable(\*C,$u2e, "enctable");
 271
 272     # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
 273    }
 274   my $cpp  = ($Config{d_cplusplus} || '') eq 'define';
 275   my $ext_c = $cpp ? 'extern "C" ' : "";
 276   foreach my $enc (sort cmp_name keys %encoding)
 277    {
 278     # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
 279     my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
 280     #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
 281     my $replen = 0;
 282     $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
 283     my $sym = "${enc}_encoding";
 284     $sym =~ s/\W+/_/g;
 285     my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
 286         $min_el,$max_el);
 287     print C "${ext_c}static const U8 ${sym}_rep_character[] = \"$rep\";\n";
 288     print C "${ext_c}static const char ${sym}_enc_name[] = \"$enc\";\n\n";
 289     print C "${ext_c}const encode_t $sym = \n";
 290     # This is to make null encoding work -- dankogai
 291     for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
 292     $info[$i] ||= 1;
 293     }
 294     # end of null tweak -- dankogai
 295     print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
 296    }
 297
 298   foreach my $enc (sort cmp_name keys %encoding)
 299    {
 300     my $sym = "${enc}_encoding";
 301     $sym =~ s/\W+/_/g;
 302     print H "extern encode_t $sym;\n";
 303     print D " Encode_XSEncoding(aTHX_ &$sym);\n";
 304    }
 305
 306   if ($cname =~ /(\w+)\.xs$/)
 307    {
 308     my $mod = $1;
 309     print C <<'END';
 310
 311 static void
 312 Encode_XSEncoding(pTHX_ encode_t *enc)
 313 {
 314  dSP;
 315  HV *stash = gv_stashpv("Encode::XS", TRUE);
 316  SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
 317  int i = 0;
 318  PUSHMARK(sp);
 319  XPUSHs(sv);
 320  while (enc->name[i])
 321   {
 322    const char *name = enc->name[i++];
 323    XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
 324   }
 325  PUTBACK;
 326  call_pv("Encode::define_encoding",G_DISCARD);
 327  SvREFCNT_dec(sv);
 328 }
 329
 330 END
 331
 332     print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
 333     print C "BOOT:\n{\n";
 334     print C "#include \"$dname\"\n";
 335     print C "}\n";
 336    }
 337   # Close in void context is bad, m'kay
 338   close(D) or warn "Error closing '$dname': $!";
 339   close(H) or warn "Error closing '$hname': $!";
 340
 341   my $perc_saved    = $saved/($strings + $saved) * 100;
 342   my $perc_subsaved = $subsave/($strings + $subsave) * 100;
 343   printf STDERR "%d bytes in string tables\n",$strings;
 344   printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
 345     $saved, $perc_saved              if $saved;
 346   printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
 347     $subsave, $perc_subsaved         if $subsave;
 348  }
 349 elsif ($doEnc)
 350  {
 351   foreach my $name (sort cmp_name keys %encoding)
 352    {
 353     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 354     output_enc(\*C,$name,$e2u);
 355    }
 356  }
 357 elsif ($doUcm)
 358  {
 359   foreach my $name (sort cmp_name keys %encoding)
 360    {
 361     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
 362     output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
 363    }
 364  }
 365
 366 # writing half meg files and then not checking to see if you just filled the
 367 # disk is bad, m'kay
 368 close(C) or die "Error closing '$cname': $!";
 369
 370 # End of the main program.
 371
 372 sub compile_ucm
 373 {
 374  my ($fh,$name) = @_;
 375  my $e2u = {};
 376  my $u2e = {};
 377  my $cs;
 378  my %attr;
 379  while (<$fh>)
 380   {
 381    s/#.*$//;
 382    last if /^\s*CHARMAP\s*$/i;
 383    if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
 384     {
 385      $attr{$1} = $2;
 386     }
 387   }
 388  if (!defined($cs =  $attr{'code_set_name'}))
 389   {
 390    warn "No <code_set_name> in $name\n";
 391   }
 392  else
 393   {
 394    $name = $cs unless exists $opt{'n'};
 395   }
 396  my $erep;
 397  my $urep;
 398  my $max_el;
 399  my $min_el;
 400  if (exists $attr{'subchar'})
 401   {
 402    #my @byte;
 403    #$attr{'subchar'} =~ /^\s*/cg;
 404    #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
 405    #$erep = join('',map(chr(hex($_)),@byte));
 406    $erep = $attr{'subchar'};
 407    $erep =~ s/^\s+//; $erep =~ s/\s+$//;
 408   }
 409  print "Reading $name ($cs)\n";
 410  my $nfb = 0;
 411  my $hfb = 0;
 412  while (<$fh>)
 413   {
 414    s/#.*$//;
 415    last if /^\s*END\s+CHARMAP\s*$/i;
 416    next if /^\s*$/;
 417    my (@uni, @byte) = ();
 418    my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
 419        or die "Bad line: $_";
 420    while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
 421        push @uni, map { substr($_, 1) } split(/\+/, $1);
 422    }
 423    while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
 424        push @byte, $1;
 425    }
 426    if (@uni)
 427     {
 428      my $uch =  join('', map { encode_U(hex($_)) } @uni );
 429      my $ech = join('',map(chr(hex($_)),@byte));
 430      my $el  = length($ech);
 431      $max_el = $el if (!defined($max_el) || $el > $max_el);
 432      $min_el = $el if (!defined($min_el) || $el < $min_el);
 433      if (length($fb))
 434       {
 435        $fb = substr($fb,1);
 436        $hfb++;
 437       }
 438      else
 439       {
 440        $nfb++;
 441        $fb = '0';
 442       }
 443      # $fb is fallback flag
 444      # 0 - round trip safe
 445      # 1 - fallback for unicode -> enc
 446      # 2 - skip sub-char mapping
 447      # 3 - fallback enc -> unicode
 448      enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
 449      enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
 450     }
 451    else
 452     {
 453      warn $_;
 454     }
 455   }
 456  if ($nfb && $hfb)
 457   {
 458    die "$nfb entries without fallback, $hfb entries with\n";
 459   }
 460  $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
 461 }
 462
 463
 464
 465 sub compile_enc
 466 {
 467  my ($fh,$name) = @_;
 468  my $e2u = {};
 469  my $u2e = {};
 470
 471  my $type;
 472  while ($type = <$fh>)
 473   {
 474    last if $type !~ /^\s*#/;
 475   }
 476  chomp($type);
 477  return if $type eq 'E';
 478  # Do the hash lookup once, rather than once per function call. 4% speedup.
 479  my $type_func = $encode_types{$type};
 480  my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
 481  warn "$type encoded $name\n";
 482  my $rep = '';
 483  # Save a defined test by setting these to defined values.
 484  my $min_el = ~0; # A very big integer
 485  my $max_el = 0;  # Anything must be longer than 0
 486  {
 487   my $v = hex($def);
 488   $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
 489  }
 490  my $errors;
 491  my $seen;
 492  # use -Q to silence the seen test. Makefile.PL uses this by default.
 493  $seen = {} unless $opt{Q};
 494  do
 495   {
 496    my $line = <$fh>;
 497    chomp($line);
 498    my $page = hex($line);
 499    my $ch = 0;
 500    my $i = 16;
 501    do
 502     {
 503      # So why is it 1% faster to leave the my here?
 504      my $line = <$fh>;
 505      $line =~ s/\r\n$/\n/;
 506      die "$.:${line}Line should be exactly 65 characters long including
 507      newline (".length($line).")" unless length ($line) == 65;
 508      # Split line into groups of 4 hex digits, convert groups to ints
 509      # This takes 65.35
 510      # map {hex $_} $line =~ /(....)/g
 511      # This takes 63.75 (2.5% less time)
 512      # unpack "n*", pack "H*", $line
 513      # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
 514      # Doing it as while ($line =~ /(....)/g) took 74.63
 515      foreach my $val (unpack "n*", pack "H*", $line)
 516       {
 517        next if $val == 0xFFFD;
 518        my $ech = &$type_func($ch,$page);
 519        if ($val || (!$ch && !$page))
 520         {
 521          my $el  = length($ech);
 522          $max_el = $el if $el > $max_el;
 523          $min_el = $el if $el < $min_el;
 524          my $uch = encode_U($val);
 525          if ($seen) {
 526            # We're doing the test.
 527            # We don't need to read this quickly, so storing it as a scalar,
 528            # rather than 3 (anon array, plus the 2 scalars it holds) saves
 529            # RAM and may make us faster on low RAM systems. [see __END__]
 530            if (exists $seen->{$uch})
 531              {
 532                warn sprintf("U%04X is %02X%02X and %04X\n",
 533                             $val,$page,$ch,$seen->{$uch});
 534                $errors++;
 535              }
 536            else
 537              {
 538                $seen->{$uch} = $page << 8 | $ch;
 539              }
 540          }
 541          # Passing 2 extra args each time is 3.6% slower!
 542          # Even with having to add $fallback ||= 0 later
 543          enter_fb0($e2u,$ech,$uch);
 544          enter_fb0($u2e,$uch,$ech);
 545         }
 546        else
 547         {
 548          # No character at this position
 549          # enter($e2u,$ech,undef,$e2u);
 550         }
 551        $ch++;
 552       }
 553     } while --$i;
 554   } while --$pages;
 555  die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
 556    if $min_el > $max_el;
 557  die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
 558  $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
 559 }
 560
 561 # my ($a,$s,$d,$t,$fb) = @_;
 562 sub enter {
 563   my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
 564   # state we shift to after this (multibyte) input character defaults to same
 565   # as current state.
 566   $next ||= $current;
 567   # Making sure it is defined seems to be faster than {no warnings;} in
 568   # &process, or passing it in as 0 explicity.
 569   # XXX $fallback ||= 0;
 570
 571   # Start at the beginning and work forwards through the string to zero.
 572   # effectively we are removing 1 character from the front each time
 573   # but we don't actually edit the string. [this alone seems to be 14% speedup]
 574   # Hence -$pos is the length of the remaining string.
 575   my $pos = -length $inbytes;
 576   while (1) {
 577     my $byte = substr $inbytes, $pos, 1;
 578     #  RAW_NEXT => 0,
 579     #  RAW_IN_LEN => 1,
 580     #  RAW_OUT_BYTES => 2,
 581     #  RAW_FALLBACK => 3,
 582     # to unicode an array would seem to be better, because the pages are dense.
 583     # from unicode can be very sparse, favouring a hash.
 584     # hash using the bytes (all length 1) as keys rather than ord value,
 585     # as it's easier to sort these in &process.
 586
 587     # It's faster to always add $fallback even if it's undef, rather than
 588     # choosing between 3 and 4 element array. (hence why we set it defined
 589     # above)
 590     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
 591     # When $pos was -1 we were at the last input character.
 592     unless (++$pos) {
 593       $do_now->[RAW_OUT_BYTES] = $outbytes;
 594       $do_now->[RAW_NEXT] = $next;
 595       return;
 596     }
 597     # Tail recursion. The intermdiate state may not have a name yet.
 598     $current = $do_now->[RAW_NEXT];
 599   }
 600 }
 601
 602 # This is purely for optimistation. It's just &enter hard coded for $fallback
 603 # of 0, using only a 3 entry array ref to save memory for every entry.
 604 sub enter_fb0 {
 605   my ($current,$inbytes,$outbytes,$next) = @_;
 606   $next ||= $current;
 607
 608   my $pos = -length $inbytes;
 609   while (1) {
 610     my $byte = substr $inbytes, $pos, 1;
 611     my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
 612     unless (++$pos) {
 613       $do_now->[RAW_OUT_BYTES] = $outbytes;
 614       $do_now->[RAW_NEXT] = $next;
 615       return;
 616     }
 617     $current = $do_now->[RAW_NEXT];
 618   }
 619 }
 620
 621 sub process
 622 {
 623   my ($name,$a) = @_;
 624   $name =~ s/\W+/_/g;
 625   $a->{Cname} = $name;
 626   my $raw = $a->{Raw};
 627   my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
 628   my @ent;
 629   $agg_max_in = 0;
 630   foreach my $key (sort keys %$raw) {
 631     #  RAW_NEXT => 0,
 632     #  RAW_IN_LEN => 1,
 633     #  RAW_OUT_BYTES => 2,
 634     #  RAW_FALLBACK => 3,
 635     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 636     # Now we are converting from raw to aggregate, switch from 1 byte strings
 637     # to numbers
 638     my $b = ord $key;
 639     $fallback ||= 0;
 640     if ($l &&
 641         # If this == fails, we're going to reset $agg_max_in below anyway.
 642         $b == ++$agg_max_in &&
 643         # References in numeric context give the pointer as an int.
 644         $agg_next == $next &&
 645         $agg_in_len == $in_len &&
 646         $agg_out_len == length $out_bytes &&
 647         $agg_fallback == $fallback
 648         # && length($l->[AGG_OUT_BYTES]) < 16
 649        ) {
 650       #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
 651       # we can aggregate this byte onto the end.
 652       $l->[AGG_MAX_IN] = $b;
 653       $l->[AGG_OUT_BYTES] .= $out_bytes;
 654     } else {
 655       # AGG_MIN_IN => 0,
 656       # AGG_MAX_IN => 1,
 657       # AGG_OUT_BYTES => 2,
 658       # AGG_NEXT => 3,
 659       # AGG_IN_LEN => 4,
 660       # AGG_OUT_LEN => 5,
 661       # AGG_FALLBACK => 6,
 662       # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
 663       # (only gains .6% on euc-jp  -- is it worth it?)
 664       push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
 665                        $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
 666                        $agg_fallback = $fallback];
 667     }
 668     if (exists $next->{Cname}) {
 669       $next->{'Forward'} = 1 if $next != $a;
 670     } else {
 671       process(sprintf("%s_%02x",$name,$b),$next);
 672     }
 673   }
 674   # encengine.c rules say that last entry must be for 255
 675   if ($agg_max_in < 255) {
 676     push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
 677   }
 678   $a->{'Entries'} = \@ent;
 679 }
 680
 681
 682 sub addstrings
 683 {
 684  my ($fh,$a) = @_;
 685  my $name = $a->{'Cname'};
 686  # String tables
 687  foreach my $b (@{$a->{'Entries'}})
 688   {
 689    next unless $b->[AGG_OUT_LEN];
 690    $strings{$b->[AGG_OUT_BYTES]} = undef;
 691   }
 692  if ($a->{'Forward'})
 693   {
 694    my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 695    my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
 696    my $const = $cpp ? '' : 'const';
 697    print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
 698   }
 699  $a->{'DoneStrings'} = 1;
 700  foreach my $b (@{$a->{'Entries'}})
 701   {
 702    my ($s,$e,$out,$t,$end,$l) = @$b;
 703    addstrings($fh,$t) unless $t->{'DoneStrings'};
 704   }
 705 }
 706
 707 sub outbigstring
 708 {
 709   my ($fh,$name) = @_;
 710
 711   $string_acc = '';
 712
 713   # Make the big string in the string accumulator. Longest first, on the hope
 714   # that this makes it more likely that we find the short strings later on.
 715   # Not sure if it helps sorting strings of the same length lexcically.
 716   foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
 717     my $index = index $string_acc, $s;
 718     if ($index >= 0) {
 719       $saved += length($s);
 720       $strings_in_acc{$s} = $index;
 721     } else {
 722     OPTIMISER: {
 723     if ($opt{'O'}) {
 724       my $sublength = length $s;
 725       while (--$sublength > 0) {
 726         # progressively lop characters off the end, to see if the start of
 727         # the new string overlaps the end of the accumulator.
 728         if (substr ($string_acc, -$sublength)
 729         eq substr ($s, 0, $sublength)) {
 730           $subsave += $sublength;
 731           $strings_in_acc{$s} = length ($string_acc) - $sublength;
 732           # append the last bit on the end.
 733           $string_acc .= substr ($s, $sublength);
 734           last OPTIMISER;
 735         }
 736         # or if the end of the new string overlaps the start of the
 737         # accumulator
 738         next unless substr ($string_acc, 0, $sublength)
 739           eq substr ($s, -$sublength);
 740         # well, the last $sublength characters of the accumulator match.
 741         # so as we're prepending to the accumulator, need to shift all our
 742         # existing offsets forwards
 743         $_ += $sublength foreach values %strings_in_acc;
 744         $subsave += $sublength;
 745         $strings_in_acc{$s} = 0;
 746         # append the first bit on the start.
 747         $string_acc = substr ($s, 0, -$sublength) . $string_acc;
 748         last OPTIMISER;
 749       }
 750     }
 751     # Optimiser (if it ran) found nothing, so just going have to tack the
 752     # whole thing on the end.
 753     $strings_in_acc{$s} = length $string_acc;
 754     $string_acc .= $s;
 755       };
 756     }
 757   }
 758
 759   $strings = length $string_acc;
 760   my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 761   my $var = $cpp ? '' : 'static';
 762   my $definition = "\n$var const U8 $name\[$strings] = { " .
 763     join(',',unpack "C*",$string_acc);
 764   # We have a single long line. Split it at convenient commas.
 765   print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
 766   print $fh substr ($definition, pos $definition), " };\n";
 767 }
 768
 769 sub findstring {
 770   my ($name,$s) = @_;
 771   my $offset = $strings_in_acc{$s};
 772   die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
 773     unless defined $offset;
 774   "$name + $offset";
 775 }
 776
 777 sub outtable
 778 {
 779  my ($fh,$a,$bigname) = @_;
 780  my $name = $a->{'Cname'};
 781  $a->{'Done'} = 1;
 782  foreach my $b (@{$a->{'Entries'}})
 783   {
 784    my ($s,$e,$out,$t,$end,$l) = @$b;
 785    outtable($fh,$t,$bigname) unless $t->{'Done'};
 786   }
 787  my $cpp = ($Config{d_cplusplus} || '') eq 'define';
 788  my $var = $cpp ? '' : 'static';
 789  my $const = $cpp ? '' : 'const';
 790  print $fh "\n$var $const encpage_t $name\[",
 791    scalar(@{$a->{'Entries'}}), "] = {\n";
 792  foreach my $b (@{$a->{'Entries'}})
 793   {
 794    my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
 795    # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
 796    print  $fh "{";
 797    if ($l)
 798     {
 799      printf $fh findstring($bigname,$out);
 800     }
 801    else
 802     {
 803      print  $fh "0";
 804     }
 805    print  $fh ",",$t->{Cname};
 806    printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
 807   }
 808  print $fh "};\n";
 809 }
 810
 811 sub output_enc
 812 {
 813  my ($fh,$name,$a) = @_;
 814  die "Changed - fix me for new structure";
 815  foreach my $b (sort keys %$a)
 816   {
 817    my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
 818   }
 819 }
 820
 821 sub decode_U
 822 {
 823  my $s = shift;
 824 }
 825
 826 my @uname;
 827 sub char_names
 828 {
 829  my $s = do "unicore/Name.pl";
 830  die "char_names: unicore/Name.pl: $!\n" unless defined $s;
 831  pos($s) = 0;
 832  while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
 833   {
 834    my $name = $3;
 835    my $s = hex($1);
 836    last if $s >= 0x10000;
 837    my $e = length($2) ? hex($2) : $s;
 838    for (my $i = $s; $i <= $e; $i++)
 839     {
 840      $uname[$i] = $name;
 841 #    print sprintf("U%04X $name\n",$i);
 842     }
 843   }
 844 }
 845
 846 sub output_ucm_page
 847 {
 848   my ($cmap,$a,$t,$pre) = @_;
 849   # warn sprintf("Page %x\n",$pre);
 850   my $raw = $t->{Raw};
 851   foreach my $key (sort keys %$raw) {
 852     #  RAW_NEXT => 0,
 853     #  RAW_IN_LEN => 1,
 854     #  RAW_OUT_BYTES => 2,
 855     #  RAW_FALLBACK => 3,
 856     my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
 857     my $u = ord $key;
 858     $fallback ||= 0;
 859
 860     if ($next != $a && $next != $t) {
 861       output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
 862     } elsif (length $out_bytes) {
 863       if ($pre) {
 864         $u = $pre|($u &0x3f);
 865       }
 866       my $s = sprintf "<U%04X> ",$u;
 867       #foreach my $c (split(//,$out_bytes)) {
 868       #  $s .= sprintf "\\x%02X",ord($c);
 869       #}
 870       # 9.5% faster changing that loop to this:
 871       $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
 872       $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
 873       push(@$cmap,$s);
 874     } else {
 875       warn join(',',$u, @{$raw->{$key}},$a,$t);
 876     }
 877   }
 878 }
 879
 880 sub output_ucm
 881 {
 882  my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
 883  print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
 884  print $fh "<code_set_name> \"$name\"\n";
 885  char_names();
 886  if (defined $min_el)
 887   {
 888    print $fh "<mb_cur_min> $min_el\n";
 889   }
 890  if (defined $max_el)
 891   {
 892    print $fh "<mb_cur_max> $max_el\n";
 893   }
 894  if (defined $rep)
 895   {
 896    print $fh "<subchar> ";
 897    foreach my $c (split(//,$rep))
 898     {
 899      printf $fh "\\x%02X",ord($c);
 900     }
 901    print $fh "\n";
 902   }
 903  my @cmap;
 904  output_ucm_page(\@cmap,$h,$h,0);
 905  print $fh "#\nCHARMAP\n";
 906  foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
 907   {
 908    print $fh $line;
 909   }
 910  print $fh "END CHARMAP\n";
 911 }
 912
 913 use vars qw(
 914     $_Enc2xs
 915     $_Version
 916     $_Inc
 917     $_E2X
 918     $_Name
 919     $_TableFiles
 920     $_Now
 921 );
 922
 923 sub find_e2x{
 924     eval { require File::Find; };
 925     my (@inc, %e2x_dir);
 926     for my $inc (@INC){
 927     push @inc, $inc unless $inc eq '.'; #skip current dir
 928     }
 929     File::Find::find(
 930          sub {
 931          my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
 932              $atime,$mtime,$ctime,$blksize,$blocks)
 933              = lstat($_) or return;
 934          -f _ or return;
 935          if (/^.*\.e2x$/o){
 936              no warnings 'once';
 937              $e2x_dir{$File::Find::dir} ||= $mtime;
 938          }
 939          return;
 940          }, @inc);
 941     warn join("\n", keys %e2x_dir), "\n";
 942     for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
 943     $_E2X = $d;
 944     # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
 945     return $_E2X;
 946     }
 947 }
 948
 949 sub make_makefile_pl
 950 {
 951     eval { require Encode; };
 952     $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
 953     # our used for variable expanstion
 954     $_Enc2xs = $0;
 955     $_Version = $VERSION;
 956     $_E2X = find_e2x();
 957     $_Name = shift;
 958     $_TableFiles = join(",", map {qq('$_')} @_);
 959     $_Now = scalar localtime();
 960
 961     eval { require File::Spec; };
 962     _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
 963     _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
 964     _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
 965     _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
 966     _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
 967     exit;
 968 }
 969
 970 use vars qw(
 971         $_ModLines
 972         $_LocalVer
 973         );
 974
 975 sub make_configlocal_pm {
 976     eval { require Encode; };
 977     $@ and die "Unable to require Encode: $@\n";
 978     eval { require File::Spec; };
 979
 980     # our used for variable expanstion
 981     my %in_core = map { $_ => 1 } (
 982         'ascii',      'iso-8859-1', 'utf8',
 983         'ascii-ctrl', 'null',       'utf-8-strict'
 984     );
 985     my %LocalMod = ();
 986     # check @enc;
 987     use File::Find ();
 988     my $wanted = sub{
 989         -f $_ or return;
 990         $File::Find::name =~ /\A\./        and return;
 991         $File::Find::name =~ /\.pm\z/      or  return;
 992         $File::Find::name =~ m/\bEncode\b/ or  return;
 993         my $mod = $File::Find::name;
 994         $mod =~ s/.*\bEncode\b/Encode/o;
 995         $mod =~ s/\.pm\z//o;
 996         $mod =~ s,/,::,og;
 997         warn qq{ require $mod;\n};
 998         eval qq{ require $mod; };
 999         $@ and die "Can't require $mod: $@\n";
1000         for my $enc ( Encode->encodings() ) {
1001             no warnings;
1002             $in_core{$enc}                   and next;
1003             $Encode::Config::ExtModule{$enc} and next;
1004             $LocalMod{$enc} ||= $mod;
1005         }
1006     };
1007     File::Find::find({wanted => $wanted}, @INC);
1008     $_ModLines = "";
1009     for my $enc ( sort keys %LocalMod ) {
1010         $_ModLines .=
1011           qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
1012     }
1013     warn $_ModLines;
1014     $_LocalVer = _mkversion();
1015     $_E2X      = find_e2x();
1016     $_Inc      = $INC{"Encode.pm"};
1017     $_Inc =~ s/\.pm$//o;
1018     _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1019         File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
1020     exit;
1021 }
1022
1023 sub _mkversion{
1024     # v-string is now depreciated; use time() instead;
1025     #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1026     #$yyyy += 1900, $mo +=1;
1027     #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1028     return time();
1029 }
1030
1031 sub _print_expand{
1032     eval { require File::Basename; };
1033     $@ and die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
1034     File::Basename->import();
1035     my ($src, $dst, $clobber) = @_;
1036     if (!$clobber and -e $dst){
1037     warn "$dst exists. skipping\n";
1038     return;
1039     }
1040     warn "Generating $dst...\n";
1041     open my $in, $src or die "$src : $!";
1042     if ((my $d = dirname($dst)) ne '.'){
1043     -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
1044     }
1045     open my $out, ">$dst" or die "$!";
1046     my $asis = 0;
1047     while (<$in>){
1048     if (/^#### END_OF_HEADER/){
1049         $asis = 1; next;
1050     }
1051     s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1052     print $out $_;
1053     }
1054 }
1055 __END__
1056
1057 =head1 NAME
1058
1059 enc2xs -- Perl Encode Module Generator
1060
1061 =head1 SYNOPSIS
1062
1063   enc2xs -[options]
1064   enc2xs -M ModName mapfiles...
1065   enc2xs -C
1066
1067 =head1 DESCRIPTION
1068
1069 F<enc2xs> builds a Perl extension for use by Encode from either
1070 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1071 Besides being used internally during the build process of the Encode
1072 module, you can use F<enc2xs> to add your own encoding to perl.
1073 No knowledge of XS is necessary.
1074
1075 =head1 Quick Guide
1076
1077 If you want to know as little about Perl as possible but need to
1078 add a new encoding, just read this chapter and forget the rest.
1079
1080 =over 4
1081
1082 =item 0.
1083
1084 Have a .ucm file ready.  You can get it from somewhere or you can write
1085 your own from scratch or you can grab one from the Encode distribution
1086 and customize it.  For the UCM format, see the next Chapter.  In the
1087 example below, I'll call my theoretical encoding myascii, defined
1088 in I<my.ucm>.  C<$> is a shell prompt.
1089
1090   $ ls -F
1091   my.ucm
1092
1093 =item 1.
1094
1095 Issue a command as follows;
1096
1097   $ enc2xs -M My my.ucm
1098   generating Makefile.PL
1099   generating My.pm
1100   generating README
1101   generating Changes
1102
1103 Now take a look at your current directory.  It should look like this.
1104
1105   $ ls -F
1106   Makefile.PL   My.pm         my.ucm        t/
1107
1108 The following files were created.
1109
1110   Makefile.PL - MakeMaker script
1111   My.pm       - Encode submodule
1112   t/My.t      - test file
1113
1114 =over 4
1115
1116 =item 1.1.
1117
1118 If you want *.ucm installed together with the modules, do as follows;
1119
1120   $ mkdir Encode
1121   $ mv *.ucm Encode
1122   $ enc2xs -M My Encode/*ucm
1123
1124 =back
1125
1126 =item 2.
1127
1128 Edit the files generated.  You don't have to if you have no time AND no
1129 intention to give it to someone else.  But it is a good idea to edit
1130 the pod and to add more tests.
1131
1132 =item 3.
1133
1134 Now issue a command all Perl Mongers love:
1135
1136   $ perl Makefile.PL
1137   Writing Makefile for Encode::My
1138
1139 =item 4.
1140
1141 Now all you have to do is make.
1142
1143   $ make
1144   cp My.pm blib/lib/Encode/My.pm
1145   /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1146     -o encode_t.c -f encode_t.fnm
1147   Reading myascii (myascii)
1148   Writing compiled form
1149   128 bytes in string tables
1150   384 bytes (75%) saved spotting duplicates
1151   1 bytes (0.775%) saved using substrings
1152   ....
1153   chmod 644 blib/arch/auto/Encode/My/My.bs
1154   $
1155
1156 The time it takes varies depending on how fast your machine is and
1157 how large your encoding is.  Unless you are working on something big
1158 like euc-tw, it won't take too long.
1159
1160 =item 5.
1161
1162 You can "make install" already but you should test first.
1163
1164   $ make test
1165   PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1166     -e 'use Test::Harness  qw(&runtests $verbose); \
1167     $verbose=0; runtests @ARGV;' t/*.t
1168   t/My....ok
1169   All tests successful.
1170   Files=1, Tests=2,  0 wallclock secs
1171    ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1172
1173 =item 6.
1174
1175 If you are content with the test result, just "make install"
1176
1177 =item 7.
1178
1179 If you want to add your encoding to Encode's demand-loading list
1180 (so you don't have to "use Encode::YourEncoding"), run
1181
1182   enc2xs -C
1183
1184 to update Encode::ConfigLocal, a module that controls local settings.
1185 After that, "use Encode;" is enough to load your encodings on demand.
1186
1187 =back
1188
1189 =head1 The Unicode Character Map
1190
1191 Encode uses the Unicode Character Map (UCM) format for source character
1192 mappings.  This format is used by IBM's ICU package and was adopted
1193 by Nick Ing-Simmons for use with the Encode module.  Since UCM is
1194 more flexible than Tcl's Encoding Map and far more user-friendly,
1195 this is the recommended formet for Encode now.
1196
1197 A UCM file looks like this.
1198
1199   #
1200   # Comments
1201   #
1202   <code_set_name> "US-ascii" # Required
1203   <code_set_alias> "ascii"   # Optional
1204   <mb_cur_min> 1             # Required; usually 1
1205   <mb_cur_max> 1             # Max. # of bytes/char
1206   <subchar> \x3F             # Substitution char
1207   #
1208   CHARMAP
1209   <U0000> \x00 |0 # <control>
1210   <U0001> \x01 |0 # <control>
1211   <U0002> \x02 |0 # <control>
1212   ....
1213   <U007C> \x7C |0 # VERTICAL LINE
1214   <U007D> \x7D |0 # RIGHT CURLY BRACKET
1215   <U007E> \x7E |0 # TILDE
1216   <U007F> \x7F |0 # <control>
1217   END CHARMAP
1218
1219 =over 4
1220
1221 =item *
1222
1223 Anything that follows C<#> is treated as a comment.
1224
1225 =item *
1226
1227 The header section continues until a line containing the word
1228 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1229 pair per line.  Strings used as values must be quoted. Barewords are
1230 treated as numbers.  I<\xXX> represents a byte.
1231
1232 Most of the keywords are self-explanatory. I<subchar> means
1233 substitution character, not subcharacter.  When you decode a Unicode
1234 sequence to this encoding but no matching character is found, the byte
1235 sequence defined here will be used.  For most cases, the value here is
1236 \x3F; in ASCII, this is a question mark.
1237
1238 =item *
1239
1240 CHARMAP starts the character map section.  Each line has a form as
1241 follows:
1242
1243   <UXXXX> \xXX.. |0 # comment
1244     ^     ^      ^
1245     |     |      +- Fallback flag
1246     |     +-------- Encoded byte sequence
1247     +-------------- Unicode Character ID in hex
1248
1249 The format is roughly the same as a header section except for the
1250 fallback flag: | followed by 0..3.   The meaning of the possible
1251 values is as follows:
1252
1253 =over 4
1254
1255 =item |0
1256
1257 Round trip safe.  A character decoded to Unicode encodes back to the
1258 same byte sequence.  Most characters have this flag.
1259
1260 =item |1
1261
1262 Fallback for unicode -> encoding.  When seen, enc2xs adds this
1263 character for the encode map only.
1264
1265 =item |2
1266
1267 Skip sub-char mapping should there be no code point.
1268
1269 =item |3
1270
1271 Fallback for encoding -> unicode.  When seen, enc2xs adds this
1272 character for the decode map only.
1273
1274 =back
1275
1276 =item *
1277
1278 And finally, END OF CHARMAP ends the section.
1279
1280 =back
1281
1282 When you are manually creating a UCM file, you should copy ascii.ucm
1283 or an existing encoding which is close to yours, rather than write
1284 your own from scratch.
1285
1286 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1287 is, unless your environment is EBCDIC.
1288
1289 B<CAVEAT>: not all features in UCM are implemented.  For example,
1290 icu:state is not used.  Because of that, you need to write a perl
1291 module if you want to support algorithmical encodings, notably
1292 the ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1293 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1294
1295 =head2 Coping with duplicate mappings
1296
1297 When you create a map, you SHOULD make your mappings round-trip safe.
1298 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1299 $data> stands for all characters that are marked as C<|0>.  Here is
1300 how to make sure:
1301
1302 =over 4
1303
1304 =item *
1305
1306 Sort your map in Unicode order.
1307
1308 =item *
1309
1310 When you have a duplicate entry, mark either one with '|1' or '|3'.
1311
1312 =item *
1313
1314 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1315
1316 =back
1317
1318 Here is an example from big5-eten.
1319
1320   <U2550> \xF9\xF9 |0
1321   <U2550> \xA2\xA4 |3
1322
1323 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1324 this;
1325
1326   E to U               U to E
1327   --------------------------------------
1328   \xF9\xF9 => U2550    U2550 => \xF9\xF9
1329   \xA2\xA4 => U2550
1330
1331 So it is round-trip safe for \xF9\xF9.  But if the line above is upside
1332 down, here is what happens.
1333
1334   E to U               U to E
1335   --------------------------------------
1336   \xA2\xA4 => U2550    U2550 => \xF9\xF9
1337   (\xF9\xF9 => U2550 is now overwritten!)
1338
1339 The Encode package comes with F<ucmlint>, a crude but sufficient
1340 utility to check the integrity of a UCM file.  Check under the
1341 Encode/bin directory for this.
1342
1343 When in doubt, you can use F<ucmsort>, yet another utility under
1344 Encode/bin directory.
1345
1346 =head1 Bookmarks
1347
1348 =over 4
1349
1350 =item *
1351
1352 ICU Home Page
1353 L<http://oss.software.ibm.com/icu/>
1354
1355 =item *
1356
1357 ICU Character Mapping Tables
1358 L<http://oss.software.ibm.com/icu/charset/>
1359
1360 =item *
1361
1362 ICU:Conversion Data
1363 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1364
1365 =back
1366
1367 =head1 SEE ALSO
1368
1369 L<Encode>,
1370 L<perlmod>,
1371 L<perlpod>
1372
1373 =cut
1374
1375 # -Q to disable the duplicate codepoint test
1376 # -S make mapping errors fatal
1377 # -q to remove comments written to output files
1378 # -O to enable the (brute force) substring optimiser
1379 # -o <output> to specify the output file name (else it's the first arg)
1380 # -f <inlist> to give a file with a list of input files (else use the args)
1381 # -n <name> to name the encoding (else use the basename of the input file.
1382
1383 With %seen holding array refs:
1384
1385       865.66 real        28.80 user         8.79 sys
1386       7904  maximum resident set size
1387       1356  average shared memory size
1388      18566  average unshared data size
1389        229  average unshared stack size
1390      46080  page reclaims
1391      33373  page faults
1392
1393 With %seen holding simple scalars:
1394
1395       342.16 real        27.11 user         3.54 sys
1396       8388  maximum resident set size
1397       1394  average shared memory size
1398      14969  average unshared data size
1399        236  average unshared stack size
1400      28159  page reclaims
1401       9839  page faults
1402
1403 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1404 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1405 RAM machine, but it's going to help even on modern machines.
1406 Swapping is bad, m'kay :-)