3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
10 my @orig_ARGV = @ARGV;
11 our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
13 # These may get re-ordered.
14 # RAW is a do_now as inserted by &enter
15 # AGG is an aggreagated do_now, as built up by &process
32 # (See the algorithm in encengine.c - we're building structures for it)
34 # There are two sorts of structures.
35 # "do_now" (an array, two variants of what needs storing) is whatever we need
36 # to do now we've read an input byte.
37 # It's housed in a "do_next" (which is how we got to it), and in turn points
38 # to a "do_next" which contains all the "do_now"s for the next input byte.
40 # There will be a "do_next" which is the start state.
41 # For a single byte encoding it's the only "do_next" - each "do_now" points
42 # back to it, and each "do_now" will cause bytes. There is no state.
44 # For a multi-byte encoding where all characters in the input are the same
45 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
46 # branching out from the start state, one step for each input byte.
47 # The leaf "do_now"s will all be at the same distance from the start state,
48 # only the leaf "do_now"s cause output bytes, and they in turn point back to
51 # For an encoding where there are varaible length input byte sequences, you
52 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
53 # as before the leaves will point back to the start state.
55 # The system will cope with escape encodings (imagine them as a mostly
56 # self-contained tree for each escape state, and cross links between trees
57 # at the state-switching characters) but so far no input format defines these.
59 # The system will also cope with having output "leaves" in the middle of
60 # the bifurcating branches, not just at the extremities, but again no
61 # input format does this yet.
63 # There are two variants of the "do_now" structure. The first, smaller variant
64 # is generated by &enter as the input file is read. There is one structure
65 # for each input byte. Say we are mapping a single byte encoding to a
66 # single byte encoding, with "ABCD" going "abcd". There will be
67 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
69 # &process then walks the tree, building aggregate "do_now" structres for
70 # adjacent bytes where possible. The aggregate is for a contiguous range of
71 # bytes which each produce the same length of output, each move to the
72 # same next state, and each have the same fallback flag.
73 # So our 4 RAW "do_now"s above become replaced by a single structure
75 # ["A", "D", "abcd", 1, ...]
76 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
77 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
78 # which maps very nicely into pointer arithmetic in C for encengine.c
82 # UTF-8 encode long hand - only covers part of perl's range
84 # chr() works in native space so convert value from table
85 # into that space before using chr().
86 my $ch = chr(utf8::unicode_to_native($_[0]));
87 # Now get core perl to encode that the way it likes.
95 ## my ($ch,$page) = @_; return chr($ch);
101 # encode double byte MS byte first
102 ## my ($ch,$page) = @_; return chr($page).chr($ch);
103 return chr ($_[1]) . chr $_[0];
108 # encode Multi-byte - single for 0..255 otherwise double
109 ## my ($ch,$page) = @_;
110 ## return &encode_D if $page;
112 return chr ($_[1]) . chr $_[0] if $_[1];
116 my %encode_types = (U => \&encode_U,
122 # Win32 does not expand globs on command line
123 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
127 # -Q to disable the duplicate codepoint test
128 # -S make mapping errors fatal
129 # -q to remove comments written to output files
130 # -O to enable the (brute force) substring optimiser
131 # -o <output> to specify the output file name (else it's the first arg)
132 # -f <inlist> to give a file with a list of input files (else use the args)
133 # -n <name> to name the encoding (else use the basename of the input file.
134 getopts('CM:SQqOo:f:n:',\%opt);
136 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
137 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
139 # This really should go first, else the die here causes empty (non-erroneous)
140 # output files to be written.
142 if (exists $opt{'f'}) {
143 # -F is followed by name of file containing list of filenames
144 my $flist = $opt{'f'};
145 open(FLIST,$flist) || die "Cannot open $flist:$!";
146 chomp(@encfiles = <FLIST>);
152 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
153 chmod(0666,$cname) if -f $cname && !-w $cname;
154 open(C,">$cname") || die "Cannot open $cname:$!";
159 my ($doC,$doEnc,$doUcm,$doPet);
161 if ($cname =~ /\.(c|xs)$/)
164 $dname =~ s/(\.[^\.]*)?$/.exh/;
165 chmod(0666,$dname) if -f $cname && !-w $dname;
166 open(D,">$dname") || die "Cannot open $dname:$!";
167 $hname =~ s/(\.[^\.]*)?$/.h/;
168 chmod(0666,$hname) if -f $cname && !-w $hname;
169 open(H,">$hname") || die "Cannot open $hname:$!";
171 foreach my $fh (\*C,\*D,\*H)
173 print $fh <<"END" unless $opt{'q'};
175 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
176 This file was autogenerated by:
182 if ($cname =~ /(\w+)\.xs$/)
184 print C "#include <EXTERN.h>\n";
185 print C "#include <perl.h>\n";
186 print C "#include <XSUB.h>\n";
187 print C "#define U8 U8\n";
189 print C "#include \"encode.h\"\n";
192 elsif ($cname =~ /\.enc$/)
196 elsif ($cname =~ /\.ucm$/)
200 elsif ($cname =~ /\.pet$/)
213 if ($a =~ /^.*-(\d+)/)
216 if ($b =~ /^.*-(\d+)/)
226 foreach my $enc (sort cmp_name @encfiles)
228 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
229 $name = $opt{'n'} if exists $opt{'n'};
234 compile_enc(\*E,lc($name));
238 compile_ucm(\*E,lc($name));
243 warn "Cannot open $enc for $name:$!";
249 print STDERR "Writing compiled form\n";
250 foreach my $name (sort cmp_name keys %encoding)
252 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
253 output(\*C,$name.'_utf8',$e2u);
254 output(\*C,'utf8_'.$name,$u2e);
255 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
257 foreach my $enc (sort cmp_name keys %encoding)
259 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
260 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
261 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
263 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
264 my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
265 my $sym = "${enc}_encoding";
267 print C "encode_t $sym = \n";
268 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
271 foreach my $enc (sort cmp_name keys %encoding)
273 my $sym = "${enc}_encoding";
275 print H "extern encode_t $sym;\n";
276 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
279 if ($cname =~ /(\w+)\.xs$/)
285 Encode_XSEncoding(pTHX_ encode_t *enc)
288 HV *stash = gv_stashpv("Encode::XS", TRUE);
289 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
295 const char *name = enc->name[i++];
296 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
299 call_pv("Encode::define_encoding",G_DISCARD);
305 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
306 print C "BOOT:\n{\n";
307 print C "#include \"$dname\"\n";
310 # Close in void context is bad, m'kay
311 close(D) or warn "Error closing '$dname': $!";
312 close(H) or warn "Error closing '$hname': $!";
314 my $perc_saved = $strings/($strings + $saved) * 100;
315 my $perc_subsaved = $strings/($strings + $subsave) * 100;
316 printf STDERR "%d bytes in string tables\n",$strings;
317 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
318 $saved, $perc_saved if $saved;
319 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
320 $subsave, $perc_subsaved if $subsave;
324 foreach my $name (sort cmp_name keys %encoding)
326 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
327 output_enc(\*C,$name,$e2u);
332 foreach my $name (sort cmp_name keys %encoding)
334 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
335 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
339 # writing half meg files and then not checking to see if you just filled the
341 close(C) or die "Error closing '$cname': $!";
343 # End of the main program.
355 last if /^\s*CHARMAP\s*$/i;
356 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
361 if (!defined($cs = $attr{'code_set_name'}))
363 warn "No <code_set_name> in $name\n";
367 $name = $cs unless exists $opt{'n'};
373 if (exists $attr{'subchar'})
376 #$attr{'subchar'} =~ /^\s*/cg;
377 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
378 #$erep = join('',map(chr(hex($_)),@byte));
379 $erep = $attr{'subchar'};
380 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
382 print "Reading $name ($cs)\n";
388 last if /^\s*END\s+CHARMAP\s*$/i;
390 my (@uni, @byte) = ();
391 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
392 or die "Bad line: $_";
393 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
394 push @uni, map { substr($_, 1) } split(/\+/, $1);
396 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
401 my $uch = join('', map { encode_U(hex($_)) } @uni );
402 my $ech = join('',map(chr(hex($_)),@byte));
403 my $el = length($ech);
404 $max_el = $el if (!defined($max_el) || $el > $max_el);
405 $min_el = $el if (!defined($min_el) || $el < $min_el);
416 # $fb is fallback flag
417 # 0 - round trip safe
418 # 1 - fallback for unicode -> enc
419 # 2 - skip sub-char mapping
420 # 3 - fallback enc -> unicode
421 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
422 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
431 die "$nfb entries without fallback, $hfb entries with\n";
433 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
445 while ($type = <$fh>)
447 last if $type !~ /^\s*#/;
450 return if $type eq 'E';
451 # Do the hash lookup once, rather than once per function call. 4% speedup.
452 my $type_func = $encode_types{$type};
453 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
454 warn "$type encoded $name\n";
456 # Save a defined test by setting these to defined values.
457 my $min_el = ~0; # A very big integer
458 my $max_el = 0; # Anything must be longer than 0
461 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
465 # use -Q to silence the seen test. Makefile.PL uses this by default.
466 $seen = {} unless $opt{Q};
471 my $page = hex($line);
476 # So why is it 1% faster to leave the my here?
478 $line =~ s/\r\n$/\n/;
479 die "$.:${line}Line should be exactly 65 characters long including
480 newline (".length($line).")" unless length ($line) == 65;
481 # Split line into groups of 4 hex digits, convert groups to ints
483 # map {hex $_} $line =~ /(....)/g
484 # This takes 63.75 (2.5% less time)
485 # unpack "n*", pack "H*", $line
486 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
487 # Doing it as while ($line =~ /(....)/g) took 74.63
488 foreach my $val (unpack "n*", pack "H*", $line)
490 next if $val == 0xFFFD;
491 my $ech = &$type_func($ch,$page);
492 if ($val || (!$ch && !$page))
494 my $el = length($ech);
495 $max_el = $el if $el > $max_el;
496 $min_el = $el if $el < $min_el;
497 my $uch = encode_U($val);
499 # We're doing the test.
500 # We don't need to read this quickly, so storing it as a scalar,
501 # rather than 3 (anon array, plus the 2 scalars it holds) saves
502 # RAM and may make us faster on low RAM systems. [see __END__]
503 if (exists $seen->{$uch})
505 warn sprintf("U%04X is %02X%02X and %04X\n",
506 $val,$page,$ch,$seen->{$uch});
511 $seen->{$uch} = $page << 8 | $ch;
514 # Passing 2 extra args each time is 3.6% slower!
515 # Even with having to add $fallback ||= 0 later
516 enter_fb0($e2u,$ech,$uch);
517 enter_fb0($u2e,$uch,$ech);
521 # No character at this position
522 # enter($e2u,$ech,undef,$e2u);
528 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
529 if $min_el > $max_el;
530 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
531 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
534 # my ($a,$s,$d,$t,$fb) = @_;
536 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
537 # state we shift to after this (multibyte) input character defaults to same
540 # Making sure it is defined seems to be faster than {no warnings;} in
541 # &process, or passing it in as 0 explicity.
542 # XXX $fallback ||= 0;
544 # Start at the beginning and work forwards through the string to zero.
545 # effectively we are removing 1 character from the front each time
546 # but we don't actually edit the string. [this alone seems to be 14% speedup]
547 # Hence -$pos is the length of the remaining string.
548 my $pos = -length $inbytes;
550 my $byte = substr $inbytes, $pos, 1;
553 # RAW_OUT_BYTES => 2,
555 # to unicode an array would seem to be better, because the pages are dense.
556 # from unicode can be very sparse, favouring a hash.
557 # hash using the bytes (all length 1) as keys rather than ord value,
558 # as it's easier to sort these in &process.
560 # It's faster to always add $fallback even if it's undef, rather than
561 # choosing between 3 and 4 element array. (hence why we set it defined
563 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
564 # When $pos was -1 we were at the last input character.
566 $do_now->[RAW_OUT_BYTES] = $outbytes;
567 $do_now->[RAW_NEXT] = $next;
570 # Tail recursion. The intermdiate state may not have a name yet.
571 $current = $do_now->[RAW_NEXT];
575 # This is purely for optimistation. It's just &enter hard coded for $fallback
576 # of 0, using only a 3 entry array ref to save memory for every entry.
578 my ($current,$inbytes,$outbytes,$next) = @_;
581 my $pos = -length $inbytes;
583 my $byte = substr $inbytes, $pos, 1;
584 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
586 $do_now->[RAW_OUT_BYTES] = $outbytes;
587 $do_now->[RAW_NEXT] = $next;
590 $current = $do_now->[RAW_NEXT];
597 my ($fh,$name,$s) = @_;
598 my $sym = $strings{$s};
601 $saved += length($s);
606 foreach my $o (keys %strings)
608 next unless (my $i = index($o,$s)) >= 0;
610 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
611 # a hexadecimal floating point constant. Silly gcc. Only p
612 # introduces a floating point constant. Put the space in to stop it
614 $sym .= sprintf(" +0x%02x",$i) if ($i);
615 $subsave += length($s);
616 return $strings{$s} = $sym;
619 $strings{$s} = $sym = $name;
620 $strings += length($s);
621 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
622 # Maybe we should assert that these are all <256.
623 $definition .= join(',',unpack "C*",$s);
624 # We have a single long line. Split it at convenient commas.
625 $definition =~ s/(.{74,77},)/$1\n/g;
626 print $fh "$definition };\n\n";
637 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
640 foreach my $key (sort keys %$raw) {
643 # RAW_OUT_BYTES => 2,
645 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
646 # Now we are converting from raw to aggregate, switch from 1 byte strings
651 # If this == fails, we're going to reset $agg_max_in below anyway.
652 $b == ++$agg_max_in &&
653 # References in numeric context give the pointer as an int.
654 $agg_next == $next &&
655 $agg_in_len == $in_len &&
656 $agg_out_len == length $out_bytes &&
657 $agg_fallback == $fallback
658 # && length($l->[AGG_OUT_BYTES]) < 16
660 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
661 # we can aggregate this byte onto the end.
662 $l->[AGG_MAX_IN] = $b;
663 $l->[AGG_OUT_BYTES] .= $out_bytes;
667 # AGG_OUT_BYTES => 2,
672 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
673 # (only gains .6% on euc-jp -- is it worth it?)
674 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
675 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
676 $agg_fallback = $fallback];
678 if (exists $next->{Cname}) {
679 $next->{'Forward'} = 1 if $next != $a;
681 process(sprintf("%s_%02x",$name,$b),$next);
684 # encengine.c rules say that last entry must be for 255
685 if ($agg_max_in < 255) {
686 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
688 $a->{'Entries'} = \@ent;
694 my $name = $a->{'Cname'};
696 foreach my $b (@{$a->{'Entries'}})
698 next unless $b->[AGG_OUT_LEN];
699 my $s = $b->[AGG_MIN_IN];
700 my $e = $b->[AGG_MAX_IN];
701 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
705 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
708 foreach my $b (@{$a->{'Entries'}})
710 my ($s,$e,$out,$t,$end,$l) = @$b;
711 outtable($fh,$t) unless $t->{'Done'};
713 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
714 foreach my $b (@{$a->{'Entries'}})
716 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
721 printf $fh outstring($fh,'',$out);
727 print $fh ",",$t->{Cname};
728 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
735 my ($fh,$name,$a) = @_;
743 my ($fh,$name,$a) = @_;
744 die "Changed - fix me for new structure";
745 foreach my $b (sort keys %$a)
747 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
759 my $s = do "unicore/Name.pl";
760 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
762 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
766 last if $s >= 0x10000;
767 my $e = length($2) ? hex($2) : $s;
768 for (my $i = $s; $i <= $e; $i++)
771 # print sprintf("U%04X $name\n",$i);
778 my ($cmap,$a,$t,$pre) = @_;
779 # warn sprintf("Page %x\n",$pre);
781 foreach my $key (sort keys %$raw) {
784 # RAW_OUT_BYTES => 2,
786 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
790 if ($next != $a && $next != $t) {
791 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
792 } elsif (length $out_bytes) {
794 $u = $pre|($u &0x3f);
796 my $s = sprintf "<U%04X> ",$u;
797 #foreach my $c (split(//,$out_bytes)) {
798 # $s .= sprintf "\\x%02X",ord($c);
800 # 9.5% faster changing that loop to this:
801 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
802 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
805 warn join(',',$u, @{$raw->{$key}},$a,$t);
812 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
813 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
814 print $fh "<code_set_name> \"$name\"\n";
818 print $fh "<mb_cur_min> $min_el\n";
822 print $fh "<mb_cur_max> $max_el\n";
826 print $fh "<subchar> ";
827 foreach my $c (split(//,$rep))
829 printf $fh "\\x%02X",ord($c);
834 output_ucm_page(\@cmap,$h,$h,0);
835 print $fh "#\nCHARMAP\n";
836 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
840 print $fh "END CHARMAP\n";
854 eval { require File::Find };
857 push @inc, $inc unless $inc eq '.'; #skip current dir
861 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
862 $atime,$mtime,$ctime,$blksize,$blocks)
863 = lstat($_) or return;
866 $e2x_dir{$File::Find::dir} ||= $mtime;
870 warn join("\n", keys %e2x_dir), "\n";
871 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
873 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
880 eval { require Encode; };
881 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
882 # our used for variable expanstion
884 $_Version = $VERSION;
887 $_TableFiles = join(",", map {qq('$_')} @_);
888 $_Now = scalar localtime();
890 eval { require File::Spec; };
891 warn "Generating Makefile.PL\n";
892 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
893 warn "Generating $_Name.pm\n";
894 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
895 warn "Generating t/$_Name.t\n";
896 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
897 warn "Generating README\n";
898 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
899 warn "Generating t/$_Name.t\n";
900 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
909 sub make_configlocal_pm
911 eval { require Encode; };
912 $@ and die "Unable to require Encode: $@\n";
913 eval { require File::Spec; };
914 # our used for variable expanstion
915 my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
918 my $inc = File::Spec->catfile($d, "Encode");
920 opendir my $dh, $inc or die "$inc:$!";
921 warn "Checking $inc...\n";
922 for my $f (grep /\.pm$/o, readdir($dh)){
923 -f File::Spec->catfile($inc, "$f") or next;
924 $INC{"Encode/$f"} and next;
925 warn "require Encode/$f;\n";
926 eval { require "Encode/$f"; };
927 $@ and die "Can't require Encode/$f: $@\n";
928 for my $enc (Encode->encodings()){
929 $in_core{$enc} and next;
930 $Encode::Config::ExtModule{$enc} and next;
931 my $mod = "Encode/$f";
932 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
933 $LocalMod{$enc} ||= $mod;
938 for my $enc (sort keys %LocalMod){
940 qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
943 $_LocalVer = _mkversion();
945 $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
946 warn "Writing ", File::Spec->catfile($_Inc,"ConfigLocal.pm"), "\n";
947 _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
948 File::Spec->catfile($_Inc,"ConfigLocal.pm"));
953 my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
954 $yyyy += 1900, $mo +=1;
955 return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
959 eval { require File::Basename; };
960 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
961 File::Basename->import();
962 my ($src, $dst) = @_;
963 open my $in, $src or die "$src : $!";
964 if ((my $d = dirname($dst)) ne '.'){
965 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
967 open my $out, ">$dst" or die "$!";
970 if (/^#### END_OF_HEADER/){
973 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
981 enc2xs -- Perl Encode Module Generator
986 enc2xs -M ModName mapfiles...
991 F<enc2xs> builds a Perl extension for use by Encode from either
992 Unicode Character Mapping files (.ucm) or Tcl Encoding Files
993 (.enc) Besides internally used during the build process of Encode
994 module, you can use F<enc2xs> to add your own encoding to perl. No
995 knowledge on XS is necessary.
999 If what you want to know as little about Perl possible but needs to
1000 add a new encoding, just read this chapter and forget the rest.
1006 Have a .ucm file ready. You can get it from somewhere or you can
1007 write your own from scratch or you can grab one from Encode
1008 distribution and customize. For UCM format, see the next Chapter.
1009 In the example below, I'll call my theoretical encoding myascii,
1010 defined inI<my.ucm>. C<$> is a shell prompt.
1017 Issue a command as follows;
1019 $ enc2xs -M My my.ucm
1020 generating Makefile.PL
1025 Now take a look at your current directory. It should look like this.
1028 Makefile.PL My.pm my.ucm t/
1030 The following files are created.
1032 Makefle.PL - MakeMaker script
1033 My.pm - Encode Submodule
1038 If you want *.ucm installed together with the modules, do as follows;
1042 $ enc2xs -M My Encode/*ucm
1046 Edit the files generated. You don't have to if you have no time AND no
1047 intention to give it to someone else. But it is a good idea to edit
1048 pod and add more tests.
1052 Now issue a command all Perl Mongers love;
1054 $ perl5.7.3 Makefile.PL
1055 Writing Makefile for Encode::My
1059 Now all you have to do is make.
1062 cp My.pm blib/lib/Encode/My.pm
1063 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1064 -o encode_t.c -f encode_t.fnm
1065 Reading myascii (myascii)
1066 Writing compiled form
1067 128 bytes in string tables
1068 384 bytes (25%) saved spotting duplicates
1069 1 bytes (99.2%) saved using substrings
1071 chmod 644 blib/arch/auto/Encode/My/My.bs
1074 The time it takes varies how fast your machine is and how large your
1075 encoding is. Unless you are working on something big like euc-tw, it
1076 won't take too long.
1080 You can "make install" already but you should test first.
1083 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1084 -e 'use Test::Harness qw(&runtests $verbose); \
1085 $verbose=0; runtests @ARGV;' t/*.t
1087 All tests successful.
1088 Files=1, Tests=2, 0 wallclock secs
1089 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1093 If you are content with the test result, just "make install"
1097 If you want to add your encoding to Encode demand-loading list
1098 (so you don't have to "use Encode::YourEncoding"), run
1102 to update Encode::ConfigLocal, a module that controls local settings.
1103 After that, "use Encode;" is enough to load your encodings on demand.
1107 =head1 The Unicode Character Map
1109 Encode uses The Unicode Character Map (UCM) for source character
1110 mappings. This format is used by ICU package of IBM and adopted by
1111 Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map
1112 and far more user-friendly, This is the recommended formet for
1115 UCM file looks like this.
1120 <code_set_name> "US-ascii" # Required
1121 <code_set_alias> "ascii" # Optional
1122 <mb_cur_min> 1 # Required; usually 1
1123 <mb_cur_max> 1 # Max. # of bytes/char
1124 <subchar> \x3F # Substitution char
1127 <U0000> \x00 |0 # <control>
1128 <U0001> \x01 |0 # <control>
1129 <U0002> \x02 |0 # <control>
1131 <U007C> \x7C |0 # VERTICAL LINE
1132 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1133 <U007E> \x7E |0 # TILDE
1134 <U007F> \x7F |0 # <control>
1141 Anything that follows C<#> is treated as comments.
1145 The header section continues until CHARMAP. This section Has a form of
1146 I<E<lt>keywordE<gt> value>, one at a line. For a value, strings must
1147 be quoted. Barewords are treated as numbers. I<\xXX> represents a
1150 Most of the keywords are self-explanatory. I<subchar> means
1151 substitution character, not subcharacter. When you decode a Unicode
1152 sequence to this encoding but no matching character is found, the byte
1153 sequence defined here will be used. For most cases, the value here is
1154 \x3F, in ASCII this is a question mark.
1158 CHARMAP starts the character map section. Each line has a form as
1161 <UXXXX> \xXX.. |0 # comment
1163 | | +- Fallback flag
1164 | +-------- Encoded byte sequence
1165 +-------------- Unicode Character ID in hex
1167 The format is roughly the same as a header section except for fallback
1168 flag. It is | followed by 0..3. And their meaning as follows
1174 Round trip safe. A character decoded to Unicode encodes back to the
1175 same byte sequence. most character belong to this.
1179 Fallback for unicode -> encoding. When seen, enc2xs adds this
1180 character for encode map only
1184 Skip sub-char mapping should there be no code point.
1188 Fallback for encoding -> unicode. When seen, enc2xs adds this
1189 character for decode map only
1195 And finally, END OF CHARMAP ends the section.
1199 When you are manually creating a UCM file, you should copy ascii.ucm
1200 or existing encoding which is close to yours than write your own from
1203 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1204 is, unless your environment is on EBCDIC.
1206 B<CAVEAT>: not all features in UCM are implemented. For example,
1207 icu:state is not used. Because of that, you need to write a perl
1208 module if you want to support algorithmical encodings, notablly
1209 ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1210 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1212 =head2 Coping with duplicate mappings
1214 When you create a map, you SHOULD make your mappings round-trip safe.
1215 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1216 $data> stands for all characters that are marked as C<|0>. Here is
1223 Sort your map in Unicode order.
1227 When you have a duplicate entry, mark either one with '|1' or '|3'.
1231 And make sure '|1' or '|3' FOLLOWS '|0' entry.
1235 Here is an example from big5-eten.
1240 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1244 --------------------------------------
1245 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1248 So it is round-trip safe for \xF9\xF9. But if the line above is upside
1249 down, here is what happens.
1252 --------------------------------------
1253 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1254 (\xF9\xF9 => U2550 is now overwritten!)
1256 The Encode package comes with F<ucmlint>, a crude but sufficient
1257 utility to check the integrity of ucm file. Check under Encode/bin
1264 L<http://oss.software.ibm.com/icu/>
1266 ICU Character Mapping Tables
1267 L<http://oss.software.ibm.com/icu/charset/>
1270 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1280 # -Q to disable the duplicate codepoint test
1281 # -S make mapping errors fatal
1282 # -q to remove comments written to output files
1283 # -O to enable the (brute force) substring optimiser
1284 # -o <output> to specify the output file name (else it's the first arg)
1285 # -f <inlist> to give a file with a list of input files (else use the args)
1286 # -n <name> to name the encoding (else use the basename of the input file.
1288 With %seen holding array refs:
1290 865.66 real 28.80 user 8.79 sys
1291 7904 maximum resident set size
1292 1356 average shared memory size
1293 18566 average unshared data size
1294 229 average unshared stack size
1298 With %seen holding simple scalars:
1300 342.16 real 27.11 user 3.54 sys
1301 8388 maximum resident set size
1302 1394 average shared memory size
1303 14969 average unshared data size
1304 236 average unshared stack size
1308 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1309 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1310 RAM machine, but it's going to help even on modern machines.
1311 Swapping is bad, m'kay :-)