3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
10 my @orig_ARGV = @ARGV;
11 our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
13 # These may get re-ordered.
14 # RAW is a do_now as inserted by &enter
15 # AGG is an aggreagated do_now, as built up by &process
32 # (See the algorithm in encengine.c - we're building structures for it)
34 # There are two sorts of structures.
35 # "do_now" (an array, two variants of what needs storing) is whatever we need
36 # to do now we've read an input byte.
37 # It's housed in a "do_next" (which is how we got to it), and in turn points
38 # to a "do_next" which contains all the "do_now"s for the next input byte.
40 # There will be a "do_next" which is the start state.
41 # For a single byte encoding it's the only "do_next" - each "do_now" points
42 # back to it, and each "do_now" will cause bytes. There is no state.
44 # For a multi-byte encoding where all characters in the input are the same
45 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
46 # branching out from the start state, one step for each input byte.
47 # The leaf "do_now"s will all be at the same distance from the start state,
48 # only the leaf "do_now"s cause output bytes, and they in turn point back to
51 # For an encoding where there are varaible length input byte sequences, you
52 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
53 # as before the leaves will point back to the start state.
55 # The system will cope with escape encodings (imagine them as a mostly
56 # self-contained tree for each escape state, and cross links between trees
57 # at the state-switching characters) but so far no input format defines these.
59 # The system will also cope with having output "leaves" in the middle of
60 # the bifurcating branches, not just at the extremities, but again no
61 # input format does this yet.
63 # There are two variants of the "do_now" structure. The first, smaller variant
64 # is generated by &enter as the input file is read. There is one structure
65 # for each input byte. Say we are mapping a single byte encoding to a
66 # single byte encoding, with "ABCD" going "abcd". There will be
67 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
69 # &process then walks the tree, building aggregate "do_now" structres for
70 # adjacent bytes where possible. The aggregate is for a contiguous range of
71 # bytes which each produce the same length of output, each move to the
72 # same next state, and each have the same fallback flag.
73 # So our 4 RAW "do_now"s above become replaced by a single structure
75 # ["A", "D", "abcd", 1, ...]
76 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
77 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
78 # which maps very nicely into pointer arithmetic in C for encengine.c
82 # UTF-8 encode long hand - only covers part of perl's range
84 # chr() works in native space so convert value from table
85 # into that space before using chr().
86 my $ch = chr(utf8::unicode_to_native($_[0]));
87 # Now get core perl to encode that the way it likes.
95 ## my ($ch,$page) = @_; return chr($ch);
101 # encode double byte MS byte first
102 ## my ($ch,$page) = @_; return chr($page).chr($ch);
103 return chr ($_[1]) . chr $_[0];
108 # encode Multi-byte - single for 0..255 otherwise double
109 ## my ($ch,$page) = @_;
110 ## return &encode_D if $page;
112 return chr ($_[1]) . chr $_[0] if $_[1];
116 my %encode_types = (U => \&encode_U,
122 # Win32 does not expand globs on command line
123 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
127 # -Q to disable the duplicate codepoint test
128 # -S make mapping errors fatal
129 # -q to remove comments written to output files
130 # -O to enable the (brute force) substring optimiser
131 # -o <output> to specify the output file name (else it's the first arg)
132 # -f <inlist> to give a file with a list of input files (else use the args)
133 # -n <name> to name the encoding (else use the basename of the input file.
134 getopts('CM:SQqOo:f:n:',\%opt);
136 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
137 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
139 # This really should go first, else the die here causes empty (non-erroneous)
140 # output files to be written.
142 if (exists $opt{'f'}) {
143 # -F is followed by name of file containing list of filenames
144 my $flist = $opt{'f'};
145 open(FLIST,$flist) || die "Cannot open $flist:$!";
146 chomp(@encfiles = <FLIST>);
152 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
153 chmod(0666,$cname) if -f $cname && !-w $cname;
154 open(C,">$cname") || die "Cannot open $cname:$!";
159 my ($doC,$doEnc,$doUcm,$doPet);
161 if ($cname =~ /\.(c|xs)$/)
164 $dname =~ s/(\.[^\.]*)?$/.exh/;
165 chmod(0666,$dname) if -f $cname && !-w $dname;
166 open(D,">$dname") || die "Cannot open $dname:$!";
167 $hname =~ s/(\.[^\.]*)?$/.h/;
168 chmod(0666,$hname) if -f $cname && !-w $hname;
169 open(H,">$hname") || die "Cannot open $hname:$!";
171 foreach my $fh (\*C,\*D,\*H)
173 print $fh <<"END" unless $opt{'q'};
175 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
176 This file was autogenerated by:
182 if ($cname =~ /(\w+)\.xs$/)
184 print C "#include <EXTERN.h>\n";
185 print C "#include <perl.h>\n";
186 print C "#include <XSUB.h>\n";
187 print C "#define U8 U8\n";
189 print C "#include \"encode.h\"\n";
192 elsif ($cname =~ /\.enc$/)
196 elsif ($cname =~ /\.ucm$/)
200 elsif ($cname =~ /\.pet$/)
213 if ($a =~ /^.*-(\d+)/)
216 if ($b =~ /^.*-(\d+)/)
226 foreach my $enc (sort cmp_name @encfiles)
228 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
229 $name = $opt{'n'} if exists $opt{'n'};
234 compile_enc(\*E,lc($name));
238 compile_ucm(\*E,lc($name));
243 warn "Cannot open $enc for $name:$!";
249 print STDERR "Writing compiled form\n";
250 foreach my $name (sort cmp_name keys %encoding)
252 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
253 output(\*C,$name.'_utf8',$e2u);
254 output(\*C,'utf8_'.$name,$u2e);
255 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
257 foreach my $enc (sort cmp_name keys %encoding)
259 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
260 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
261 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
263 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
264 my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
265 my $sym = "${enc}_encoding";
267 print C "encode_t $sym = \n";
268 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
271 foreach my $enc (sort cmp_name keys %encoding)
273 my $sym = "${enc}_encoding";
275 print H "extern encode_t $sym;\n";
276 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
279 if ($cname =~ /(\w+)\.xs$/)
285 Encode_XSEncoding(pTHX_ encode_t *enc)
288 HV *stash = gv_stashpv("Encode::XS", TRUE);
289 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
295 const char *name = enc->name[i++];
296 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
299 call_pv("Encode::define_encoding",G_DISCARD);
305 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
306 print C "BOOT:\n{\n";
307 print C "#include \"$dname\"\n";
310 # Close in void context is bad, m'kay
311 close(D) or warn "Error closing '$dname': $!";
312 close(H) or warn "Error closing '$hname': $!";
314 my $perc_saved = $strings/($strings + $saved) * 100;
315 my $perc_subsaved = $strings/($strings + $subsave) * 100;
316 printf STDERR "%d bytes in string tables\n",$strings;
317 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
318 $saved, $perc_saved if $saved;
319 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
320 $subsave, $perc_subsaved if $subsave;
324 foreach my $name (sort cmp_name keys %encoding)
326 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
327 output_enc(\*C,$name,$e2u);
332 foreach my $name (sort cmp_name keys %encoding)
334 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
335 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
339 # writing half meg files and then not checking to see if you just filled the
341 close(C) or die "Error closing '$cname': $!";
343 # End of the main program.
355 last if /^\s*CHARMAP\s*$/i;
356 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
361 if (!defined($cs = $attr{'code_set_name'}))
363 warn "No <code_set_name> in $name\n";
367 $name = $cs unless exists $opt{'n'};
373 if (exists $attr{'subchar'})
376 #$attr{'subchar'} =~ /^\s*/cg;
377 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
378 #$erep = join('',map(chr(hex($_)),@byte));
379 $erep = $attr{'subchar'};
380 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
382 print "Reading $name ($cs)\n";
388 last if /^\s*END\s+CHARMAP\s*$/i;
390 my (@uni, @byte) = ();
391 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
392 or die "Bad line: $_";
393 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
394 push @uni, map { substr($_, 1) } split(/\+/, $1);
396 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
401 my $uch = join('', map { encode_U(hex($_)) } @uni );
402 my $ech = join('',map(chr(hex($_)),@byte));
403 my $el = length($ech);
404 $max_el = $el if (!defined($max_el) || $el > $max_el);
405 $min_el = $el if (!defined($min_el) || $el < $min_el);
416 # $fb is fallback flag
417 # 0 - round trip safe
418 # 1 - fallback for unicode -> enc
419 # 2 - skip sub-char mapping
420 # 3 - fallback enc -> unicode
421 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
422 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
431 die "$nfb entries without fallback, $hfb entries with\n";
433 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
445 while ($type = <$fh>)
447 last if $type !~ /^\s*#/;
450 return if $type eq 'E';
451 # Do the hash lookup once, rather than once per function call. 4% speedup.
452 my $type_func = $encode_types{$type};
453 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
454 warn "$type encoded $name\n";
456 # Save a defined test by setting these to defined values.
457 my $min_el = ~0; # A very big integer
458 my $max_el = 0; # Anything must be longer than 0
461 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
465 # use -Q to silence the seen test. Makefile.PL uses this by default.
466 $seen = {} unless $opt{Q};
471 my $page = hex($line);
476 # So why is it 1% faster to leave the my here?
478 $line =~ s/\r\n$/\n/;
479 die "$.:${line}Line should be exactly 65 characters long including
480 newline (".length($line).")" unless length ($line) == 65;
481 # Split line into groups of 4 hex digits, convert groups to ints
483 # map {hex $_} $line =~ /(....)/g
484 # This takes 63.75 (2.5% less time)
485 # unpack "n*", pack "H*", $line
486 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
487 # Doing it as while ($line =~ /(....)/g) took 74.63
488 foreach my $val (unpack "n*", pack "H*", $line)
490 next if $val == 0xFFFD;
491 my $ech = &$type_func($ch,$page);
492 if ($val || (!$ch && !$page))
494 my $el = length($ech);
495 $max_el = $el if $el > $max_el;
496 $min_el = $el if $el < $min_el;
497 my $uch = encode_U($val);
499 # We're doing the test.
500 # We don't need to read this quickly, so storing it as a scalar,
501 # rather than 3 (anon array, plus the 2 scalars it holds) saves
502 # RAM and may make us faster on low RAM systems. [see __END__]
503 if (exists $seen->{$uch})
505 warn sprintf("U%04X is %02X%02X and %04X\n",
506 $val,$page,$ch,$seen->{$uch});
511 $seen->{$uch} = $page << 8 | $ch;
514 # Passing 2 extra args each time is 3.6% slower!
515 # Even with having to add $fallback ||= 0 later
516 enter_fb0($e2u,$ech,$uch);
517 enter_fb0($u2e,$uch,$ech);
521 # No character at this position
522 # enter($e2u,$ech,undef,$e2u);
528 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
529 if $min_el > $max_el;
530 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
531 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
534 # my ($a,$s,$d,$t,$fb) = @_;
536 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
537 # state we shift to after this (multibyte) input character defaults to same
540 # Making sure it is defined seems to be faster than {no warnings;} in
541 # &process, or passing it in as 0 explicity.
542 # XXX $fallback ||= 0;
544 # Start at the beginning and work forwards through the string to zero.
545 # effectively we are removing 1 character from the front each time
546 # but we don't actually edit the string. [this alone seems to be 14% speedup]
547 # Hence -$pos is the length of the remaining string.
548 my $pos = -length $inbytes;
550 my $byte = substr $inbytes, $pos, 1;
553 # RAW_OUT_BYTES => 2,
555 # to unicode an array would seem to be better, because the pages are dense.
556 # from unicode can be very sparse, favouring a hash.
557 # hash using the bytes (all length 1) as keys rather than ord value,
558 # as it's easier to sort these in &process.
560 # It's faster to always add $fallback even if it's undef, rather than
561 # choosing between 3 and 4 element array. (hence why we set it defined
563 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
564 # When $pos was -1 we were at the last input character.
566 $do_now->[RAW_OUT_BYTES] = $outbytes;
567 $do_now->[RAW_NEXT] = $next;
570 # Tail recursion. The intermdiate state may not have a name yet.
571 $current = $do_now->[RAW_NEXT];
575 # This is purely for optimistation. It's just &enter hard coded for $fallback
576 # of 0, using only a 3 entry array ref to save memory for every entry.
578 my ($current,$inbytes,$outbytes,$next) = @_;
581 my $pos = -length $inbytes;
583 my $byte = substr $inbytes, $pos, 1;
584 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
586 $do_now->[RAW_OUT_BYTES] = $outbytes;
587 $do_now->[RAW_NEXT] = $next;
590 $current = $do_now->[RAW_NEXT];
597 my ($fh,$name,$s) = @_;
598 my $sym = $strings{$s};
601 $saved += length($s);
606 foreach my $o (keys %strings)
608 next unless (my $i = index($o,$s)) >= 0;
610 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
611 # a hexadecimal floating point constant. Silly gcc. Only p
612 # introduces a floating point constant. Put the space in to stop it
614 $sym .= sprintf(" +0x%02x",$i) if ($i);
615 $subsave += length($s);
616 return $strings{$s} = $sym;
619 $strings{$s} = $sym = $name;
620 $strings += length($s);
621 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
622 # Maybe we should assert that these are all <256.
623 $definition .= join(',',unpack "C*",$s);
624 # We have a single long line. Split it at convenient commas.
625 $definition =~ s/(.{74,77},)/$1\n/g;
626 print $fh "$definition };\n\n";
637 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
640 foreach my $key (sort keys %$raw) {
643 # RAW_OUT_BYTES => 2,
645 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
646 # Now we are converting from raw to aggregate, switch from 1 byte strings
651 # If this == fails, we're going to reset $agg_max_in below anyway.
652 $b == ++$agg_max_in &&
653 # References in numeric context give the pointer as an int.
654 $agg_next == $next &&
655 $agg_in_len == $in_len &&
656 $agg_out_len == length $out_bytes &&
657 $agg_fallback == $fallback
658 # && length($l->[AGG_OUT_BYTES]) < 16
660 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
661 # we can aggregate this byte onto the end.
662 $l->[AGG_MAX_IN] = $b;
663 $l->[AGG_OUT_BYTES] .= $out_bytes;
667 # AGG_OUT_BYTES => 2,
672 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
673 # (only gains .6% on euc-jp -- is it worth it?)
674 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
675 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
676 $agg_fallback = $fallback];
678 if (exists $next->{Cname}) {
679 $next->{'Forward'} = 1 if $next != $a;
681 process(sprintf("%s_%02x",$name,$b),$next);
684 # encengine.c rules say that last entry must be for 255
685 if ($agg_max_in < 255) {
686 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
688 $a->{'Entries'} = \@ent;
694 my $name = $a->{'Cname'};
696 foreach my $b (@{$a->{'Entries'}})
698 next unless $b->[AGG_OUT_LEN];
699 my $s = $b->[AGG_MIN_IN];
700 my $e = $b->[AGG_MAX_IN];
701 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
705 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
708 foreach my $b (@{$a->{'Entries'}})
710 my ($s,$e,$out,$t,$end,$l) = @$b;
711 outtable($fh,$t) unless $t->{'Done'};
713 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
714 foreach my $b (@{$a->{'Entries'}})
716 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
721 printf $fh outstring($fh,'',$out);
727 print $fh ",",$t->{Cname};
728 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
735 my ($fh,$name,$a) = @_;
743 my ($fh,$name,$a) = @_;
744 die "Changed - fix me for new structure";
745 foreach my $b (sort keys %$a)
747 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
759 my $s = do "unicore/Name.pl";
760 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
762 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
766 last if $s >= 0x10000;
767 my $e = length($2) ? hex($2) : $s;
768 for (my $i = $s; $i <= $e; $i++)
771 # print sprintf("U%04X $name\n",$i);
778 my ($cmap,$a,$t,$pre) = @_;
779 # warn sprintf("Page %x\n",$pre);
781 foreach my $key (sort keys %$raw) {
784 # RAW_OUT_BYTES => 2,
786 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
790 if ($next != $a && $next != $t) {
791 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
792 } elsif (length $out_bytes) {
794 $u = $pre|($u &0x3f);
796 my $s = sprintf "<U%04X> ",$u;
797 #foreach my $c (split(//,$out_bytes)) {
798 # $s .= sprintf "\\x%02X",ord($c);
800 # 9.5% faster changing that loop to this:
801 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
802 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
805 warn join(',',$u, @{$raw->{$key}},$a,$t);
812 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
813 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
814 print $fh "<code_set_name> \"$name\"\n";
818 print $fh "<mb_cur_min> $min_el\n";
822 print $fh "<mb_cur_max> $max_el\n";
826 print $fh "<subchar> ";
827 foreach my $c (split(//,$rep))
829 printf $fh "\\x%02X",ord($c);
834 output_ucm_page(\@cmap,$h,$h,0);
835 print $fh "#\nCHARMAP\n";
836 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
840 print $fh "END CHARMAP\n";
854 eval { require File::Find };
857 push @inc, $inc unless $inc eq '.'; #skip current dir
861 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
862 $atime,$mtime,$ctime,$blksize,$blocks)
863 = lstat($_) or return;
866 $e2x_dir{$File::Find::dir} ||= $mtime;
870 warn join("\n", keys %e2x_dir), "\n";
871 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
873 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
880 eval { require Encode; };
881 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
882 # our used for variable expanstion
884 $_Version = $VERSION;
887 $_TableFiles = join(",", map {qq('$_')} @_);
888 $_Now = scalar localtime();
890 eval { require File::Spec; };
891 warn "Generating Makefile.PL\n";
892 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
893 warn "Generating $_Name.pm\n";
894 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
895 warn "Generating t/$_Name.t\n";
896 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
897 warn "Generating README\n";
898 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
899 warn "Generating t/$_Name.t\n";
900 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
909 sub make_configlocal_pm
911 eval { require Encode; };
912 $@ and die "Unable to require Encode: $@\n";
913 eval { require File::Spec; };
914 # our used for variable expanstion
915 my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
918 my $inc = File::Spec->catfile($d, "Encode");
920 opendir my $dh, $inc or die "$inc:$!";
921 warn "Checking $inc...\n";
922 for my $f (grep /\.pm$/o, readdir($dh)){
923 -f File::Spec->catfile($inc, "$f") or next;
924 $INC{"Encode/$f"} and next;
925 warn "require Encode/$f;\n";
926 eval { require "Encode/$f"; };
927 $@ and die "Can't require Encode/$f: $@\n";
928 for my $enc (Encode->encodings()){
929 $in_core{$enc} and next;
930 $Encode::Config::ExtModule{$enc} and next;
931 my $mod = "Encode/$f";
932 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
933 $LocalMod{$enc} ||= $mod;
938 for my $enc (sort keys %LocalMod){
940 qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
943 $_LocalVer = _mkversion();
945 $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
946 warn "Writing ", File::Spec->catfile($_Inc,"ConfigLocal.pm"), "\n";
947 _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
948 File::Spec->catfile($_Inc,"ConfigLocal.pm"));
953 my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
954 $yyyy += 1900, $mo +=1;
955 return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
959 eval { require File::Basename; };
960 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
961 File::Basename->import();
962 my ($src, $dst) = @_;
963 open my $in, $src or die "$src : $!";
964 if ((my $d = dirname($dst)) ne '.'){
965 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
967 open my $out, ">$dst" or die "$!";
970 if (/^#### END_OF_HEADER/){
973 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
981 enc2xs -- Perl Encode Module Generator
986 enc2xs -M ModName mapfiles...
991 F<enc2xs> builds a Perl extension for use by Encode from either
992 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
993 Besides being used internally during the build process of the Encode
994 module, you can use F<enc2xs> to add your own encoding to perl.
995 No knowledge of XS is necessary.
999 If you want to know as little about Perl as possible but need to
1000 add a new encoding, just read this chapter and forget the rest.
1006 Have a .ucm file ready. You can get it from somewhere or you can write
1007 your own from scratch or you can grab one from the Encode distribution
1008 and customize it. For the UCM format, see the next Chapter. In the
1009 example below, I'll call my theoretical encoding myascii, defined
1010 in I<my.ucm>. C<$> is a shell prompt.
1017 Issue a command as follows;
1019 $ enc2xs -M My my.ucm
1020 generating Makefile.PL
1025 Now take a look at your current directory. It should look like this.
1028 Makefile.PL My.pm my.ucm t/
1030 The following files were created.
1032 Makefile.PL - MakeMaker script
1033 My.pm - Encode submodule
1040 If you want *.ucm installed together with the modules, do as follows;
1044 $ enc2xs -M My Encode/*ucm
1050 Edit the files generated. You don't have to if you have no time AND no
1051 intention to give it to someone else. But it is a good idea to edit
1052 the pod and to add more tests.
1056 Now issue a command all Perl Mongers love:
1058 $ perl5.7.3 Makefile.PL
1059 Writing Makefile for Encode::My
1063 Now all you have to do is make.
1066 cp My.pm blib/lib/Encode/My.pm
1067 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1068 -o encode_t.c -f encode_t.fnm
1069 Reading myascii (myascii)
1070 Writing compiled form
1071 128 bytes in string tables
1072 384 bytes (25%) saved spotting duplicates
1073 1 bytes (99.2%) saved using substrings
1075 chmod 644 blib/arch/auto/Encode/My/My.bs
1078 The time it takes varies depending on how fast your machine is and
1079 how large your encoding is. Unless you are working on something big
1080 like euc-tw, it won't take too long.
1084 You can "make install" already but you should test first.
1087 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1088 -e 'use Test::Harness qw(&runtests $verbose); \
1089 $verbose=0; runtests @ARGV;' t/*.t
1091 All tests successful.
1092 Files=1, Tests=2, 0 wallclock secs
1093 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1097 If you are content with the test result, just "make install"
1101 If you want to add your encoding to Encode's demand-loading list
1102 (so you don't have to "use Encode::YourEncoding"), run
1106 to update Encode::ConfigLocal, a module that controls local settings.
1107 After that, "use Encode;" is enough to load your encodings on demand.
1111 =head1 The Unicode Character Map
1113 Encode uses the Unicode Character Map (UCM) format for source character
1114 mappings. This format is used by IBM's ICU package and was adopted
1115 by Nick Ing-Simmons for use with the Encode module. Since UCM is
1116 more flexible than Tcl's Encoding Map and far more user-friendly,
1117 this is the recommended formet for Encode now.
1119 A UCM file looks like this.
1124 <code_set_name> "US-ascii" # Required
1125 <code_set_alias> "ascii" # Optional
1126 <mb_cur_min> 1 # Required; usually 1
1127 <mb_cur_max> 1 # Max. # of bytes/char
1128 <subchar> \x3F # Substitution char
1131 <U0000> \x00 |0 # <control>
1132 <U0001> \x01 |0 # <control>
1133 <U0002> \x02 |0 # <control>
1135 <U007C> \x7C |0 # VERTICAL LINE
1136 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1137 <U007E> \x7E |0 # TILDE
1138 <U007F> \x7F |0 # <control>
1145 Anything that follows C<#> is treated as a comment.
1149 The header section continues until a line containing the word
1150 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1151 pair per line. Strings used as values must be quoted. Barewords are
1152 treated as numbers. I<\xXX> represents a byte.
1154 Most of the keywords are self-explanatory. I<subchar> means
1155 substitution character, not subcharacter. When you decode a Unicode
1156 sequence to this encoding but no matching character is found, the byte
1157 sequence defined here will be used. For most cases, the value here is
1158 \x3F; in ASCII, this is a question mark.
1162 CHARMAP starts the character map section. Each line has a form as
1165 <UXXXX> \xXX.. |0 # comment
1167 | | +- Fallback flag
1168 | +-------- Encoded byte sequence
1169 +-------------- Unicode Character ID in hex
1171 The format is roughly the same as a header section except for the
1172 fallback flag: | followed by 0..3. The meaning of the possible
1173 values is as follows:
1179 Round trip safe. A character decoded to Unicode encodes back to the
1180 same byte sequence. Most characters have this flag.
1184 Fallback for unicode -> encoding. When seen, enc2xs adds this
1185 character for the encode map only.
1189 Skip sub-char mapping should there be no code point.
1193 Fallback for encoding -> unicode. When seen, enc2xs adds this
1194 character for the decode map only.
1200 And finally, END OF CHARMAP ends the section.
1204 When you are manually creating a UCM file, you should copy ascii.ucm
1205 or an existing encoding which is close to yours, rather than write
1206 your own from scratch.
1208 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1209 is, unless your environment is EBCDIC.
1211 B<CAVEAT>: not all features in UCM are implemented. For example,
1212 icu:state is not used. Because of that, you need to write a perl
1213 module if you want to support algorithmical encodings, notably
1214 the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1215 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1217 =head2 Coping with duplicate mappings
1219 When you create a map, you SHOULD make your mappings round-trip safe.
1220 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1221 $data> stands for all characters that are marked as C<|0>. Here is
1228 Sort your map in Unicode order.
1232 When you have a duplicate entry, mark either one with '|1' or '|3'.
1236 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1240 Here is an example from big5-eten.
1245 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1249 --------------------------------------
1250 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1253 So it is round-trip safe for \xF9\xF9. But if the line above is upside
1254 down, here is what happens.
1257 --------------------------------------
1258 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1259 (\xF9\xF9 => U2550 is now overwritten!)
1261 The Encode package comes with F<ucmlint>, a crude but sufficient
1262 utility to check the integrity of a UCM file. Check under the
1263 Encode/bin directory for this.
1273 L<http://oss.software.ibm.com/icu/>
1277 ICU Character Mapping Tables
1278 L<http://oss.software.ibm.com/icu/charset/>
1283 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1295 # -Q to disable the duplicate codepoint test
1296 # -S make mapping errors fatal
1297 # -q to remove comments written to output files
1298 # -O to enable the (brute force) substring optimiser
1299 # -o <output> to specify the output file name (else it's the first arg)
1300 # -f <inlist> to give a file with a list of input files (else use the args)
1301 # -n <name> to name the encoding (else use the basename of the input file.
1303 With %seen holding array refs:
1305 865.66 real 28.80 user 8.79 sys
1306 7904 maximum resident set size
1307 1356 average shared memory size
1308 18566 average unshared data size
1309 229 average unshared stack size
1313 With %seen holding simple scalars:
1315 342.16 real 27.11 user 3.54 sys
1316 8388 maximum resident set size
1317 1394 average shared memory size
1318 14969 average unshared data size
1319 236 average unshared stack size
1323 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1324 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1325 RAM machine, but it's going to help even on modern machines.
1326 Swapping is bad, m'kay :-)