3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
10 my @orig_ARGV = @ARGV;
11 our $VERSION = do { my @r = (q$Revision: 1.29 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
13 # These may get re-ordered.
14 # RAW is a do_now as inserted by &enter
15 # AGG is an aggreagated do_now, as built up by &process
32 # (See the algorithm in encengine.c - we're building structures for it)
34 # There are two sorts of structures.
35 # "do_now" (an array, two variants of what needs storing) is whatever we need
36 # to do now we've read an input byte.
37 # It's housed in a "do_next" (which is how we got to it), and in turn points
38 # to a "do_next" which contains all the "do_now"s for the next input byte.
40 # There will be a "do_next" which is the start state.
41 # For a single byte encoding it's the only "do_next" - each "do_now" points
42 # back to it, and each "do_now" will cause bytes. There is no state.
44 # For a multi-byte encoding where all characters in the input are the same
45 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
46 # branching out from the start state, one step for each input byte.
47 # The leaf "do_now"s will all be at the same distance from the start state,
48 # only the leaf "do_now"s cause output bytes, and they in turn point back to
51 # For an encoding where there are varaible length input byte sequences, you
52 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
53 # as before the leaves will point back to the start state.
55 # The system will cope with escape encodings (imagine them as a mostly
56 # self-contained tree for each escape state, and cross links between trees
57 # at the state-switching characters) but so far no input format defines these.
59 # The system will also cope with having output "leaves" in the middle of
60 # the bifurcating branches, not just at the extremities, but again no
61 # input format does this yet.
63 # There are two variants of the "do_now" structure. The first, smaller variant
64 # is generated by &enter as the input file is read. There is one structure
65 # for each input byte. Say we are mapping a single byte encoding to a
66 # single byte encoding, with "ABCD" going "abcd". There will be
67 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
69 # &process then walks the tree, building aggregate "do_now" structres for
70 # adjacent bytes where possible. The aggregate is for a contiguous range of
71 # bytes which each produce the same length of output, each move to the
72 # same next state, and each have the same fallback flag.
73 # So our 4 RAW "do_now"s above become replaced by a single structure
75 # ["A", "D", "abcd", 1, ...]
76 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
77 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
78 # which maps very nicely into pointer arithmetic in C for encengine.c
82 # UTF-8 encode long hand - only covers part of perl's range
84 # chr() works in native space so convert value from table
85 # into that space before using chr().
86 my $ch = chr(utf8::unicode_to_native($_[0]));
87 # Now get core perl to encode that the way it likes.
95 ## my ($ch,$page) = @_; return chr($ch);
101 # encode double byte MS byte first
102 ## my ($ch,$page) = @_; return chr($page).chr($ch);
103 return chr ($_[1]) . chr $_[0];
108 # encode Multi-byte - single for 0..255 otherwise double
109 ## my ($ch,$page) = @_;
110 ## return &encode_D if $page;
112 return chr ($_[1]) . chr $_[0] if $_[1];
116 my %encode_types = (U => \&encode_U,
122 # Win32 does not expand globs on command line
123 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
127 # -Q to disable the duplicate codepoint test
128 # -S make mapping errors fatal
129 # -q to remove comments written to output files
130 # -O to enable the (brute force) substring optimiser
131 # -o <output> to specify the output file name (else it's the first arg)
132 # -f <inlist> to give a file with a list of input files (else use the args)
133 # -n <name> to name the encoding (else use the basename of the input file.
134 getopts('CM:SQqOo:f:n:',\%opt);
136 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
137 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
139 # This really should go first, else the die here causes empty (non-erroneous)
140 # output files to be written.
142 if (exists $opt{'f'}) {
143 # -F is followed by name of file containing list of filenames
144 my $flist = $opt{'f'};
145 open(FLIST,$flist) || die "Cannot open $flist:$!";
146 chomp(@encfiles = <FLIST>);
152 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
153 chmod(0666,$cname) if -f $cname && !-w $cname;
154 open(C,">$cname") || die "Cannot open $cname:$!";
159 my ($doC,$doEnc,$doUcm,$doPet);
161 if ($cname =~ /\.(c|xs)$/)
164 $dname =~ s/(\.[^\.]*)?$/.exh/;
165 chmod(0666,$dname) if -f $cname && !-w $dname;
166 open(D,">$dname") || die "Cannot open $dname:$!";
167 $hname =~ s/(\.[^\.]*)?$/.h/;
168 chmod(0666,$hname) if -f $cname && !-w $hname;
169 open(H,">$hname") || die "Cannot open $hname:$!";
171 foreach my $fh (\*C,\*D,\*H)
173 print $fh <<"END" unless $opt{'q'};
175 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
176 This file was autogenerated by:
182 if ($cname =~ /(\w+)\.xs$/)
184 print C "#include <EXTERN.h>\n";
185 print C "#include <perl.h>\n";
186 print C "#include <XSUB.h>\n";
187 print C "#define U8 U8\n";
189 print C "#include \"encode.h\"\n";
192 elsif ($cname =~ /\.enc$/)
196 elsif ($cname =~ /\.ucm$/)
200 elsif ($cname =~ /\.pet$/)
213 if ($a =~ /^.*-(\d+)/)
216 if ($b =~ /^.*-(\d+)/)
226 foreach my $enc (sort cmp_name @encfiles)
228 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
229 $name = $opt{'n'} if exists $opt{'n'};
234 compile_enc(\*E,lc($name));
238 compile_ucm(\*E,lc($name));
243 warn "Cannot open $enc for $name:$!";
249 print STDERR "Writing compiled form\n";
250 foreach my $name (sort cmp_name keys %encoding)
252 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
253 output(\*C,$name.'_utf8',$e2u);
254 output(\*C,'utf8_'.$name,$u2e);
255 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
257 foreach my $enc (sort cmp_name keys %encoding)
259 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
260 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
261 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
263 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
264 my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
265 my $sym = "${enc}_encoding";
267 print C "encode_t $sym = \n";
268 # This is to make null encoding work -- dankogai
269 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
272 # end of null tweak -- dankogai
273 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
276 foreach my $enc (sort cmp_name keys %encoding)
278 my $sym = "${enc}_encoding";
280 print H "extern encode_t $sym;\n";
281 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
284 if ($cname =~ /(\w+)\.xs$/)
290 Encode_XSEncoding(pTHX_ encode_t *enc)
293 HV *stash = gv_stashpv("Encode::XS", TRUE);
294 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
300 const char *name = enc->name[i++];
301 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
304 call_pv("Encode::define_encoding",G_DISCARD);
310 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
311 print C "BOOT:\n{\n";
312 print C "#include \"$dname\"\n";
315 # Close in void context is bad, m'kay
316 close(D) or warn "Error closing '$dname': $!";
317 close(H) or warn "Error closing '$hname': $!";
319 my $perc_saved = $strings/($strings + $saved) * 100;
320 my $perc_subsaved = $strings/($strings + $subsave) * 100;
321 printf STDERR "%d bytes in string tables\n",$strings;
322 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
323 $saved, $perc_saved if $saved;
324 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
325 $subsave, $perc_subsaved if $subsave;
329 foreach my $name (sort cmp_name keys %encoding)
331 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
332 output_enc(\*C,$name,$e2u);
337 foreach my $name (sort cmp_name keys %encoding)
339 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
340 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
344 # writing half meg files and then not checking to see if you just filled the
346 close(C) or die "Error closing '$cname': $!";
348 # End of the main program.
360 last if /^\s*CHARMAP\s*$/i;
361 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
366 if (!defined($cs = $attr{'code_set_name'}))
368 warn "No <code_set_name> in $name\n";
372 $name = $cs unless exists $opt{'n'};
378 if (exists $attr{'subchar'})
381 #$attr{'subchar'} =~ /^\s*/cg;
382 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
383 #$erep = join('',map(chr(hex($_)),@byte));
384 $erep = $attr{'subchar'};
385 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
387 print "Reading $name ($cs)\n";
393 last if /^\s*END\s+CHARMAP\s*$/i;
395 my (@uni, @byte) = ();
396 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
397 or die "Bad line: $_";
398 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
399 push @uni, map { substr($_, 1) } split(/\+/, $1);
401 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
406 my $uch = join('', map { encode_U(hex($_)) } @uni );
407 my $ech = join('',map(chr(hex($_)),@byte));
408 my $el = length($ech);
409 $max_el = $el if (!defined($max_el) || $el > $max_el);
410 $min_el = $el if (!defined($min_el) || $el < $min_el);
421 # $fb is fallback flag
422 # 0 - round trip safe
423 # 1 - fallback for unicode -> enc
424 # 2 - skip sub-char mapping
425 # 3 - fallback enc -> unicode
426 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
427 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
436 die "$nfb entries without fallback, $hfb entries with\n";
438 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
450 while ($type = <$fh>)
452 last if $type !~ /^\s*#/;
455 return if $type eq 'E';
456 # Do the hash lookup once, rather than once per function call. 4% speedup.
457 my $type_func = $encode_types{$type};
458 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
459 warn "$type encoded $name\n";
461 # Save a defined test by setting these to defined values.
462 my $min_el = ~0; # A very big integer
463 my $max_el = 0; # Anything must be longer than 0
466 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
470 # use -Q to silence the seen test. Makefile.PL uses this by default.
471 $seen = {} unless $opt{Q};
476 my $page = hex($line);
481 # So why is it 1% faster to leave the my here?
483 $line =~ s/\r\n$/\n/;
484 die "$.:${line}Line should be exactly 65 characters long including
485 newline (".length($line).")" unless length ($line) == 65;
486 # Split line into groups of 4 hex digits, convert groups to ints
488 # map {hex $_} $line =~ /(....)/g
489 # This takes 63.75 (2.5% less time)
490 # unpack "n*", pack "H*", $line
491 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
492 # Doing it as while ($line =~ /(....)/g) took 74.63
493 foreach my $val (unpack "n*", pack "H*", $line)
495 next if $val == 0xFFFD;
496 my $ech = &$type_func($ch,$page);
497 if ($val || (!$ch && !$page))
499 my $el = length($ech);
500 $max_el = $el if $el > $max_el;
501 $min_el = $el if $el < $min_el;
502 my $uch = encode_U($val);
504 # We're doing the test.
505 # We don't need to read this quickly, so storing it as a scalar,
506 # rather than 3 (anon array, plus the 2 scalars it holds) saves
507 # RAM and may make us faster on low RAM systems. [see __END__]
508 if (exists $seen->{$uch})
510 warn sprintf("U%04X is %02X%02X and %04X\n",
511 $val,$page,$ch,$seen->{$uch});
516 $seen->{$uch} = $page << 8 | $ch;
519 # Passing 2 extra args each time is 3.6% slower!
520 # Even with having to add $fallback ||= 0 later
521 enter_fb0($e2u,$ech,$uch);
522 enter_fb0($u2e,$uch,$ech);
526 # No character at this position
527 # enter($e2u,$ech,undef,$e2u);
533 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
534 if $min_el > $max_el;
535 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
536 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
539 # my ($a,$s,$d,$t,$fb) = @_;
541 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
542 # state we shift to after this (multibyte) input character defaults to same
545 # Making sure it is defined seems to be faster than {no warnings;} in
546 # &process, or passing it in as 0 explicity.
547 # XXX $fallback ||= 0;
549 # Start at the beginning and work forwards through the string to zero.
550 # effectively we are removing 1 character from the front each time
551 # but we don't actually edit the string. [this alone seems to be 14% speedup]
552 # Hence -$pos is the length of the remaining string.
553 my $pos = -length $inbytes;
555 my $byte = substr $inbytes, $pos, 1;
558 # RAW_OUT_BYTES => 2,
560 # to unicode an array would seem to be better, because the pages are dense.
561 # from unicode can be very sparse, favouring a hash.
562 # hash using the bytes (all length 1) as keys rather than ord value,
563 # as it's easier to sort these in &process.
565 # It's faster to always add $fallback even if it's undef, rather than
566 # choosing between 3 and 4 element array. (hence why we set it defined
568 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
569 # When $pos was -1 we were at the last input character.
571 $do_now->[RAW_OUT_BYTES] = $outbytes;
572 $do_now->[RAW_NEXT] = $next;
575 # Tail recursion. The intermdiate state may not have a name yet.
576 $current = $do_now->[RAW_NEXT];
580 # This is purely for optimistation. It's just &enter hard coded for $fallback
581 # of 0, using only a 3 entry array ref to save memory for every entry.
583 my ($current,$inbytes,$outbytes,$next) = @_;
586 my $pos = -length $inbytes;
588 my $byte = substr $inbytes, $pos, 1;
589 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
591 $do_now->[RAW_OUT_BYTES] = $outbytes;
592 $do_now->[RAW_NEXT] = $next;
595 $current = $do_now->[RAW_NEXT];
602 my ($fh,$name,$s) = @_;
603 my $sym = $strings{$s};
606 $saved += length($s);
611 foreach my $o (keys %strings)
613 next unless (my $i = index($o,$s)) >= 0;
615 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
616 # a hexadecimal floating point constant. Silly gcc. Only p
617 # introduces a floating point constant. Put the space in to stop it
619 $sym .= sprintf(" +0x%02x",$i) if ($i);
620 $subsave += length($s);
621 return $strings{$s} = $sym;
624 $strings{$s} = $sym = $name;
625 $strings += length($s);
626 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
627 # Maybe we should assert that these are all <256.
628 $definition .= join(',',unpack "C*",$s);
629 # We have a single long line. Split it at convenient commas.
630 $definition =~ s/(.{74,77},)/$1\n/g;
631 print $fh "$definition };\n\n";
642 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
645 foreach my $key (sort keys %$raw) {
648 # RAW_OUT_BYTES => 2,
650 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
651 # Now we are converting from raw to aggregate, switch from 1 byte strings
656 # If this == fails, we're going to reset $agg_max_in below anyway.
657 $b == ++$agg_max_in &&
658 # References in numeric context give the pointer as an int.
659 $agg_next == $next &&
660 $agg_in_len == $in_len &&
661 $agg_out_len == length $out_bytes &&
662 $agg_fallback == $fallback
663 # && length($l->[AGG_OUT_BYTES]) < 16
665 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
666 # we can aggregate this byte onto the end.
667 $l->[AGG_MAX_IN] = $b;
668 $l->[AGG_OUT_BYTES] .= $out_bytes;
672 # AGG_OUT_BYTES => 2,
677 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
678 # (only gains .6% on euc-jp -- is it worth it?)
679 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
680 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
681 $agg_fallback = $fallback];
683 if (exists $next->{Cname}) {
684 $next->{'Forward'} = 1 if $next != $a;
686 process(sprintf("%s_%02x",$name,$b),$next);
689 # encengine.c rules say that last entry must be for 255
690 if ($agg_max_in < 255) {
691 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
693 $a->{'Entries'} = \@ent;
699 my $name = $a->{'Cname'};
701 foreach my $b (@{$a->{'Entries'}})
703 next unless $b->[AGG_OUT_LEN];
704 my $s = $b->[AGG_MIN_IN];
705 my $e = $b->[AGG_MAX_IN];
706 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
710 my $var = $^O eq 'MacOS' ? 'extern' : 'static';
711 print $fh "\n$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
714 foreach my $b (@{$a->{'Entries'}})
716 my ($s,$e,$out,$t,$end,$l) = @$b;
717 outtable($fh,$t) unless $t->{'Done'};
719 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
720 foreach my $b (@{$a->{'Entries'}})
722 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
723 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
727 printf $fh outstring($fh,'',$out);
733 print $fh ",",$t->{Cname};
734 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
741 my ($fh,$name,$a) = @_;
749 my ($fh,$name,$a) = @_;
750 die "Changed - fix me for new structure";
751 foreach my $b (sort keys %$a)
753 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
765 my $s = do "unicore/Name.pl";
766 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
768 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
772 last if $s >= 0x10000;
773 my $e = length($2) ? hex($2) : $s;
774 for (my $i = $s; $i <= $e; $i++)
777 # print sprintf("U%04X $name\n",$i);
784 my ($cmap,$a,$t,$pre) = @_;
785 # warn sprintf("Page %x\n",$pre);
787 foreach my $key (sort keys %$raw) {
790 # RAW_OUT_BYTES => 2,
792 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
796 if ($next != $a && $next != $t) {
797 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
798 } elsif (length $out_bytes) {
800 $u = $pre|($u &0x3f);
802 my $s = sprintf "<U%04X> ",$u;
803 #foreach my $c (split(//,$out_bytes)) {
804 # $s .= sprintf "\\x%02X",ord($c);
806 # 9.5% faster changing that loop to this:
807 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
808 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
811 warn join(',',$u, @{$raw->{$key}},$a,$t);
818 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
819 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
820 print $fh "<code_set_name> \"$name\"\n";
824 print $fh "<mb_cur_min> $min_el\n";
828 print $fh "<mb_cur_max> $max_el\n";
832 print $fh "<subchar> ";
833 foreach my $c (split(//,$rep))
835 printf $fh "\\x%02X",ord($c);
840 output_ucm_page(\@cmap,$h,$h,0);
841 print $fh "#\nCHARMAP\n";
842 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
846 print $fh "END CHARMAP\n";
860 eval { require File::Find };
863 push @inc, $inc unless $inc eq '.'; #skip current dir
867 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
868 $atime,$mtime,$ctime,$blksize,$blocks)
869 = lstat($_) or return;
872 $e2x_dir{$File::Find::dir} ||= $mtime;
876 warn join("\n", keys %e2x_dir), "\n";
877 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
879 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
886 eval { require Encode; };
887 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
888 # our used for variable expanstion
890 $_Version = $VERSION;
893 $_TableFiles = join(",", map {qq('$_')} @_);
894 $_Now = scalar localtime();
896 eval { require File::Spec; };
897 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
898 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
899 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
900 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
901 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
910 sub make_configlocal_pm
912 eval { require Encode; };
913 $@ and die "Unable to require Encode: $@\n";
914 eval { require File::Spec; };
915 # our used for variable expanstion
916 my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
919 my $inc = File::Spec->catfile($d, "Encode");
921 opendir my $dh, $inc or die "$inc:$!";
922 warn "Checking $inc...\n";
923 for my $f (grep /\.pm$/o, readdir($dh)){
924 -f File::Spec->catfile($inc, "$f") or next;
925 $INC{"Encode/$f"} and next;
926 warn "require Encode/$f;\n";
927 eval { require "Encode/$f"; };
928 $@ and die "Can't require Encode/$f: $@\n";
929 for my $enc (Encode->encodings()){
930 $in_core{$enc} and next;
931 $Encode::Config::ExtModule{$enc} and next;
932 my $mod = "Encode/$f";
933 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
934 $LocalMod{$enc} ||= $mod;
939 for my $enc (sort keys %LocalMod){
941 qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
944 $_LocalVer = _mkversion();
946 $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
947 _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
948 File::Spec->catfile($_Inc,"ConfigLocal.pm"),
954 my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
955 $yyyy += 1900, $mo +=1;
956 return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
960 eval { require File::Basename; };
961 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
962 File::Basename->import();
963 my ($src, $dst, $clobber) = @_;
964 if (!$clobber and -e $dst){
965 warn "$dst exists. skipping\n";
968 warn "Generating $dst...\n";
969 open my $in, $src or die "$src : $!";
970 if ((my $d = dirname($dst)) ne '.'){
971 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
973 open my $out, ">$dst" or die "$!";
976 if (/^#### END_OF_HEADER/){
979 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
987 enc2xs -- Perl Encode Module Generator
992 enc2xs -M ModName mapfiles...
997 F<enc2xs> builds a Perl extension for use by Encode from either
998 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
999 Besides being used internally during the build process of the Encode
1000 module, you can use F<enc2xs> to add your own encoding to perl.
1001 No knowledge of XS is necessary.
1005 If you want to know as little about Perl as possible but need to
1006 add a new encoding, just read this chapter and forget the rest.
1012 Have a .ucm file ready. You can get it from somewhere or you can write
1013 your own from scratch or you can grab one from the Encode distribution
1014 and customize it. For the UCM format, see the next Chapter. In the
1015 example below, I'll call my theoretical encoding myascii, defined
1016 in I<my.ucm>. C<$> is a shell prompt.
1023 Issue a command as follows;
1025 $ enc2xs -M My my.ucm
1026 generating Makefile.PL
1031 Now take a look at your current directory. It should look like this.
1034 Makefile.PL My.pm my.ucm t/
1036 The following files were created.
1038 Makefile.PL - MakeMaker script
1039 My.pm - Encode submodule
1046 If you want *.ucm installed together with the modules, do as follows;
1050 $ enc2xs -M My Encode/*ucm
1056 Edit the files generated. You don't have to if you have no time AND no
1057 intention to give it to someone else. But it is a good idea to edit
1058 the pod and to add more tests.
1062 Now issue a command all Perl Mongers love:
1065 Writing Makefile for Encode::My
1069 Now all you have to do is make.
1072 cp My.pm blib/lib/Encode/My.pm
1073 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1074 -o encode_t.c -f encode_t.fnm
1075 Reading myascii (myascii)
1076 Writing compiled form
1077 128 bytes in string tables
1078 384 bytes (25%) saved spotting duplicates
1079 1 bytes (99.2%) saved using substrings
1081 chmod 644 blib/arch/auto/Encode/My/My.bs
1084 The time it takes varies depending on how fast your machine is and
1085 how large your encoding is. Unless you are working on something big
1086 like euc-tw, it won't take too long.
1090 You can "make install" already but you should test first.
1093 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1094 -e 'use Test::Harness qw(&runtests $verbose); \
1095 $verbose=0; runtests @ARGV;' t/*.t
1097 All tests successful.
1098 Files=1, Tests=2, 0 wallclock secs
1099 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1103 If you are content with the test result, just "make install"
1107 If you want to add your encoding to Encode's demand-loading list
1108 (so you don't have to "use Encode::YourEncoding"), run
1112 to update Encode::ConfigLocal, a module that controls local settings.
1113 After that, "use Encode;" is enough to load your encodings on demand.
1117 =head1 The Unicode Character Map
1119 Encode uses the Unicode Character Map (UCM) format for source character
1120 mappings. This format is used by IBM's ICU package and was adopted
1121 by Nick Ing-Simmons for use with the Encode module. Since UCM is
1122 more flexible than Tcl's Encoding Map and far more user-friendly,
1123 this is the recommended formet for Encode now.
1125 A UCM file looks like this.
1130 <code_set_name> "US-ascii" # Required
1131 <code_set_alias> "ascii" # Optional
1132 <mb_cur_min> 1 # Required; usually 1
1133 <mb_cur_max> 1 # Max. # of bytes/char
1134 <subchar> \x3F # Substitution char
1137 <U0000> \x00 |0 # <control>
1138 <U0001> \x01 |0 # <control>
1139 <U0002> \x02 |0 # <control>
1141 <U007C> \x7C |0 # VERTICAL LINE
1142 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1143 <U007E> \x7E |0 # TILDE
1144 <U007F> \x7F |0 # <control>
1151 Anything that follows C<#> is treated as a comment.
1155 The header section continues until a line containing the word
1156 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1157 pair per line. Strings used as values must be quoted. Barewords are
1158 treated as numbers. I<\xXX> represents a byte.
1160 Most of the keywords are self-explanatory. I<subchar> means
1161 substitution character, not subcharacter. When you decode a Unicode
1162 sequence to this encoding but no matching character is found, the byte
1163 sequence defined here will be used. For most cases, the value here is
1164 \x3F; in ASCII, this is a question mark.
1168 CHARMAP starts the character map section. Each line has a form as
1171 <UXXXX> \xXX.. |0 # comment
1173 | | +- Fallback flag
1174 | +-------- Encoded byte sequence
1175 +-------------- Unicode Character ID in hex
1177 The format is roughly the same as a header section except for the
1178 fallback flag: | followed by 0..3. The meaning of the possible
1179 values is as follows:
1185 Round trip safe. A character decoded to Unicode encodes back to the
1186 same byte sequence. Most characters have this flag.
1190 Fallback for unicode -> encoding. When seen, enc2xs adds this
1191 character for the encode map only.
1195 Skip sub-char mapping should there be no code point.
1199 Fallback for encoding -> unicode. When seen, enc2xs adds this
1200 character for the decode map only.
1206 And finally, END OF CHARMAP ends the section.
1210 When you are manually creating a UCM file, you should copy ascii.ucm
1211 or an existing encoding which is close to yours, rather than write
1212 your own from scratch.
1214 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1215 is, unless your environment is EBCDIC.
1217 B<CAVEAT>: not all features in UCM are implemented. For example,
1218 icu:state is not used. Because of that, you need to write a perl
1219 module if you want to support algorithmical encodings, notably
1220 the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1221 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1223 =head2 Coping with duplicate mappings
1225 When you create a map, you SHOULD make your mappings round-trip safe.
1226 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1227 $data> stands for all characters that are marked as C<|0>. Here is
1234 Sort your map in Unicode order.
1238 When you have a duplicate entry, mark either one with '|1' or '|3'.
1242 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1246 Here is an example from big5-eten.
1251 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1255 --------------------------------------
1256 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1259 So it is round-trip safe for \xF9\xF9. But if the line above is upside
1260 down, here is what happens.
1263 --------------------------------------
1264 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1265 (\xF9\xF9 => U2550 is now overwritten!)
1267 The Encode package comes with F<ucmlint>, a crude but sufficient
1268 utility to check the integrity of a UCM file. Check under the
1269 Encode/bin directory for this.
1279 L<http://oss.software.ibm.com/icu/>
1283 ICU Character Mapping Tables
1284 L<http://oss.software.ibm.com/icu/charset/>
1289 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1301 # -Q to disable the duplicate codepoint test
1302 # -S make mapping errors fatal
1303 # -q to remove comments written to output files
1304 # -O to enable the (brute force) substring optimiser
1305 # -o <output> to specify the output file name (else it's the first arg)
1306 # -f <inlist> to give a file with a list of input files (else use the args)
1307 # -n <name> to name the encoding (else use the basename of the input file.
1309 With %seen holding array refs:
1311 865.66 real 28.80 user 8.79 sys
1312 7904 maximum resident set size
1313 1356 average shared memory size
1314 18566 average unshared data size
1315 229 average unshared stack size
1319 With %seen holding simple scalars:
1321 342.16 real 27.11 user 3.54 sys
1322 8388 maximum resident set size
1323 1394 average shared memory size
1324 14969 average unshared data size
1325 236 average unshared stack size
1329 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1330 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1331 RAM machine, but it's going to help even on modern machines.
1332 Swapping is bad, m'kay :-)