3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
12 my @orig_ARGV = @ARGV;
13 our $VERSION = do { my @r = (q$Revision: 2.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
15 # These may get re-ordered.
16 # RAW is a do_now as inserted by &enter
17 # AGG is an aggreagated do_now, as built up by &process
34 # (See the algorithm in encengine.c - we're building structures for it)
36 # There are two sorts of structures.
37 # "do_now" (an array, two variants of what needs storing) is whatever we need
38 # to do now we've read an input byte.
39 # It's housed in a "do_next" (which is how we got to it), and in turn points
40 # to a "do_next" which contains all the "do_now"s for the next input byte.
42 # There will be a "do_next" which is the start state.
43 # For a single byte encoding it's the only "do_next" - each "do_now" points
44 # back to it, and each "do_now" will cause bytes. There is no state.
46 # For a multi-byte encoding where all characters in the input are the same
47 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
48 # branching out from the start state, one step for each input byte.
49 # The leaf "do_now"s will all be at the same distance from the start state,
50 # only the leaf "do_now"s cause output bytes, and they in turn point back to
53 # For an encoding where there are varaible length input byte sequences, you
54 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
55 # as before the leaves will point back to the start state.
57 # The system will cope with escape encodings (imagine them as a mostly
58 # self-contained tree for each escape state, and cross links between trees
59 # at the state-switching characters) but so far no input format defines these.
61 # The system will also cope with having output "leaves" in the middle of
62 # the bifurcating branches, not just at the extremities, but again no
63 # input format does this yet.
65 # There are two variants of the "do_now" structure. The first, smaller variant
66 # is generated by &enter as the input file is read. There is one structure
67 # for each input byte. Say we are mapping a single byte encoding to a
68 # single byte encoding, with "ABCD" going "abcd". There will be
69 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
71 # &process then walks the tree, building aggregate "do_now" structres for
72 # adjacent bytes where possible. The aggregate is for a contiguous range of
73 # bytes which each produce the same length of output, each move to the
74 # same next state, and each have the same fallback flag.
75 # So our 4 RAW "do_now"s above become replaced by a single structure
77 # ["A", "D", "abcd", 1, ...]
78 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
79 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
80 # which maps very nicely into pointer arithmetic in C for encengine.c
84 # UTF-8 encode long hand - only covers part of perl's range
86 # chr() works in native space so convert value from table
87 # into that space before using chr().
88 my $ch = chr(utf8::unicode_to_native($_[0]));
89 # Now get core perl to encode that the way it likes.
97 ## my ($ch,$page) = @_; return chr($ch);
103 # encode double byte MS byte first
104 ## my ($ch,$page) = @_; return chr($page).chr($ch);
105 return chr ($_[1]) . chr $_[0];
110 # encode Multi-byte - single for 0..255 otherwise double
111 ## my ($ch,$page) = @_;
112 ## return &encode_D if $page;
114 return chr ($_[1]) . chr $_[0] if $_[1];
118 my %encode_types = (U => \&encode_U,
124 # Win32 does not expand globs on command line
125 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
129 # -Q to disable the duplicate codepoint test
130 # -S make mapping errors fatal
131 # -q to remove comments written to output files
132 # -O to enable the (brute force) substring optimiser
133 # -o <output> to specify the output file name (else it's the first arg)
134 # -f <inlist> to give a file with a list of input files (else use the args)
135 # -n <name> to name the encoding (else use the basename of the input file.
136 getopts('CM:SQqOo:f:n:',\%opt);
138 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
139 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
141 # This really should go first, else the die here causes empty (non-erroneous)
142 # output files to be written.
144 if (exists $opt{'f'}) {
145 # -F is followed by name of file containing list of filenames
146 my $flist = $opt{'f'};
147 open(FLIST,$flist) || die "Cannot open $flist:$!";
148 chomp(@encfiles = <FLIST>);
154 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
155 chmod(0666,$cname) if -f $cname && !-w $cname;
156 open(C,">$cname") || die "Cannot open $cname:$!";
161 my ($doC,$doEnc,$doUcm,$doPet);
163 if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
166 $dname =~ s/(\.[^\.]*)?$/.exh/;
167 chmod(0666,$dname) if -f $cname && !-w $dname;
168 open(D,">$dname") || die "Cannot open $dname:$!";
169 $hname =~ s/(\.[^\.]*)?$/.h/;
170 chmod(0666,$hname) if -f $cname && !-w $hname;
171 open(H,">$hname") || die "Cannot open $hname:$!";
173 foreach my $fh (\*C,\*D,\*H)
175 print $fh <<"END" unless $opt{'q'};
177 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
178 This file was autogenerated by:
180 enc2xs VERSION $VERSION
185 if ($cname =~ /(\w+)\.xs$/)
187 print C "#include <EXTERN.h>\n";
188 print C "#include <perl.h>\n";
189 print C "#include <XSUB.h>\n";
190 print C "#define U8 U8\n";
192 print C "#include \"encode.h\"\n\n";
195 elsif ($cname =~ /\.enc$/)
199 elsif ($cname =~ /\.ucm$/)
203 elsif ($cname =~ /\.pet$/)
219 if ($a =~ /^.*-(\d+)/)
222 if ($b =~ /^.*-(\d+)/)
232 foreach my $enc (sort cmp_name @encfiles)
234 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
235 $name = $opt{'n'} if exists $opt{'n'};
240 compile_enc(\*E,lc($name));
244 compile_ucm(\*E,lc($name));
249 warn "Cannot open $enc for $name:$!";
255 print STDERR "Writing compiled form\n";
256 foreach my $name (sort cmp_name keys %encoding)
258 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
259 process($name.'_utf8',$e2u);
260 addstrings(\*C,$e2u);
262 process('utf8_'.$name,$u2e);
263 addstrings(\*C,$u2e);
265 outbigstring(\*C,"enctable");
266 foreach my $name (sort cmp_name keys %encoding)
268 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
269 outtable(\*C,$e2u, "enctable");
270 outtable(\*C,$u2e, "enctable");
272 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
274 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
275 my $exta = $cpp ? 'extern "C" ' : "static";
276 my $extb = $cpp ? 'extern "C" ' : "";
277 foreach my $enc (sort cmp_name keys %encoding)
279 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
280 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
281 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
283 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
284 my $sym = "${enc}_encoding";
286 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
288 print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
289 print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
290 print C "${extb} const encode_t $sym = \n";
291 # This is to make null encoding work -- dankogai
292 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
295 # end of null tweak -- dankogai
296 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
299 foreach my $enc (sort cmp_name keys %encoding)
301 my $sym = "${enc}_encoding";
303 print H "extern encode_t $sym;\n";
304 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
307 if ($cname =~ /(\w+)\.xs$/)
313 Encode_XSEncoding(pTHX_ encode_t *enc)
316 HV *stash = gv_stashpv("Encode::XS", TRUE);
317 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
323 const char *name = enc->name[i++];
324 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
327 call_pv("Encode::define_encoding",G_DISCARD);
333 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
334 print C "BOOT:\n{\n";
335 print C "#include \"$dname\"\n";
338 # Close in void context is bad, m'kay
339 close(D) or warn "Error closing '$dname': $!";
340 close(H) or warn "Error closing '$hname': $!";
342 my $perc_saved = $saved/($strings + $saved) * 100;
343 my $perc_subsaved = $subsave/($strings + $subsave) * 100;
344 printf STDERR "%d bytes in string tables\n",$strings;
345 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
346 $saved, $perc_saved if $saved;
347 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
348 $subsave, $perc_subsaved if $subsave;
352 foreach my $name (sort cmp_name keys %encoding)
354 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
355 output_enc(\*C,$name,$e2u);
360 foreach my $name (sort cmp_name keys %encoding)
362 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
363 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
367 # writing half meg files and then not checking to see if you just filled the
369 close(C) or die "Error closing '$cname': $!";
371 # End of the main program.
383 last if /^\s*CHARMAP\s*$/i;
384 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
389 if (!defined($cs = $attr{'code_set_name'}))
391 warn "No <code_set_name> in $name\n";
395 $name = $cs unless exists $opt{'n'};
401 if (exists $attr{'subchar'})
404 #$attr{'subchar'} =~ /^\s*/cg;
405 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
406 #$erep = join('',map(chr(hex($_)),@byte));
407 $erep = $attr{'subchar'};
408 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
410 print "Reading $name ($cs)\n";
416 last if /^\s*END\s+CHARMAP\s*$/i;
418 my (@uni, @byte) = ();
419 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
420 or die "Bad line: $_";
421 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
422 push @uni, map { substr($_, 1) } split(/\+/, $1);
424 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
429 my $uch = join('', map { encode_U(hex($_)) } @uni );
430 my $ech = join('',map(chr(hex($_)),@byte));
431 my $el = length($ech);
432 $max_el = $el if (!defined($max_el) || $el > $max_el);
433 $min_el = $el if (!defined($min_el) || $el < $min_el);
444 # $fb is fallback flag
445 # 0 - round trip safe
446 # 1 - fallback for unicode -> enc
447 # 2 - skip sub-char mapping
448 # 3 - fallback enc -> unicode
449 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
450 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
459 die "$nfb entries without fallback, $hfb entries with\n";
461 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
473 while ($type = <$fh>)
475 last if $type !~ /^\s*#/;
478 return if $type eq 'E';
479 # Do the hash lookup once, rather than once per function call. 4% speedup.
480 my $type_func = $encode_types{$type};
481 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
482 warn "$type encoded $name\n";
484 # Save a defined test by setting these to defined values.
485 my $min_el = ~0; # A very big integer
486 my $max_el = 0; # Anything must be longer than 0
489 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
493 # use -Q to silence the seen test. Makefile.PL uses this by default.
494 $seen = {} unless $opt{Q};
499 my $page = hex($line);
504 # So why is it 1% faster to leave the my here?
506 $line =~ s/\r\n$/\n/;
507 die "$.:${line}Line should be exactly 65 characters long including
508 newline (".length($line).")" unless length ($line) == 65;
509 # Split line into groups of 4 hex digits, convert groups to ints
511 # map {hex $_} $line =~ /(....)/g
512 # This takes 63.75 (2.5% less time)
513 # unpack "n*", pack "H*", $line
514 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
515 # Doing it as while ($line =~ /(....)/g) took 74.63
516 foreach my $val (unpack "n*", pack "H*", $line)
518 next if $val == 0xFFFD;
519 my $ech = &$type_func($ch,$page);
520 if ($val || (!$ch && !$page))
522 my $el = length($ech);
523 $max_el = $el if $el > $max_el;
524 $min_el = $el if $el < $min_el;
525 my $uch = encode_U($val);
527 # We're doing the test.
528 # We don't need to read this quickly, so storing it as a scalar,
529 # rather than 3 (anon array, plus the 2 scalars it holds) saves
530 # RAM and may make us faster on low RAM systems. [see __END__]
531 if (exists $seen->{$uch})
533 warn sprintf("U%04X is %02X%02X and %04X\n",
534 $val,$page,$ch,$seen->{$uch});
539 $seen->{$uch} = $page << 8 | $ch;
542 # Passing 2 extra args each time is 3.6% slower!
543 # Even with having to add $fallback ||= 0 later
544 enter_fb0($e2u,$ech,$uch);
545 enter_fb0($u2e,$uch,$ech);
549 # No character at this position
550 # enter($e2u,$ech,undef,$e2u);
556 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
557 if $min_el > $max_el;
558 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
559 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
562 # my ($a,$s,$d,$t,$fb) = @_;
564 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
565 # state we shift to after this (multibyte) input character defaults to same
568 # Making sure it is defined seems to be faster than {no warnings;} in
569 # &process, or passing it in as 0 explicity.
570 # XXX $fallback ||= 0;
572 # Start at the beginning and work forwards through the string to zero.
573 # effectively we are removing 1 character from the front each time
574 # but we don't actually edit the string. [this alone seems to be 14% speedup]
575 # Hence -$pos is the length of the remaining string.
576 my $pos = -length $inbytes;
578 my $byte = substr $inbytes, $pos, 1;
581 # RAW_OUT_BYTES => 2,
583 # to unicode an array would seem to be better, because the pages are dense.
584 # from unicode can be very sparse, favouring a hash.
585 # hash using the bytes (all length 1) as keys rather than ord value,
586 # as it's easier to sort these in &process.
588 # It's faster to always add $fallback even if it's undef, rather than
589 # choosing between 3 and 4 element array. (hence why we set it defined
591 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
592 # When $pos was -1 we were at the last input character.
594 $do_now->[RAW_OUT_BYTES] = $outbytes;
595 $do_now->[RAW_NEXT] = $next;
598 # Tail recursion. The intermdiate state may not have a name yet.
599 $current = $do_now->[RAW_NEXT];
603 # This is purely for optimistation. It's just &enter hard coded for $fallback
604 # of 0, using only a 3 entry array ref to save memory for every entry.
606 my ($current,$inbytes,$outbytes,$next) = @_;
609 my $pos = -length $inbytes;
611 my $byte = substr $inbytes, $pos, 1;
612 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
614 $do_now->[RAW_OUT_BYTES] = $outbytes;
615 $do_now->[RAW_NEXT] = $next;
618 $current = $do_now->[RAW_NEXT];
628 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
631 foreach my $key (sort keys %$raw) {
634 # RAW_OUT_BYTES => 2,
636 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
637 # Now we are converting from raw to aggregate, switch from 1 byte strings
642 # If this == fails, we're going to reset $agg_max_in below anyway.
643 $b == ++$agg_max_in &&
644 # References in numeric context give the pointer as an int.
645 $agg_next == $next &&
646 $agg_in_len == $in_len &&
647 $agg_out_len == length $out_bytes &&
648 $agg_fallback == $fallback
649 # && length($l->[AGG_OUT_BYTES]) < 16
651 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
652 # we can aggregate this byte onto the end.
653 $l->[AGG_MAX_IN] = $b;
654 $l->[AGG_OUT_BYTES] .= $out_bytes;
658 # AGG_OUT_BYTES => 2,
663 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
664 # (only gains .6% on euc-jp -- is it worth it?)
665 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
666 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
667 $agg_fallback = $fallback];
669 if (exists $next->{Cname}) {
670 $next->{'Forward'} = 1 if $next != $a;
672 process(sprintf("%s_%02x",$name,$b),$next);
675 # encengine.c rules say that last entry must be for 255
676 if ($agg_max_in < 255) {
677 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
679 $a->{'Entries'} = \@ent;
686 my $name = $a->{'Cname'};
688 foreach my $b (@{$a->{'Entries'}})
690 next unless $b->[AGG_OUT_LEN];
691 $strings{$b->[AGG_OUT_BYTES]} = undef;
695 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
696 my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
697 my $const = $cpp ? '' : 'const';
698 print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
700 $a->{'DoneStrings'} = 1;
701 foreach my $b (@{$a->{'Entries'}})
703 my ($s,$e,$out,$t,$end,$l) = @$b;
704 addstrings($fh,$t) unless $t->{'DoneStrings'};
714 # Make the big string in the string accumulator. Longest first, on the hope
715 # that this makes it more likely that we find the short strings later on.
716 # Not sure if it helps sorting strings of the same length lexcically.
717 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
718 my $index = index $string_acc, $s;
720 $saved += length($s);
721 $strings_in_acc{$s} = $index;
725 my $sublength = length $s;
726 while (--$sublength > 0) {
727 # progressively lop characters off the end, to see if the start of
728 # the new string overlaps the end of the accumulator.
729 if (substr ($string_acc, -$sublength)
730 eq substr ($s, 0, $sublength)) {
731 $subsave += $sublength;
732 $strings_in_acc{$s} = length ($string_acc) - $sublength;
733 # append the last bit on the end.
734 $string_acc .= substr ($s, $sublength);
737 # or if the end of the new string overlaps the start of the
739 next unless substr ($string_acc, 0, $sublength)
740 eq substr ($s, -$sublength);
741 # well, the last $sublength characters of the accumulator match.
742 # so as we're prepending to the accumulator, need to shift all our
743 # existing offsets forwards
744 $_ += $sublength foreach values %strings_in_acc;
745 $subsave += $sublength;
746 $strings_in_acc{$s} = 0;
747 # append the first bit on the start.
748 $string_acc = substr ($s, 0, -$sublength) . $string_acc;
752 # Optimiser (if it ran) found nothing, so just going have to tack the
753 # whole thing on the end.
754 $strings_in_acc{$s} = length $string_acc;
760 $strings = length $string_acc;
761 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
762 my $var = $cpp ? '' : 'static';
763 my $definition = "\n$var const U8 $name\[$strings] = { " .
764 join(',',unpack "C*",$string_acc);
765 # We have a single long line. Split it at convenient commas.
766 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
767 print $fh substr ($definition, pos $definition), " };\n";
772 my $offset = $strings_in_acc{$s};
773 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
774 unless defined $offset;
780 my ($fh,$a,$bigname) = @_;
781 my $name = $a->{'Cname'};
783 foreach my $b (@{$a->{'Entries'}})
785 my ($s,$e,$out,$t,$end,$l) = @$b;
786 outtable($fh,$t,$bigname) unless $t->{'Done'};
788 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
789 my $var = $cpp ? '' : 'static';
790 my $const = $cpp ? '' : 'const';
791 print $fh "\n$var $const encpage_t $name\[",
792 scalar(@{$a->{'Entries'}}), "] = {\n";
793 foreach my $b (@{$a->{'Entries'}})
795 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
796 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
800 printf $fh findstring($bigname,$out);
806 print $fh ",",$t->{Cname};
807 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
814 my ($fh,$name,$a) = @_;
815 die "Changed - fix me for new structure";
816 foreach my $b (sort keys %$a)
818 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
830 my $s = do "unicore/Name.pl";
831 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
833 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
837 last if $s >= 0x10000;
838 my $e = length($2) ? hex($2) : $s;
839 for (my $i = $s; $i <= $e; $i++)
842 # print sprintf("U%04X $name\n",$i);
849 my ($cmap,$a,$t,$pre) = @_;
850 # warn sprintf("Page %x\n",$pre);
852 foreach my $key (sort keys %$raw) {
855 # RAW_OUT_BYTES => 2,
857 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
861 if ($next != $a && $next != $t) {
862 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
863 } elsif (length $out_bytes) {
865 $u = $pre|($u &0x3f);
867 my $s = sprintf "<U%04X> ",$u;
868 #foreach my $c (split(//,$out_bytes)) {
869 # $s .= sprintf "\\x%02X",ord($c);
871 # 9.5% faster changing that loop to this:
872 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
873 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
876 warn join(',',$u, @{$raw->{$key}},$a,$t);
883 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
884 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
885 print $fh "<code_set_name> \"$name\"\n";
889 print $fh "<mb_cur_min> $min_el\n";
893 print $fh "<mb_cur_max> $max_el\n";
897 print $fh "<subchar> ";
898 foreach my $c (split(//,$rep))
900 printf $fh "\\x%02X",ord($c);
905 output_ucm_page(\@cmap,$h,$h,0);
906 print $fh "#\nCHARMAP\n";
907 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
911 print $fh "END CHARMAP\n";
925 eval { require File::Find; };
928 push @inc, $inc unless $inc eq '.'; #skip current dir
932 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
933 $atime,$mtime,$ctime,$blksize,$blocks)
934 = lstat($_) or return;
938 $e2x_dir{$File::Find::dir} ||= $mtime;
942 warn join("\n", keys %e2x_dir), "\n";
943 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
945 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
952 eval { require Encode; };
953 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
954 # our used for variable expanstion
956 $_Version = $VERSION;
959 $_TableFiles = join(",", map {qq('$_')} @_);
960 $_Now = scalar localtime();
962 eval { require File::Spec; };
963 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
964 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
965 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
966 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
967 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
976 sub make_configlocal_pm {
977 eval { require Encode; };
978 $@ and die "Unable to require Encode: $@\n";
979 eval { require File::Spec; };
981 # our used for variable expanstion
982 my %in_core = map { $_ => 1 } (
983 'ascii', 'iso-8859-1', 'utf8',
984 'ascii-ctrl', 'null', 'utf-8-strict'
991 $File::Find::name =~ /\A\./ and return;
992 $File::Find::name =~ /\.pm\z/ or return;
993 $File::Find::name =~ m/\bEncode\b/ or return;
994 my $mod = $File::Find::name;
995 $mod =~ s/.*\bEncode\b/Encode/o;
998 warn qq{ require $mod;\n};
999 eval qq{ require $mod; };
1000 $@ and die "Can't require $mod: $@\n";
1001 for my $enc ( Encode->encodings() ) {
1003 $in_core{$enc} and next;
1004 $Encode::Config::ExtModule{$enc} and next;
1005 $LocalMod{$enc} ||= $mod;
1008 File::Find::find({wanted => $wanted}, @INC);
1010 for my $enc ( sort keys %LocalMod ) {
1012 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
1015 $_LocalVer = _mkversion();
1017 $_Inc = $INC{"Encode.pm"};
1018 $_Inc =~ s/\.pm$//o;
1019 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1020 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
1025 # v-string is now depreciated; use time() instead;
1026 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1027 #$yyyy += 1900, $mo +=1;
1028 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1033 eval { require File::Basename; };
1034 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
1035 File::Basename->import();
1036 my ($src, $dst, $clobber) = @_;
1037 if (!$clobber and -e $dst){
1038 warn "$dst exists. skipping\n";
1041 warn "Generating $dst...\n";
1042 open my $in, $src or die "$src : $!";
1043 if ((my $d = dirname($dst)) ne '.'){
1044 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
1046 open my $out, ">$dst" or die "$!";
1049 if (/^#### END_OF_HEADER/){
1052 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1060 enc2xs -- Perl Encode Module Generator
1065 enc2xs -M ModName mapfiles...
1070 F<enc2xs> builds a Perl extension for use by Encode from either
1071 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1072 Besides being used internally during the build process of the Encode
1073 module, you can use F<enc2xs> to add your own encoding to perl.
1074 No knowledge of XS is necessary.
1078 If you want to know as little about Perl as possible but need to
1079 add a new encoding, just read this chapter and forget the rest.
1085 Have a .ucm file ready. You can get it from somewhere or you can write
1086 your own from scratch or you can grab one from the Encode distribution
1087 and customize it. For the UCM format, see the next Chapter. In the
1088 example below, I'll call my theoretical encoding myascii, defined
1089 in I<my.ucm>. C<$> is a shell prompt.
1096 Issue a command as follows;
1098 $ enc2xs -M My my.ucm
1099 generating Makefile.PL
1104 Now take a look at your current directory. It should look like this.
1107 Makefile.PL My.pm my.ucm t/
1109 The following files were created.
1111 Makefile.PL - MakeMaker script
1112 My.pm - Encode submodule
1119 If you want *.ucm installed together with the modules, do as follows;
1123 $ enc2xs -M My Encode/*ucm
1129 Edit the files generated. You don't have to if you have no time AND no
1130 intention to give it to someone else. But it is a good idea to edit
1131 the pod and to add more tests.
1135 Now issue a command all Perl Mongers love:
1138 Writing Makefile for Encode::My
1142 Now all you have to do is make.
1145 cp My.pm blib/lib/Encode/My.pm
1146 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1147 -o encode_t.c -f encode_t.fnm
1148 Reading myascii (myascii)
1149 Writing compiled form
1150 128 bytes in string tables
1151 384 bytes (75%) saved spotting duplicates
1152 1 bytes (0.775%) saved using substrings
1154 chmod 644 blib/arch/auto/Encode/My/My.bs
1157 The time it takes varies depending on how fast your machine is and
1158 how large your encoding is. Unless you are working on something big
1159 like euc-tw, it won't take too long.
1163 You can "make install" already but you should test first.
1166 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1167 -e 'use Test::Harness qw(&runtests $verbose); \
1168 $verbose=0; runtests @ARGV;' t/*.t
1170 All tests successful.
1171 Files=1, Tests=2, 0 wallclock secs
1172 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1176 If you are content with the test result, just "make install"
1180 If you want to add your encoding to Encode's demand-loading list
1181 (so you don't have to "use Encode::YourEncoding"), run
1185 to update Encode::ConfigLocal, a module that controls local settings.
1186 After that, "use Encode;" is enough to load your encodings on demand.
1190 =head1 The Unicode Character Map
1192 Encode uses the Unicode Character Map (UCM) format for source character
1193 mappings. This format is used by IBM's ICU package and was adopted
1194 by Nick Ing-Simmons for use with the Encode module. Since UCM is
1195 more flexible than Tcl's Encoding Map and far more user-friendly,
1196 this is the recommended format for Encode now.
1198 A UCM file looks like this.
1203 <code_set_name> "US-ascii" # Required
1204 <code_set_alias> "ascii" # Optional
1205 <mb_cur_min> 1 # Required; usually 1
1206 <mb_cur_max> 1 # Max. # of bytes/char
1207 <subchar> \x3F # Substitution char
1210 <U0000> \x00 |0 # <control>
1211 <U0001> \x01 |0 # <control>
1212 <U0002> \x02 |0 # <control>
1214 <U007C> \x7C |0 # VERTICAL LINE
1215 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1216 <U007E> \x7E |0 # TILDE
1217 <U007F> \x7F |0 # <control>
1224 Anything that follows C<#> is treated as a comment.
1228 The header section continues until a line containing the word
1229 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1230 pair per line. Strings used as values must be quoted. Barewords are
1231 treated as numbers. I<\xXX> represents a byte.
1233 Most of the keywords are self-explanatory. I<subchar> means
1234 substitution character, not subcharacter. When you decode a Unicode
1235 sequence to this encoding but no matching character is found, the byte
1236 sequence defined here will be used. For most cases, the value here is
1237 \x3F; in ASCII, this is a question mark.
1241 CHARMAP starts the character map section. Each line has a form as
1244 <UXXXX> \xXX.. |0 # comment
1246 | | +- Fallback flag
1247 | +-------- Encoded byte sequence
1248 +-------------- Unicode Character ID in hex
1250 The format is roughly the same as a header section except for the
1251 fallback flag: | followed by 0..3. The meaning of the possible
1252 values is as follows:
1258 Round trip safe. A character decoded to Unicode encodes back to the
1259 same byte sequence. Most characters have this flag.
1263 Fallback for unicode -> encoding. When seen, enc2xs adds this
1264 character for the encode map only.
1268 Skip sub-char mapping should there be no code point.
1272 Fallback for encoding -> unicode. When seen, enc2xs adds this
1273 character for the decode map only.
1279 And finally, END OF CHARMAP ends the section.
1283 When you are manually creating a UCM file, you should copy ascii.ucm
1284 or an existing encoding which is close to yours, rather than write
1285 your own from scratch.
1287 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1288 is, unless your environment is EBCDIC.
1290 B<CAVEAT>: not all features in UCM are implemented. For example,
1291 icu:state is not used. Because of that, you need to write a perl
1292 module if you want to support algorithmical encodings, notably
1293 the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1294 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1296 =head2 Coping with duplicate mappings
1298 When you create a map, you SHOULD make your mappings round-trip safe.
1299 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1300 $data> stands for all characters that are marked as C<|0>. Here is
1307 Sort your map in Unicode order.
1311 When you have a duplicate entry, mark either one with '|1' or '|3'.
1315 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1319 Here is an example from big5-eten.
1324 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1328 --------------------------------------
1329 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1332 So it is round-trip safe for \xF9\xF9. But if the line above is upside
1333 down, here is what happens.
1336 --------------------------------------
1337 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1338 (\xF9\xF9 => U2550 is now overwritten!)
1340 The Encode package comes with F<ucmlint>, a crude but sufficient
1341 utility to check the integrity of a UCM file. Check under the
1342 Encode/bin directory for this.
1344 When in doubt, you can use F<ucmsort>, yet another utility under
1345 Encode/bin directory.
1354 L<http://www.icu-project.org/>
1358 ICU Character Mapping Tables
1359 L<http://www.icu-project.org/charset/>
1364 L<http://www.icu-project.org/userguide/conversion-data.html>
1376 # -Q to disable the duplicate codepoint test
1377 # -S make mapping errors fatal
1378 # -q to remove comments written to output files
1379 # -O to enable the (brute force) substring optimiser
1380 # -o <output> to specify the output file name (else it's the first arg)
1381 # -f <inlist> to give a file with a list of input files (else use the args)
1382 # -n <name> to name the encoding (else use the basename of the input file.
1384 With %seen holding array refs:
1386 865.66 real 28.80 user 8.79 sys
1387 7904 maximum resident set size
1388 1356 average shared memory size
1389 18566 average unshared data size
1390 229 average unshared stack size
1394 With %seen holding simple scalars:
1396 342.16 real 27.11 user 3.54 sys
1397 8388 maximum resident set size
1398 1394 average shared memory size
1399 14969 average unshared data size
1400 236 average unshared stack size
1404 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1405 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1406 RAM machine, but it's going to help even on modern machines.
1407 Swapping is bad, m'kay :-)