3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
12 my @orig_ARGV = @ARGV;
13 our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
15 # These may get re-ordered.
16 # RAW is a do_now as inserted by &enter
17 # AGG is an aggreagated do_now, as built up by &process
34 # (See the algorithm in encengine.c - we're building structures for it)
36 # There are two sorts of structures.
37 # "do_now" (an array, two variants of what needs storing) is whatever we need
38 # to do now we've read an input byte.
39 # It's housed in a "do_next" (which is how we got to it), and in turn points
40 # to a "do_next" which contains all the "do_now"s for the next input byte.
42 # There will be a "do_next" which is the start state.
43 # For a single byte encoding it's the only "do_next" - each "do_now" points
44 # back to it, and each "do_now" will cause bytes. There is no state.
46 # For a multi-byte encoding where all characters in the input are the same
47 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
48 # branching out from the start state, one step for each input byte.
49 # The leaf "do_now"s will all be at the same distance from the start state,
50 # only the leaf "do_now"s cause output bytes, and they in turn point back to
53 # For an encoding where there are varaible length input byte sequences, you
54 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
55 # as before the leaves will point back to the start state.
57 # The system will cope with escape encodings (imagine them as a mostly
58 # self-contained tree for each escape state, and cross links between trees
59 # at the state-switching characters) but so far no input format defines these.
61 # The system will also cope with having output "leaves" in the middle of
62 # the bifurcating branches, not just at the extremities, but again no
63 # input format does this yet.
65 # There are two variants of the "do_now" structure. The first, smaller variant
66 # is generated by &enter as the input file is read. There is one structure
67 # for each input byte. Say we are mapping a single byte encoding to a
68 # single byte encoding, with "ABCD" going "abcd". There will be
69 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
71 # &process then walks the tree, building aggregate "do_now" structres for
72 # adjacent bytes where possible. The aggregate is for a contiguous range of
73 # bytes which each produce the same length of output, each move to the
74 # same next state, and each have the same fallback flag.
75 # So our 4 RAW "do_now"s above become replaced by a single structure
77 # ["A", "D", "abcd", 1, ...]
78 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
79 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
80 # which maps very nicely into pointer arithmetic in C for encengine.c
84 # UTF-8 encode long hand - only covers part of perl's range
86 # chr() works in native space so convert value from table
87 # into that space before using chr().
88 my $ch = chr(utf8::unicode_to_native($_[0]));
89 # Now get core perl to encode that the way it likes.
97 ## my ($ch,$page) = @_; return chr($ch);
103 # encode double byte MS byte first
104 ## my ($ch,$page) = @_; return chr($page).chr($ch);
105 return chr ($_[1]) . chr $_[0];
110 # encode Multi-byte - single for 0..255 otherwise double
111 ## my ($ch,$page) = @_;
112 ## return &encode_D if $page;
114 return chr ($_[1]) . chr $_[0] if $_[1];
118 my %encode_types = (U => \&encode_U,
124 # Win32 does not expand globs on command line
125 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
129 # -Q to disable the duplicate codepoint test
130 # -S make mapping errors fatal
131 # -q to remove comments written to output files
132 # -O to enable the (brute force) substring optimiser
133 # -o <output> to specify the output file name (else it's the first arg)
134 # -f <inlist> to give a file with a list of input files (else use the args)
135 # -n <name> to name the encoding (else use the basename of the input file.
136 getopts('CM:SQqOo:f:n:',\%opt);
138 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
139 $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
141 # This really should go first, else the die here causes empty (non-erroneous)
142 # output files to be written.
144 if (exists $opt{'f'}) {
145 # -F is followed by name of file containing list of filenames
146 my $flist = $opt{'f'};
147 open(FLIST,$flist) || die "Cannot open $flist:$!";
148 chomp(@encfiles = <FLIST>);
154 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
155 chmod(0666,$cname) if -f $cname && !-w $cname;
156 open(C,">$cname") || die "Cannot open $cname:$!";
161 my ($doC,$doEnc,$doUcm,$doPet);
163 if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
166 $dname =~ s/(\.[^\.]*)?$/.exh/;
167 chmod(0666,$dname) if -f $cname && !-w $dname;
168 open(D,">$dname") || die "Cannot open $dname:$!";
169 $hname =~ s/(\.[^\.]*)?$/.h/;
170 chmod(0666,$hname) if -f $cname && !-w $hname;
171 open(H,">$hname") || die "Cannot open $hname:$!";
173 foreach my $fh (\*C,\*D,\*H)
175 print $fh <<"END" unless $opt{'q'};
177 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
178 This file was autogenerated by:
180 enc2xs VERSION $VERSION
185 if ($cname =~ /(\w+)\.xs$/)
187 print C "#include <EXTERN.h>\n";
188 print C "#include <perl.h>\n";
189 print C "#include <XSUB.h>\n";
190 print C "#define U8 U8\n";
192 print C "#include \"encode.h\"\n\n";
195 elsif ($cname =~ /\.enc$/)
199 elsif ($cname =~ /\.ucm$/)
203 elsif ($cname =~ /\.pet$/)
219 if ($a =~ /^.*-(\d+)/)
222 if ($b =~ /^.*-(\d+)/)
232 foreach my $enc (sort cmp_name @encfiles)
234 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
235 $name = $opt{'n'} if exists $opt{'n'};
240 compile_enc(\*E,lc($name));
244 compile_ucm(\*E,lc($name));
249 warn "Cannot open $enc for $name:$!";
255 print STDERR "Writing compiled form\n";
256 foreach my $name (sort cmp_name keys %encoding)
258 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
259 process($name.'_utf8',$e2u);
260 addstrings(\*C,$e2u);
262 process('utf8_'.$name,$u2e);
263 addstrings(\*C,$u2e);
265 outbigstring(\*C,"enctable");
266 foreach my $name (sort cmp_name keys %encoding)
268 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
269 outtable(\*C,$e2u, "enctable");
270 outtable(\*C,$u2e, "enctable");
272 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
274 foreach my $enc (sort cmp_name keys %encoding)
276 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
277 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
278 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
280 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
281 my $sym = "${enc}_encoding";
283 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
285 print C "static const U8 ${sym}_rep_character[] = \"$rep\";\n";
286 print C "static const char ${sym}_enc_name[] = \"$enc\";\n\n";
287 print C "const encode_t $sym = \n";
288 # This is to make null encoding work -- dankogai
289 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
292 # end of null tweak -- dankogai
293 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
296 foreach my $enc (sort cmp_name keys %encoding)
298 my $sym = "${enc}_encoding";
300 print H "extern encode_t $sym;\n";
301 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
304 if ($cname =~ /(\w+)\.xs$/)
310 Encode_XSEncoding(pTHX_ encode_t *enc)
313 HV *stash = gv_stashpv("Encode::XS", TRUE);
314 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
320 const char *name = enc->name[i++];
321 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
324 call_pv("Encode::define_encoding",G_DISCARD);
330 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
331 print C "BOOT:\n{\n";
332 print C "#include \"$dname\"\n";
335 # Close in void context is bad, m'kay
336 close(D) or warn "Error closing '$dname': $!";
337 close(H) or warn "Error closing '$hname': $!";
339 my $perc_saved = $saved/($strings + $saved) * 100;
340 my $perc_subsaved = $subsave/($strings + $subsave) * 100;
341 printf STDERR "%d bytes in string tables\n",$strings;
342 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
343 $saved, $perc_saved if $saved;
344 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
345 $subsave, $perc_subsaved if $subsave;
349 foreach my $name (sort cmp_name keys %encoding)
351 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
352 output_enc(\*C,$name,$e2u);
357 foreach my $name (sort cmp_name keys %encoding)
359 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
360 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
364 # writing half meg files and then not checking to see if you just filled the
366 close(C) or die "Error closing '$cname': $!";
368 # End of the main program.
380 last if /^\s*CHARMAP\s*$/i;
381 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
386 if (!defined($cs = $attr{'code_set_name'}))
388 warn "No <code_set_name> in $name\n";
392 $name = $cs unless exists $opt{'n'};
398 if (exists $attr{'subchar'})
401 #$attr{'subchar'} =~ /^\s*/cg;
402 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
403 #$erep = join('',map(chr(hex($_)),@byte));
404 $erep = $attr{'subchar'};
405 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
407 print "Reading $name ($cs)\n";
413 last if /^\s*END\s+CHARMAP\s*$/i;
415 my (@uni, @byte) = ();
416 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
417 or die "Bad line: $_";
418 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
419 push @uni, map { substr($_, 1) } split(/\+/, $1);
421 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
426 my $uch = join('', map { encode_U(hex($_)) } @uni );
427 my $ech = join('',map(chr(hex($_)),@byte));
428 my $el = length($ech);
429 $max_el = $el if (!defined($max_el) || $el > $max_el);
430 $min_el = $el if (!defined($min_el) || $el < $min_el);
441 # $fb is fallback flag
442 # 0 - round trip safe
443 # 1 - fallback for unicode -> enc
444 # 2 - skip sub-char mapping
445 # 3 - fallback enc -> unicode
446 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
447 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
456 die "$nfb entries without fallback, $hfb entries with\n";
458 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
470 while ($type = <$fh>)
472 last if $type !~ /^\s*#/;
475 return if $type eq 'E';
476 # Do the hash lookup once, rather than once per function call. 4% speedup.
477 my $type_func = $encode_types{$type};
478 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
479 warn "$type encoded $name\n";
481 # Save a defined test by setting these to defined values.
482 my $min_el = ~0; # A very big integer
483 my $max_el = 0; # Anything must be longer than 0
486 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
490 # use -Q to silence the seen test. Makefile.PL uses this by default.
491 $seen = {} unless $opt{Q};
496 my $page = hex($line);
501 # So why is it 1% faster to leave the my here?
503 $line =~ s/\r\n$/\n/;
504 die "$.:${line}Line should be exactly 65 characters long including
505 newline (".length($line).")" unless length ($line) == 65;
506 # Split line into groups of 4 hex digits, convert groups to ints
508 # map {hex $_} $line =~ /(....)/g
509 # This takes 63.75 (2.5% less time)
510 # unpack "n*", pack "H*", $line
511 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
512 # Doing it as while ($line =~ /(....)/g) took 74.63
513 foreach my $val (unpack "n*", pack "H*", $line)
515 next if $val == 0xFFFD;
516 my $ech = &$type_func($ch,$page);
517 if ($val || (!$ch && !$page))
519 my $el = length($ech);
520 $max_el = $el if $el > $max_el;
521 $min_el = $el if $el < $min_el;
522 my $uch = encode_U($val);
524 # We're doing the test.
525 # We don't need to read this quickly, so storing it as a scalar,
526 # rather than 3 (anon array, plus the 2 scalars it holds) saves
527 # RAM and may make us faster on low RAM systems. [see __END__]
528 if (exists $seen->{$uch})
530 warn sprintf("U%04X is %02X%02X and %04X\n",
531 $val,$page,$ch,$seen->{$uch});
536 $seen->{$uch} = $page << 8 | $ch;
539 # Passing 2 extra args each time is 3.6% slower!
540 # Even with having to add $fallback ||= 0 later
541 enter_fb0($e2u,$ech,$uch);
542 enter_fb0($u2e,$uch,$ech);
546 # No character at this position
547 # enter($e2u,$ech,undef,$e2u);
553 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
554 if $min_el > $max_el;
555 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
556 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
559 # my ($a,$s,$d,$t,$fb) = @_;
561 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
562 # state we shift to after this (multibyte) input character defaults to same
565 # Making sure it is defined seems to be faster than {no warnings;} in
566 # &process, or passing it in as 0 explicity.
567 # XXX $fallback ||= 0;
569 # Start at the beginning and work forwards through the string to zero.
570 # effectively we are removing 1 character from the front each time
571 # but we don't actually edit the string. [this alone seems to be 14% speedup]
572 # Hence -$pos is the length of the remaining string.
573 my $pos = -length $inbytes;
575 my $byte = substr $inbytes, $pos, 1;
578 # RAW_OUT_BYTES => 2,
580 # to unicode an array would seem to be better, because the pages are dense.
581 # from unicode can be very sparse, favouring a hash.
582 # hash using the bytes (all length 1) as keys rather than ord value,
583 # as it's easier to sort these in &process.
585 # It's faster to always add $fallback even if it's undef, rather than
586 # choosing between 3 and 4 element array. (hence why we set it defined
588 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
589 # When $pos was -1 we were at the last input character.
591 $do_now->[RAW_OUT_BYTES] = $outbytes;
592 $do_now->[RAW_NEXT] = $next;
595 # Tail recursion. The intermdiate state may not have a name yet.
596 $current = $do_now->[RAW_NEXT];
600 # This is purely for optimistation. It's just &enter hard coded for $fallback
601 # of 0, using only a 3 entry array ref to save memory for every entry.
603 my ($current,$inbytes,$outbytes,$next) = @_;
606 my $pos = -length $inbytes;
608 my $byte = substr $inbytes, $pos, 1;
609 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
611 $do_now->[RAW_OUT_BYTES] = $outbytes;
612 $do_now->[RAW_NEXT] = $next;
615 $current = $do_now->[RAW_NEXT];
625 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
628 foreach my $key (sort keys %$raw) {
631 # RAW_OUT_BYTES => 2,
633 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
634 # Now we are converting from raw to aggregate, switch from 1 byte strings
639 # If this == fails, we're going to reset $agg_max_in below anyway.
640 $b == ++$agg_max_in &&
641 # References in numeric context give the pointer as an int.
642 $agg_next == $next &&
643 $agg_in_len == $in_len &&
644 $agg_out_len == length $out_bytes &&
645 $agg_fallback == $fallback
646 # && length($l->[AGG_OUT_BYTES]) < 16
648 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
649 # we can aggregate this byte onto the end.
650 $l->[AGG_MAX_IN] = $b;
651 $l->[AGG_OUT_BYTES] .= $out_bytes;
655 # AGG_OUT_BYTES => 2,
660 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
661 # (only gains .6% on euc-jp -- is it worth it?)
662 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
663 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
664 $agg_fallback = $fallback];
666 if (exists $next->{Cname}) {
667 $next->{'Forward'} = 1 if $next != $a;
669 process(sprintf("%s_%02x",$name,$b),$next);
672 # encengine.c rules say that last entry must be for 255
673 if ($agg_max_in < 255) {
674 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
676 $a->{'Entries'} = \@ent;
683 my $name = $a->{'Cname'};
685 foreach my $b (@{$a->{'Entries'}})
687 next unless $b->[AGG_OUT_LEN];
688 $strings{$b->[AGG_OUT_BYTES]} = undef;
692 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
693 my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
694 my $const = $cpp ? '' : 'const';
695 print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
697 $a->{'DoneStrings'} = 1;
698 foreach my $b (@{$a->{'Entries'}})
700 my ($s,$e,$out,$t,$end,$l) = @$b;
701 addstrings($fh,$t) unless $t->{'DoneStrings'};
711 # Make the big string in the string accumulator. Longest first, on the hope
712 # that this makes it more likely that we find the short strings later on.
713 # Not sure if it helps sorting strings of the same length lexcically.
714 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
715 my $index = index $string_acc, $s;
717 $saved += length($s);
718 $strings_in_acc{$s} = $index;
722 my $sublength = length $s;
723 while (--$sublength > 0) {
724 # progressively lop characters off the end, to see if the start of
725 # the new string overlaps the end of the accumulator.
726 if (substr ($string_acc, -$sublength)
727 eq substr ($s, 0, $sublength)) {
728 $subsave += $sublength;
729 $strings_in_acc{$s} = length ($string_acc) - $sublength;
730 # append the last bit on the end.
731 $string_acc .= substr ($s, $sublength);
734 # or if the end of the new string overlaps the start of the
736 next unless substr ($string_acc, 0, $sublength)
737 eq substr ($s, -$sublength);
738 # well, the last $sublength characters of the accumulator match.
739 # so as we're prepending to the accumulator, need to shift all our
740 # existing offsets forwards
741 $_ += $sublength foreach values %strings_in_acc;
742 $subsave += $sublength;
743 $strings_in_acc{$s} = 0;
744 # append the first bit on the start.
745 $string_acc = substr ($s, 0, -$sublength) . $string_acc;
749 # Optimiser (if it ran) found nothing, so just going have to tack the
750 # whole thing on the end.
751 $strings_in_acc{$s} = length $string_acc;
757 $strings = length $string_acc;
758 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
759 my $var = $cpp ? '' : 'static';
760 my $definition = "\n$var const U8 $name\[$strings] = { " .
761 join(',',unpack "C*",$string_acc);
762 # We have a single long line. Split it at convenient commas.
763 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
764 print $fh substr ($definition, pos $definition), " };\n";
769 my $offset = $strings_in_acc{$s};
770 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
771 unless defined $offset;
777 my ($fh,$a,$bigname) = @_;
778 my $name = $a->{'Cname'};
780 foreach my $b (@{$a->{'Entries'}})
782 my ($s,$e,$out,$t,$end,$l) = @$b;
783 outtable($fh,$t,$bigname) unless $t->{'Done'};
785 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
786 my $var = $cpp ? '' : 'static';
787 my $const = $cpp ? '' : 'const';
788 print $fh "\n$var $const encpage_t $name\[",
789 scalar(@{$a->{'Entries'}}), "] = {\n";
790 foreach my $b (@{$a->{'Entries'}})
792 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
793 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
797 printf $fh findstring($bigname,$out);
803 print $fh ",",$t->{Cname};
804 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
811 my ($fh,$name,$a) = @_;
812 die "Changed - fix me for new structure";
813 foreach my $b (sort keys %$a)
815 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
827 my $s = do "unicore/Name.pl";
828 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
830 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
834 last if $s >= 0x10000;
835 my $e = length($2) ? hex($2) : $s;
836 for (my $i = $s; $i <= $e; $i++)
839 # print sprintf("U%04X $name\n",$i);
846 my ($cmap,$a,$t,$pre) = @_;
847 # warn sprintf("Page %x\n",$pre);
849 foreach my $key (sort keys %$raw) {
852 # RAW_OUT_BYTES => 2,
854 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
858 if ($next != $a && $next != $t) {
859 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
860 } elsif (length $out_bytes) {
862 $u = $pre|($u &0x3f);
864 my $s = sprintf "<U%04X> ",$u;
865 #foreach my $c (split(//,$out_bytes)) {
866 # $s .= sprintf "\\x%02X",ord($c);
868 # 9.5% faster changing that loop to this:
869 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
870 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
873 warn join(',',$u, @{$raw->{$key}},$a,$t);
880 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
881 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
882 print $fh "<code_set_name> \"$name\"\n";
886 print $fh "<mb_cur_min> $min_el\n";
890 print $fh "<mb_cur_max> $max_el\n";
894 print $fh "<subchar> ";
895 foreach my $c (split(//,$rep))
897 printf $fh "\\x%02X",ord($c);
902 output_ucm_page(\@cmap,$h,$h,0);
903 print $fh "#\nCHARMAP\n";
904 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
908 print $fh "END CHARMAP\n";
922 eval { require File::Find; };
925 push @inc, $inc unless $inc eq '.'; #skip current dir
929 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
930 $atime,$mtime,$ctime,$blksize,$blocks)
931 = lstat($_) or return;
935 $e2x_dir{$File::Find::dir} ||= $mtime;
939 warn join("\n", keys %e2x_dir), "\n";
940 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
942 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
949 eval { require Encode; };
950 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
951 # our used for variable expanstion
953 $_Version = $VERSION;
956 $_TableFiles = join(",", map {qq('$_')} @_);
957 $_Now = scalar localtime();
959 eval { require File::Spec; };
960 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
961 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
962 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
963 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
964 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
973 sub make_configlocal_pm {
974 eval { require Encode; };
975 $@ and die "Unable to require Encode: $@\n";
976 eval { require File::Spec; };
978 # our used for variable expanstion
979 my %in_core = map { $_ => 1 } (
980 'ascii', 'iso-8859-1', 'utf8',
981 'ascii-ctrl', 'null', 'utf-8-strict'
988 $File::Find::name =~ /\A\./ and return;
989 $File::Find::name =~ /\.pm\z/ or return;
990 $File::Find::name =~ m/\bEncode\b/ or return;
991 my $mod = $File::Find::name;
992 $mod =~ s/.*\bEncode\b/Encode/o;
995 warn qq{ require $mod;\n};
996 eval qq{ require $mod; };
997 $@ and die "Can't require $mod: $@\n";
998 for my $enc ( Encode->encodings() ) {
1000 $in_core{$enc} and next;
1001 $Encode::Config::ExtModule{$enc} and next;
1002 $LocalMod{$enc} ||= $mod;
1005 File::Find::find({wanted => $wanted}, @INC);
1007 for my $enc ( sort keys %LocalMod ) {
1009 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
1012 $_LocalVer = _mkversion();
1014 $_Inc = $INC{"Encode.pm"};
1015 $_Inc =~ s/\.pm$//o;
1016 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1017 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
1022 # v-string is now depreciated; use time() instead;
1023 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1024 #$yyyy += 1900, $mo +=1;
1025 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1030 eval { require File::Basename; };
1031 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
1032 File::Basename->import();
1033 my ($src, $dst, $clobber) = @_;
1034 if (!$clobber and -e $dst){
1035 warn "$dst exists. skipping\n";
1038 warn "Generating $dst...\n";
1039 open my $in, $src or die "$src : $!";
1040 if ((my $d = dirname($dst)) ne '.'){
1041 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
1043 open my $out, ">$dst" or die "$!";
1046 if (/^#### END_OF_HEADER/){
1049 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1057 enc2xs -- Perl Encode Module Generator
1062 enc2xs -M ModName mapfiles...
1067 F<enc2xs> builds a Perl extension for use by Encode from either
1068 Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1069 Besides being used internally during the build process of the Encode
1070 module, you can use F<enc2xs> to add your own encoding to perl.
1071 No knowledge of XS is necessary.
1075 If you want to know as little about Perl as possible but need to
1076 add a new encoding, just read this chapter and forget the rest.
1082 Have a .ucm file ready. You can get it from somewhere or you can write
1083 your own from scratch or you can grab one from the Encode distribution
1084 and customize it. For the UCM format, see the next Chapter. In the
1085 example below, I'll call my theoretical encoding myascii, defined
1086 in I<my.ucm>. C<$> is a shell prompt.
1093 Issue a command as follows;
1095 $ enc2xs -M My my.ucm
1096 generating Makefile.PL
1101 Now take a look at your current directory. It should look like this.
1104 Makefile.PL My.pm my.ucm t/
1106 The following files were created.
1108 Makefile.PL - MakeMaker script
1109 My.pm - Encode submodule
1116 If you want *.ucm installed together with the modules, do as follows;
1120 $ enc2xs -M My Encode/*ucm
1126 Edit the files generated. You don't have to if you have no time AND no
1127 intention to give it to someone else. But it is a good idea to edit
1128 the pod and to add more tests.
1132 Now issue a command all Perl Mongers love:
1135 Writing Makefile for Encode::My
1139 Now all you have to do is make.
1142 cp My.pm blib/lib/Encode/My.pm
1143 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1144 -o encode_t.c -f encode_t.fnm
1145 Reading myascii (myascii)
1146 Writing compiled form
1147 128 bytes in string tables
1148 384 bytes (75%) saved spotting duplicates
1149 1 bytes (0.775%) saved using substrings
1151 chmod 644 blib/arch/auto/Encode/My/My.bs
1154 The time it takes varies depending on how fast your machine is and
1155 how large your encoding is. Unless you are working on something big
1156 like euc-tw, it won't take too long.
1160 You can "make install" already but you should test first.
1163 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1164 -e 'use Test::Harness qw(&runtests $verbose); \
1165 $verbose=0; runtests @ARGV;' t/*.t
1167 All tests successful.
1168 Files=1, Tests=2, 0 wallclock secs
1169 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1173 If you are content with the test result, just "make install"
1177 If you want to add your encoding to Encode's demand-loading list
1178 (so you don't have to "use Encode::YourEncoding"), run
1182 to update Encode::ConfigLocal, a module that controls local settings.
1183 After that, "use Encode;" is enough to load your encodings on demand.
1187 =head1 The Unicode Character Map
1189 Encode uses the Unicode Character Map (UCM) format for source character
1190 mappings. This format is used by IBM's ICU package and was adopted
1191 by Nick Ing-Simmons for use with the Encode module. Since UCM is
1192 more flexible than Tcl's Encoding Map and far more user-friendly,
1193 this is the recommended formet for Encode now.
1195 A UCM file looks like this.
1200 <code_set_name> "US-ascii" # Required
1201 <code_set_alias> "ascii" # Optional
1202 <mb_cur_min> 1 # Required; usually 1
1203 <mb_cur_max> 1 # Max. # of bytes/char
1204 <subchar> \x3F # Substitution char
1207 <U0000> \x00 |0 # <control>
1208 <U0001> \x01 |0 # <control>
1209 <U0002> \x02 |0 # <control>
1211 <U007C> \x7C |0 # VERTICAL LINE
1212 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1213 <U007E> \x7E |0 # TILDE
1214 <U007F> \x7F |0 # <control>
1221 Anything that follows C<#> is treated as a comment.
1225 The header section continues until a line containing the word
1226 CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1227 pair per line. Strings used as values must be quoted. Barewords are
1228 treated as numbers. I<\xXX> represents a byte.
1230 Most of the keywords are self-explanatory. I<subchar> means
1231 substitution character, not subcharacter. When you decode a Unicode
1232 sequence to this encoding but no matching character is found, the byte
1233 sequence defined here will be used. For most cases, the value here is
1234 \x3F; in ASCII, this is a question mark.
1238 CHARMAP starts the character map section. Each line has a form as
1241 <UXXXX> \xXX.. |0 # comment
1243 | | +- Fallback flag
1244 | +-------- Encoded byte sequence
1245 +-------------- Unicode Character ID in hex
1247 The format is roughly the same as a header section except for the
1248 fallback flag: | followed by 0..3. The meaning of the possible
1249 values is as follows:
1255 Round trip safe. A character decoded to Unicode encodes back to the
1256 same byte sequence. Most characters have this flag.
1260 Fallback for unicode -> encoding. When seen, enc2xs adds this
1261 character for the encode map only.
1265 Skip sub-char mapping should there be no code point.
1269 Fallback for encoding -> unicode. When seen, enc2xs adds this
1270 character for the decode map only.
1276 And finally, END OF CHARMAP ends the section.
1280 When you are manually creating a UCM file, you should copy ascii.ucm
1281 or an existing encoding which is close to yours, rather than write
1282 your own from scratch.
1284 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1285 is, unless your environment is EBCDIC.
1287 B<CAVEAT>: not all features in UCM are implemented. For example,
1288 icu:state is not used. Because of that, you need to write a perl
1289 module if you want to support algorithmical encodings, notably
1290 the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1291 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1293 =head2 Coping with duplicate mappings
1295 When you create a map, you SHOULD make your mappings round-trip safe.
1296 That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1297 $data> stands for all characters that are marked as C<|0>. Here is
1304 Sort your map in Unicode order.
1308 When you have a duplicate entry, mark either one with '|1' or '|3'.
1312 And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1316 Here is an example from big5-eten.
1321 Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1325 --------------------------------------
1326 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1329 So it is round-trip safe for \xF9\xF9. But if the line above is upside
1330 down, here is what happens.
1333 --------------------------------------
1334 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1335 (\xF9\xF9 => U2550 is now overwritten!)
1337 The Encode package comes with F<ucmlint>, a crude but sufficient
1338 utility to check the integrity of a UCM file. Check under the
1339 Encode/bin directory for this.
1341 When in doubt, you can use F<ucmsort>, yet another utility under
1342 Encode/bin directory.
1351 L<http://oss.software.ibm.com/icu/>
1355 ICU Character Mapping Tables
1356 L<http://oss.software.ibm.com/icu/charset/>
1361 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1373 # -Q to disable the duplicate codepoint test
1374 # -S make mapping errors fatal
1375 # -q to remove comments written to output files
1376 # -O to enable the (brute force) substring optimiser
1377 # -o <output> to specify the output file name (else it's the first arg)
1378 # -f <inlist> to give a file with a list of input files (else use the args)
1379 # -n <name> to name the encoding (else use the basename of the input file.
1381 With %seen holding array refs:
1383 865.66 real 28.80 user 8.79 sys
1384 7904 maximum resident set size
1385 1356 average shared memory size
1386 18566 average unshared data size
1387 229 average unshared stack size
1391 With %seen holding simple scalars:
1393 342.16 real 27.11 user 3.54 sys
1394 8388 maximum resident set size
1395 1394 average shared memory size
1396 14969 average unshared data size
1397 236 average unshared stack size
1401 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1402 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1403 RAM machine, but it's going to help even on modern machines.
1404 Swapping is bad, m'kay :-)