3 unshift @INC, qw(../../lib ../../../lib ../../../../lib);
4 $ENV{PATH} .= ';../..;../../..;../../../..' if $^O eq 'MSWin32';
9 our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
12 # These may get re-ordered.
13 # RAW is a do_now as inserted by &enter
14 # AGG is an aggreagated do_now, as built up by &process
29 # (See the algorithm in encengine.c - we're building structures for it)
31 # There are two sorts of structures.
32 # "do_now" (an array, two variants of what needs storing) is whatever we need
33 # to do now we've read an input byte.
34 # It's housed in a "do_next" (which is how we got to it), and in turn points
35 # to a "do_next" which contains all the "do_now"s for the next input byte.
37 # There will be a "do_next" which is the start state.
38 # For a single byte encoding it's the only "do_next" - each "do_now" points
39 # back to it, and each "do_now" will cause bytes. There is no state.
41 # For a multi-byte encoding where all characters in the input are the same
42 # length, then there will be a tree of "do_now"->"do_next"->"do_now"
43 # branching out from the start state, one step for each input byte.
44 # The leaf "do_now"s will all be at the same distance from the start state,
45 # only the leaf "do_now"s cause output bytes, and they in turn point back to
48 # For an encoding where there are varaible length input byte sequences, you
49 # will encounter a leaf "do_now" sooner for the shorter input sequences, but
50 # as before the leaves will point back to the start state.
52 # The system will cope with escape encodings (imagine them as a mostly
53 # self-contained tree for each escape state, and cross links between trees
54 # at the state-switching characters) but so far no input format defines these.
56 # The system will also cope with having output "leaves" in the middle of
57 # the bifurcating branches, not just at the extremities, but again no
58 # input format does this yet.
60 # There are two variants of the "do_now" structure. The first, smaller variant
61 # is generated by &enter as the input file is read. There is one structure
62 # for each input byte. Say we are mapping a single byte encoding to a
63 # single byte encoding, with "ABCD" going "abcd". There will be
64 # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
66 # &process then walks the tree, building aggregate "do_now" structres for
67 # adjacent bytes where possible. The aggregate is for a contiguous range of
68 # bytes which each produce the same length of output, each move to the
69 # same next state, and each have the same fallback flag.
70 # So our 4 RAW "do_now"s above become replaced by a single structure
72 # ["A", "D", "abcd", 1, ...]
73 # ie, for an input byte $_ in "A".."D", output 1 byte, found as
74 # substr ("abcd", (ord $_ - ord "A") * 1, 1)
75 # which maps very nicely into pointer arithmetic in C for encengine.c
79 # UTF-8 encode long hand - only covers part of perl's range
81 # chr() works in native space so convert value from table
82 # into that space before using chr().
83 my $ch = chr(utf8::unicode_to_native($_[0]));
84 # Now get core perl to encode that the way it likes.
92 ## my ($ch,$page) = @_; return chr($ch);
98 # encode double byte MS byte first
99 ## my ($ch,$page) = @_; return chr($page).chr($ch);
100 return chr ($_[1]) . chr $_[0];
105 # encode Multi-byte - single for 0..255 otherwise double
106 ## my ($ch,$page) = @_;
107 ## return &encode_D if $page;
109 return chr ($_[1]) . chr $_[0] if $_[1];
113 my %encode_types = (U => \&encode_U,
119 # Win32 does not expand globs on command line
120 eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
124 # -Q to disable the duplicate codepoint test
125 # -S make mapping errors fatal
126 # -q to remove comments written to output files
127 # -O to enable the (brute force) substring optimiser
128 # -o <output> to specify the output file name (else it's the first arg)
129 # -f <inlist> to give a file with a list of input files (else use the args)
130 # -n <name> to name the encoding (else use the basename of the input file.
131 getopts('M:SQqOo:f:n:',\%opt);
133 $opt{M} and make_makefile_pl($opt{M}, @ARGV);
135 # This really should go first, else the die here causes empty (non-erroneous)
136 # output files to be written.
138 if (exists $opt{'f'}) {
139 # -F is followed by name of file containing list of filenames
140 my $flist = $opt{'f'};
141 open(FLIST,$flist) || die "Cannot open $flist:$!";
142 chomp(@encfiles = <FLIST>);
148 my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
149 chmod(0666,$cname) if -f $cname && !-w $cname;
150 open(C,">$cname") || die "Cannot open $cname:$!";
155 my ($doC,$doEnc,$doUcm,$doPet);
157 if ($cname =~ /\.(c|xs)$/)
160 $dname =~ s/(\.[^\.]*)?$/_def.h/;
161 chmod(0666,$dname) if -f $cname && !-w $dname;
162 open(D,">$dname") || die "Cannot open $dname:$!";
163 $hname =~ s/(\.[^\.]*)?$/.h/;
164 chmod(0666,$hname) if -f $cname && !-w $hname;
165 open(H,">$hname") || die "Cannot open $hname:$!";
167 foreach my $fh (\*C,\*D,\*H)
169 print $fh <<"END" unless $opt{'q'};
171 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
172 This file was autogenerated by:
178 if ($cname =~ /(\w+)\.xs$/)
180 print C "#include <EXTERN.h>\n";
181 print C "#include <perl.h>\n";
182 print C "#include <XSUB.h>\n";
183 print C "#define U8 U8\n";
185 print C "#include \"encode.h\"\n";
188 elsif ($cname =~ /\.enc$/)
192 elsif ($cname =~ /\.ucm$/)
196 elsif ($cname =~ /\.pet$/)
209 if ($a =~ /^.*-(\d+)/)
212 if ($b =~ /^.*-(\d+)/)
222 foreach my $enc (sort cmp_name @encfiles)
224 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
225 $name = $opt{'n'} if exists $opt{'n'};
230 compile_enc(\*E,lc($name));
234 compile_ucm(\*E,lc($name));
239 warn "Cannot open $enc for $name:$!";
245 print STDERR "Writing compiled form\n";
246 foreach my $name (sort cmp_name keys %encoding)
248 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
249 output(\*C,$name.'_utf8',$e2u);
250 output(\*C,'utf8_'.$name,$u2e);
251 push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
253 foreach my $enc (sort cmp_name keys %encoding)
255 my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
256 my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
257 my $sym = "${enc}_encoding";
259 print C "encode_t $sym = \n";
260 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
263 foreach my $enc (sort cmp_name keys %encoding)
265 my $sym = "${enc}_encoding";
267 print H "extern encode_t $sym;\n";
268 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
271 if ($cname =~ /(\w+)\.xs$/)
277 Encode_XSEncoding(pTHX_ encode_t *enc)
280 HV *stash = gv_stashpv("Encode::XS", TRUE);
281 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
287 const char *name = enc->name[i++];
288 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
291 call_pv("Encode::define_encoding",G_DISCARD);
297 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
298 print C "BOOT:\n{\n";
299 print C "#include \"$dname\"\n";
302 # Close in void context is bad, m'kay
303 close(D) or warn "Error closing '$dname': $!";
304 close(H) or warn "Error closing '$hname': $!";
306 my $perc_saved = $strings/($strings + $saved) * 100;
307 my $perc_subsaved = $strings/($strings + $subsave) * 100;
308 printf STDERR "%d bytes in string tables\n",$strings;
309 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
310 $saved, $perc_saved if $saved;
311 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
312 $subsave, $perc_subsaved if $subsave;
316 foreach my $name (sort cmp_name keys %encoding)
318 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
319 output_enc(\*C,$name,$e2u);
324 foreach my $name (sort cmp_name keys %encoding)
326 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
327 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
331 # writing half meg files and then not checking to see if you just filled the
333 close(C) or die "Error closing '$cname': $!";
335 # End of the main program.
347 last if /^\s*CHARMAP\s*$/i;
348 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
353 if (!defined($cs = $attr{'code_set_name'}))
355 warn "No <code_set_name> in $name\n";
359 $name = $cs unless exists $opt{'n'};
365 if (exists $attr{'subchar'})
368 $attr{'subchar'} =~ /^\s*/cg;
369 push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
370 $erep = join('',map(chr(hex($_)),@byte));
372 print "Reading $name ($cs)\n";
378 last if /^\s*END\s+CHARMAP\s*$/i;
382 $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
383 push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
384 $fb = $1 if /\G\s*(\|[0-3])/gc;
385 # warn "$_: $u @byte | $fb\n";
386 die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
389 my $uch = encode_U(hex($u));
390 my $ech = join('',map(chr(hex($_)),@byte));
391 my $el = length($ech);
392 $max_el = $el if (!defined($max_el) || $el > $max_el);
393 $min_el = $el if (!defined($min_el) || $el < $min_el);
404 # $fb is fallback flag
405 # 0 - round trip safe
406 # 1 - fallback for unicode -> enc
407 # 2 - skip sub-char mapping
408 # 3 - fallback enc -> unicode
409 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
410 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
419 die "$nfb entries without fallback, $hfb entries with\n";
421 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
433 while ($type = <$fh>)
435 last if $type !~ /^\s*#/;
438 return if $type eq 'E';
439 # Do the hash lookup once, rather than once per function call. 4% speedup.
440 my $type_func = $encode_types{$type};
441 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
442 warn "$type encoded $name\n";
444 # Save a defined test by setting these to defined values.
445 my $min_el = ~0; # A very big integer
446 my $max_el = 0; # Anything must be longer than 0
449 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
453 # use -Q to silence the seen test. Makefile.PL uses this by default.
454 $seen = {} unless $opt{Q};
459 my $page = hex($line);
464 # So why is it 1% faster to leave the my here?
466 $line =~ s/\r\n$/\n/;
467 die "$.:${line}Line should be exactly 65 characters long including
468 newline (".length($line).")" unless length ($line) == 65;
469 # Split line into groups of 4 hex digits, convert groups to ints
471 # map {hex $_} $line =~ /(....)/g
472 # This takes 63.75 (2.5% less time)
473 # unpack "n*", pack "H*", $line
474 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
475 # Doing it as while ($line =~ /(....)/g) took 74.63
476 foreach my $val (unpack "n*", pack "H*", $line)
478 next if $val == 0xFFFD;
479 my $ech = &$type_func($ch,$page);
480 if ($val || (!$ch && !$page))
482 my $el = length($ech);
483 $max_el = $el if $el > $max_el;
484 $min_el = $el if $el < $min_el;
485 my $uch = encode_U($val);
487 # We're doing the test.
488 # We don't need to read this quickly, so storing it as a scalar,
489 # rather than 3 (anon array, plus the 2 scalars it holds) saves
490 # RAM and may make us faster on low RAM systems. [see __END__]
491 if (exists $seen->{$uch})
493 warn sprintf("U%04X is %02X%02X and %04X\n",
494 $val,$page,$ch,$seen->{$uch});
499 $seen->{$uch} = $page << 8 | $ch;
502 # Passing 2 extra args each time is 3.6% slower!
503 # Even with having to add $fallback ||= 0 later
504 enter_fb0($e2u,$ech,$uch);
505 enter_fb0($u2e,$uch,$ech);
509 # No character at this position
510 # enter($e2u,$ech,undef,$e2u);
516 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
517 if $min_el > $max_el;
518 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
519 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
522 # my ($a,$s,$d,$t,$fb) = @_;
524 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
525 # state we shift to after this (multibyte) input character defaults to same
528 # Making sure it is defined seems to be faster than {no warnings;} in
529 # &process, or passing it in as 0 explicity.
530 # XXX $fallback ||= 0;
532 # Start at the beginning and work forwards through the string to zero.
533 # effectively we are removing 1 character from the front each time
534 # but we don't actually edit the string. [this alone seems to be 14% speedup]
535 # Hence -$pos is the length of the remaining string.
536 my $pos = -length $inbytes;
538 my $byte = substr $inbytes, $pos, 1;
541 # RAW_OUT_BYTES => 2,
543 # to unicode an array would seem to be better, because the pages are dense.
544 # from unicode can be very sparse, favouring a hash.
545 # hash using the bytes (all length 1) as keys rather than ord value,
546 # as it's easier to sort these in &process.
548 # It's faster to always add $fallback even if it's undef, rather than
549 # choosing between 3 and 4 element array. (hence why we set it defined
551 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
552 # When $pos was -1 we were at the last input character.
554 $do_now->[RAW_OUT_BYTES] = $outbytes;
555 $do_now->[RAW_NEXT] = $next;
558 # Tail recursion. The intermdiate state may not have a name yet.
559 $current = $do_now->[RAW_NEXT];
563 # This is purely for optimistation. It's just &enter hard coded for $fallback
564 # of 0, using only a 3 entry array ref to save memory for every entry.
566 my ($current,$inbytes,$outbytes,$next) = @_;
569 my $pos = -length $inbytes;
571 my $byte = substr $inbytes, $pos, 1;
572 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
574 $do_now->[RAW_OUT_BYTES] = $outbytes;
575 $do_now->[RAW_NEXT] = $next;
578 $current = $do_now->[RAW_NEXT];
585 my ($fh,$name,$s) = @_;
586 my $sym = $strings{$s};
589 $saved += length($s);
594 foreach my $o (keys %strings)
596 next unless (my $i = index($o,$s)) >= 0;
598 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
599 # a hexadecimal floating point constant. Silly gcc. Only p
600 # introduces a floating point constant. Put the space in to stop it
602 $sym .= sprintf(" +0x%02x",$i) if ($i);
603 $subsave += length($s);
604 return $strings{$s} = $sym;
607 $strings{$s} = $sym = $name;
608 $strings += length($s);
609 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
610 # Maybe we should assert that these are all <256.
611 $definition .= join(',',unpack "C*",$s);
612 # We have a single long line. Split it at convenient commas.
613 $definition =~ s/(.{74,77},)/$1\n/g;
614 print $fh "$definition };\n\n";
625 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
628 foreach my $key (sort keys %$raw) {
631 # RAW_OUT_BYTES => 2,
633 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
634 # Now we are converting from raw to aggregate, switch from 1 byte strings
639 # If this == fails, we're going to reset $agg_max_in below anyway.
640 $b == ++$agg_max_in &&
641 # References in numeric context give the pointer as an int.
642 $agg_next == $next &&
643 $agg_in_len == $in_len &&
644 $agg_out_len == length $out_bytes &&
645 $agg_fallback == $fallback
646 # && length($l->[AGG_OUT_BYTES]) < 16
648 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
649 # we can aggregate this byte onto the end.
650 $l->[AGG_MAX_IN] = $b;
651 $l->[AGG_OUT_BYTES] .= $out_bytes;
655 # AGG_OUT_BYTES => 2,
660 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
661 # (only gains .6% on euc-jp -- is it worth it?)
662 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
663 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
664 $agg_fallback = $fallback];
666 if (exists $next->{Cname}) {
667 $next->{'Forward'} = 1 if $next != $a;
669 process(sprintf("%s_%02x",$name,$b),$next);
672 # encengine.c rules say that last entry must be for 255
673 if ($agg_max_in < 255) {
674 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
676 $a->{'Entries'} = \@ent;
682 my $name = $a->{'Cname'};
684 foreach my $b (@{$a->{'Entries'}})
686 next unless $b->[AGG_OUT_LEN];
687 my $s = $b->[AGG_MIN_IN];
688 my $e = $b->[AGG_MAX_IN];
689 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
693 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
696 foreach my $b (@{$a->{'Entries'}})
698 my ($s,$e,$out,$t,$end,$l) = @$b;
699 outtable($fh,$t) unless $t->{'Done'};
701 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
702 foreach my $b (@{$a->{'Entries'}})
704 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
709 printf $fh outstring($fh,'',$out);
715 print $fh ",",$t->{Cname};
716 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
723 my ($fh,$name,$a) = @_;
731 my ($fh,$name,$a) = @_;
732 die "Changed - fix me for new structure";
733 foreach my $b (sort keys %$a)
735 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
747 my $s = do "unicore/Name.pl";
748 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
750 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
754 last if $s >= 0x10000;
755 my $e = length($2) ? hex($2) : $s;
756 for (my $i = $s; $i <= $e; $i++)
759 # print sprintf("U%04X $name\n",$i);
766 my ($cmap,$a,$t,$pre) = @_;
767 # warn sprintf("Page %x\n",$pre);
769 foreach my $key (sort keys %$raw) {
772 # RAW_OUT_BYTES => 2,
774 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
778 if ($next != $a && $next != $t) {
779 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
780 } elsif (length $out_bytes) {
782 $u = $pre|($u &0x3f);
784 my $s = sprintf "<U%04X> ",$u;
785 #foreach my $c (split(//,$out_bytes)) {
786 # $s .= sprintf "\\x%02X",ord($c);
788 # 9.5% faster changing that loop to this:
789 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
790 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
793 warn join(',',$u, @{$raw->{$key}},$a,$t);
800 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
801 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
802 print $fh "<code_set_name> \"$name\"\n";
806 print $fh "<mb_cur_min> $min_el\n";
810 print $fh "<mb_cur_max> $max_el\n";
814 print $fh "<subchar> ";
815 foreach my $c (split(//,$rep))
817 printf $fh "\\x%02X",ord($c);
822 output_ucm_page(\@cmap,$h,$h,0);
823 print $fh "#\nCHARMAP\n";
824 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
828 print $fh "END CHARMAP\n";
833 eval { require Encode; };
834 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
835 eval { require File::Basename; };
836 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
837 File::Basename->import();
838 my $inc = dirname($INC{"Encode/Internal.pm"});
840 my $table_files = join(",", map {qq('$_')} @_);
841 my $now = scalar localtime();
842 open my $fh, ">Makefile.PL" or die "$!";
843 print $fh <<"END_OF_HEADER";
845 # This file is auto-generated by:
851 use ExtUtils::MakeMaker;
853 # Please edit the following to the taste!
856 encode_t => [ $table_files ],
859 # And leave the rest!
865 print $fh <<'END_OF_MAKEFILE_PL';
866 NAME => 'Encode::'.$name,
867 VERSION_FROM => "$name.pm",
868 OBJECT => '$(O_FILES)',
870 COMPRESS => 'gzip -9f',
872 DIST_DEFAULT => 'all tardist',
875 # OS 390 winges about line numbers > 64K ???
876 XSOPT => '-nolinenumbers',
885 my $x = $self->{'OBJ_EXT'};
886 # Add the table O_FILES
887 foreach my $e (keys %tables)
892 $self->{'O_FILES'} = [sort keys %o];
893 my @files = ("$name.xs");
894 $self->{'C'} = ["$name.c"];
895 # $self->{'H'} = [$self->catfile($self->updir,'encode.h')];
897 foreach my $table (keys %tables) {
898 push (@{$self->{'C'}},"$table.c");
899 # Do NOT add $table.h etc. to H_FILES unless we own up as to how they
901 foreach my $ext (qw($(OBJ_EXT) .c .h _def.h .fnm)) {
902 push (@files,$table.$ext);
905 $self->{'XS'} = { "$name.xs" => "$name.c" };
906 $self->{'clean'}{'FILES'} .= join(' ',@files);
907 open(XS,">$name.xs") || die "Cannot open $name.xs:$!";
915 foreach my $table (keys %tables) {
916 print XS qq[#include "${table}.h"\n];
921 Encode_XSEncoding(pTHX_ encode_t *enc)
924 HV *stash = gv_stashpv("Encode::XS", TRUE);
925 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
931 const char *name = enc->name[i++];
932 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
935 call_pv("Encode::define_encoding",G_DISCARD);
939 MODULE = Encode::$name PACKAGE = Encode::$name
944 foreach my $table (keys %tables) {
945 print XS qq[#include "${table}_def.h"\n];
949 return "# Built $name.xs\n\n";
955 my $dir = "."; # $self->catdir('Encode');
956 my $str = "# $name\$(OBJ_EXT) depends on .h and _def.h files not .c files - but all written by enc2xs\n";
957 $str .= "$name.c : $name.xs ";
958 foreach my $table (keys %tables)
963 $str .= "$name\$(OBJ_EXT) : $name.c\n\n";
965 foreach my $table (keys %tables)
968 my $lengthsofar = length($str);
969 my $continuator = '';
970 $str .= "$table.c : Makefile.PL";
971 foreach my $file (@{$tables{$table}})
973 $str .= $continuator.' '.$self->catfile($dir,$file);
974 if ( length($str)-$lengthsofar > 128*$numlines )
976 $continuator .= " \\\n\t";
982 $str .= $^O eq 'VMS' # In VMS quote to preserve case
983 ? qq{\n\t\$(PERL) $enc2xs -"Q" -"O" -o \$\@ -f $table.fnm\n\n}
984 : qq{\n\t\$(PERL) $enc2xs -Q -O -o \$\@ -f $table.fnm\n\n};
985 open (FILELIST, ">$table.fnm")
986 || die "Could not open $table.fnm: $!";
987 foreach my $file (@{$tables{$table}})
989 print FILELIST $self->catfile($dir,$file) . "\n";
997 (my $pm =<<"END_OF_PM") =~ s/^# //gm;
998 # package Encode::$name;
999 # our \$VERSION = "0.01";
1003 # XSLoader::load('Encode::$name', \$VERSION);
1010 # Encode::$name - New Encoding
1014 # You got to fill this in!
1022 open $fh, ">$name.pm" or die "$name.pm:$!";
1025 -d 't' or mkdir 't', 0755 or die "mkdir t:$!";
1026 open $fh, ">t/$name.t" or die "t/$name.t:$!";
1027 print $fh <<"END_OF_TEST";
1029 # Adjust the number here!
1030 use Test::More tests => 2;
1033 use_ok('Encode::$name');
1034 # Add more test here!
1044 enc2xs -- Perl Encode Module Generator
1048 enc2xs -M ModName mapfiles...
1053 F<enc2xs> builds a Perl extension for use by Encode from either
1054 Unicode Character Mapping files (.ucm) or Tcl Encoding Files
1055 (.enc) Besides internally used during the build process of Encode
1056 module, you can use F<enc2xs> to add your own encoding to perl. No
1057 knowledge on XS is necessary.
1061 If what you want to know as little about Perl possible but needs to
1062 add a new encoding, just read this chapter and forget the rest.
1068 Have a .ucm file ready. You can get it from somewhere or you can
1069 write your own from scratch or you can grab one from Encode
1070 distribution and customize. For UCM format, see the next Chapter.
1071 In the example below, I'll call my theoretical encoding myascii,
1072 defined inI<my.ucm>. C<$> is a shell prompt.
1079 Issue a command as follows;
1081 $ enc2xs -M My my.ucm
1083 Now take a look at your current directory. It should look like this.
1086 Makefile.PL My.pm my.ucm t/
1088 The following files are created.
1090 Makefle.PL - MakeMaker script
1091 My.pm - Encode Submodule
1096 Edit the files generated. You don't have to if you have no time AND no
1097 intention to give it to someone else. But it is a good idea to edit
1098 pod and add more tests.
1102 Now issue a command all Perl Mongers love;
1104 $ perl5.7.3 Makefile.PL
1105 Writing Makefile for Encode::My
1109 Now all you have to do is make.
1112 cp My.pm blib/lib/Encode/My.pm
1113 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1114 -o encode_t.c -f encode_t.fnm
1115 Reading myascii (myascii)
1116 Writing compiled form
1117 128 bytes in string tables
1118 384 bytes (25%) saved spotting duplicates
1119 1 bytes (99.2%) saved using substrings
1121 chmod 644 blib/arch/auto/Encode/My/My.bs
1124 The time it takes varies how fast your machine is and how large your
1125 encoding is. Unless you are working on something big like euc-tw, it
1126 won't take too long.
1130 You can "make install" already but you should test first.
1133 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1134 -e 'use Test::Harness qw(&runtests $verbose); \
1135 $verbose=0; runtests @ARGV;' t/*.t
1137 All tests successful.
1138 Files=1, Tests=2, 0 wallclock secs
1139 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1143 If you are content with the test result, just "make install"
1147 =head1 The Unicode Character Map
1149 Encode uses The Unicode Character Map (UCM) for source character
1150 mappings. This format is used by ICU package of IBM and adopted by
1151 Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map
1152 and far more user-friendly, This is the recommended formet for
1155 UCM file looks like this.
1160 <code_set_name> "US-ascii" # Required
1161 <code_set_alias> "ascii" # Optional
1162 <mb_cur_min> 1 # Required; usually 1
1163 <mb_cur_max> 1 # Max. # of bytes/char
1164 <subchar> \x3F # Substitution char
1167 <U0000> \x00 |0 # <control>
1168 <U0001> \x01 |0 # <control>
1169 <U0002> \x02 |0 # <control>
1171 <U007C> \x7C |0 # VERTICAL LINE
1172 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1173 <U007E> \x7E |0 # TILDE
1174 <U007F> \x7F |0 # <control>
1181 Anything that follows C<#> is treated as comments.
1185 The header section continues until CHARMAP. This section Has a form of
1186 I<E<lt>keywordE<gt> value>, one at a line. For a value, strings must
1187 be quoted. Barewords are treated as numbers. I<\xXX> represents a
1190 Most of the keywords are self-explanatory. I<subchar> means
1191 substitution character, not subcharacter. When you decode a Unicode
1192 sequence to this encoding but no matching character is found, the byte
1193 sequence defined here will be used. For most cases, the value here is
1194 \x3F, in ASCII this is a question mark.
1198 CHARMAP starts the character map section. Each line has a form as
1201 <UXXXX> \xXX.. |0 # comment
1203 | | +- Fallback flag
1204 | +-------- Encoded byte sequence
1205 +-------------- Unicode Character ID in hex
1207 The format is roughly the same as a header section except for fallback
1208 flag. It is | followed by 0..3. And their meaning as follows
1214 Round trip safe. A character decoded to Unicode encodes back to the
1215 same byte sequence. most character belong to this.
1219 Fallback for unicode -> encoding. When seen, enc2xs adds this
1220 character for encode map only
1224 Skip sub-char mapping should there be no code point.
1228 Fallback for encoding -> unicode. When seen, enc2xs adds this
1229 character for decode map only
1235 And finally, END OF CHARMAP ends the section.
1239 Needless to say, if you are manually creating a UCM file, you should
1240 copy ascii.ucm or existing encoding which is close to yours than write
1241 your own from scratch.
1243 When you do so, make sure you leave at least B<U0000> to B<U0020> as
1244 is, unless your environment is on EBCDIC.
1246 B<CAVEAT>: not all features in UCM are implemented. For example,
1247 icu:state is not used. Because of that, you need to write a perl
1248 module if you want to support algorithmical encodings, notablly
1249 ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1250 L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1255 L<http://oss.software.ibm.com/icu/>
1257 ICU Character Mapping Tables
1258 L<http://oss.software.ibm.com/icu/charset/>
1261 L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1271 # -Q to disable the duplicate codepoint test
1272 # -S make mapping errors fatal
1273 # -q to remove comments written to output files
1274 # -O to enable the (brute force) substring optimiser
1275 # -o <output> to specify the output file name (else it's the first arg)
1276 # -f <inlist> to give a file with a list of input files (else use the args)
1277 # -n <name> to name the encoding (else use the basename of the input file.
1279 With %seen holding array refs:
1281 865.66 real 28.80 user 8.79 sys
1282 7904 maximum resident set size
1283 1356 average shared memory size
1284 18566 average unshared data size
1285 229 average unshared stack size
1289 With %seen holding simple scalars:
1291 342.16 real 27.11 user 3.54 sys
1292 8388 maximum resident set size
1293 1394 average shared memory size
1294 14969 average unshared data size
1295 236 average unshared stack size
1299 Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1300 how %seen is storing things its seen. So it is pathalogically bad on a 16M
1301 RAM machine, but it's going to help even on modern machines.
1302 Swapping is bad, m'kay :-)