PerlIO::Via reference
[p5sagit/p5-mst-13.2.git] / ext / Encode / bin / enc2xs
CommitLineData
3ef515df 1#!./perl
67d7b5ef 2BEGIN {
a999c27c 3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
67d7b5ef 7}
8use strict;
9use Getopt::Std;
10my @orig_ARGV = @ARGV;
0ab8f81e 11our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef 12
13# These may get re-ordered.
14# RAW is a do_now as inserted by &enter
15# AGG is an aggreagated do_now, as built up by &process
a999c27c 16
67d7b5ef 17use constant {
18 RAW_NEXT => 0,
19 RAW_IN_LEN => 1,
20 RAW_OUT_BYTES => 2,
21 RAW_FALLBACK => 3,
22
23 AGG_MIN_IN => 0,
24 AGG_MAX_IN => 1,
25 AGG_OUT_BYTES => 2,
26 AGG_NEXT => 3,
27 AGG_IN_LEN => 4,
28 AGG_OUT_LEN => 5,
29 AGG_FALLBACK => 6,
30};
a999c27c 31
67d7b5ef 32# (See the algorithm in encengine.c - we're building structures for it)
33
34# There are two sorts of structures.
35# "do_now" (an array, two variants of what needs storing) is whatever we need
36# to do now we've read an input byte.
37# It's housed in a "do_next" (which is how we got to it), and in turn points
38# to a "do_next" which contains all the "do_now"s for the next input byte.
39
40# There will be a "do_next" which is the start state.
41# For a single byte encoding it's the only "do_next" - each "do_now" points
42# back to it, and each "do_now" will cause bytes. There is no state.
43
44# For a multi-byte encoding where all characters in the input are the same
45# length, then there will be a tree of "do_now"->"do_next"->"do_now"
46# branching out from the start state, one step for each input byte.
47# The leaf "do_now"s will all be at the same distance from the start state,
48# only the leaf "do_now"s cause output bytes, and they in turn point back to
49# the start state.
50
51# For an encoding where there are varaible length input byte sequences, you
52# will encounter a leaf "do_now" sooner for the shorter input sequences, but
53# as before the leaves will point back to the start state.
54
55# The system will cope with escape encodings (imagine them as a mostly
56# self-contained tree for each escape state, and cross links between trees
57# at the state-switching characters) but so far no input format defines these.
58
59# The system will also cope with having output "leaves" in the middle of
60# the bifurcating branches, not just at the extremities, but again no
61# input format does this yet.
62
63# There are two variants of the "do_now" structure. The first, smaller variant
64# is generated by &enter as the input file is read. There is one structure
65# for each input byte. Say we are mapping a single byte encoding to a
66# single byte encoding, with "ABCD" going "abcd". There will be
67# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
68
69# &process then walks the tree, building aggregate "do_now" structres for
70# adjacent bytes where possible. The aggregate is for a contiguous range of
71# bytes which each produce the same length of output, each move to the
72# same next state, and each have the same fallback flag.
73# So our 4 RAW "do_now"s above become replaced by a single structure
74# containing:
75# ["A", "D", "abcd", 1, ...]
76# ie, for an input byte $_ in "A".."D", output 1 byte, found as
77# substr ("abcd", (ord $_ - ord "A") * 1, 1)
78# which maps very nicely into pointer arithmetic in C for encengine.c
79
80sub encode_U
81{
82 # UTF-8 encode long hand - only covers part of perl's range
83 ## my $uv = shift;
84 # chr() works in native space so convert value from table
85 # into that space before using chr().
86 my $ch = chr(utf8::unicode_to_native($_[0]));
87 # Now get core perl to encode that the way it likes.
88 utf8::encode($ch);
89 return $ch;
90}
91
92sub encode_S
93{
94 # encode single byte
95 ## my ($ch,$page) = @_; return chr($ch);
96 return chr $_[0];
97}
98
99sub encode_D
100{
101 # encode double byte MS byte first
102 ## my ($ch,$page) = @_; return chr($page).chr($ch);
103 return chr ($_[1]) . chr $_[0];
104}
105
106sub encode_M
107{
108 # encode Multi-byte - single for 0..255 otherwise double
109 ## my ($ch,$page) = @_;
110 ## return &encode_D if $page;
111 ## return &encode_S;
112 return chr ($_[1]) . chr $_[0] if $_[1];
113 return chr $_[0];
114}
115
116my %encode_types = (U => \&encode_U,
117 S => \&encode_S,
118 D => \&encode_D,
119 M => \&encode_M,
120 );
121
122# Win32 does not expand globs on command line
123eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
124
125my %opt;
126# I think these are:
127# -Q to disable the duplicate codepoint test
128# -S make mapping errors fatal
129# -q to remove comments written to output files
130# -O to enable the (brute force) substring optimiser
131# -o <output> to specify the output file name (else it's the first arg)
132# -f <inlist> to give a file with a list of input files (else use the args)
133# -n <name> to name the encoding (else use the basename of the input file.
aae85ceb 134getopts('CM:SQqOo:f:n:',\%opt);
67d7b5ef 135
136$opt{M} and make_makefile_pl($opt{M}, @ARGV);
aae85ceb 137$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
67d7b5ef 138
139# This really should go first, else the die here causes empty (non-erroneous)
140# output files to be written.
141my @encfiles;
142if (exists $opt{'f'}) {
143 # -F is followed by name of file containing list of filenames
144 my $flist = $opt{'f'};
145 open(FLIST,$flist) || die "Cannot open $flist:$!";
146 chomp(@encfiles = <FLIST>);
147 close(FLIST);
148} else {
149 @encfiles = @ARGV;
150}
151
152my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
153chmod(0666,$cname) if -f $cname && !-w $cname;
154open(C,">$cname") || die "Cannot open $cname:$!";
155
156my $dname = $cname;
157my $hname = $cname;
158
159my ($doC,$doEnc,$doUcm,$doPet);
160
161if ($cname =~ /\.(c|xs)$/)
162 {
163 $doC = 1;
e7cbefb8 164 $dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef 165 chmod(0666,$dname) if -f $cname && !-w $dname;
166 open(D,">$dname") || die "Cannot open $dname:$!";
167 $hname =~ s/(\.[^\.]*)?$/.h/;
168 chmod(0666,$hname) if -f $cname && !-w $hname;
169 open(H,">$hname") || die "Cannot open $hname:$!";
170
171 foreach my $fh (\*C,\*D,\*H)
172 {
173 print $fh <<"END" unless $opt{'q'};
174/*
175 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
176 This file was autogenerated by:
177 $^X $0 @orig_ARGV
178*/
179END
180 }
181
182 if ($cname =~ /(\w+)\.xs$/)
183 {
184 print C "#include <EXTERN.h>\n";
185 print C "#include <perl.h>\n";
186 print C "#include <XSUB.h>\n";
187 print C "#define U8 U8\n";
188 }
189 print C "#include \"encode.h\"\n";
190
191 }
192elsif ($cname =~ /\.enc$/)
193 {
194 $doEnc = 1;
195 }
196elsif ($cname =~ /\.ucm$/)
197 {
198 $doUcm = 1;
199 }
200elsif ($cname =~ /\.pet$/)
201 {
202 $doPet = 1;
203 }
204
205my %encoding;
206my %strings;
207my $saved = 0;
208my $subsave = 0;
209my $strings = 0;
210
211sub cmp_name
212{
213 if ($a =~ /^.*-(\d+)/)
214 {
215 my $an = $1;
216 if ($b =~ /^.*-(\d+)/)
217 {
218 my $r = $an <=> $1;
219 return $r if $r;
220 }
221 }
222 return $a cmp $b;
223}
224
225
226foreach my $enc (sort cmp_name @encfiles)
227 {
228 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
229 $name = $opt{'n'} if exists $opt{'n'};
230 if (open(E,$enc))
231 {
232 if ($sfx eq 'enc')
233 {
234 compile_enc(\*E,lc($name));
235 }
236 else
237 {
238 compile_ucm(\*E,lc($name));
239 }
240 }
241 else
242 {
243 warn "Cannot open $enc for $name:$!";
244 }
245 }
246
247if ($doC)
248 {
249 print STDERR "Writing compiled form\n";
250 foreach my $name (sort cmp_name keys %encoding)
251 {
252 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
253 output(\*C,$name.'_utf8',$e2u);
254 output(\*C,'utf8_'.$name,$u2e);
b2704119 255 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
67d7b5ef 256 }
257 foreach my $enc (sort cmp_name keys %encoding)
258 {
b2704119 259 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
260 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
261 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
262 my $replen = 0;
263 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
85982a32 264 my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
67d7b5ef 265 my $sym = "${enc}_encoding";
266 $sym =~ s/\W+/_/g;
267 print C "encode_t $sym = \n";
268 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
269 }
270
271 foreach my $enc (sort cmp_name keys %encoding)
272 {
273 my $sym = "${enc}_encoding";
274 $sym =~ s/\W+/_/g;
275 print H "extern encode_t $sym;\n";
276 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
277 }
278
279 if ($cname =~ /(\w+)\.xs$/)
280 {
281 my $mod = $1;
282 print C <<'END';
283
284static void
285Encode_XSEncoding(pTHX_ encode_t *enc)
286{
287 dSP;
288 HV *stash = gv_stashpv("Encode::XS", TRUE);
289 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
290 int i = 0;
291 PUSHMARK(sp);
292 XPUSHs(sv);
293 while (enc->name[i])
294 {
295 const char *name = enc->name[i++];
296 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
297 }
298 PUTBACK;
299 call_pv("Encode::define_encoding",G_DISCARD);
300 SvREFCNT_dec(sv);
301}
302
303END
304
305 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
306 print C "BOOT:\n{\n";
307 print C "#include \"$dname\"\n";
308 print C "}\n";
309 }
310 # Close in void context is bad, m'kay
311 close(D) or warn "Error closing '$dname': $!";
312 close(H) or warn "Error closing '$hname': $!";
313
314 my $perc_saved = $strings/($strings + $saved) * 100;
315 my $perc_subsaved = $strings/($strings + $subsave) * 100;
316 printf STDERR "%d bytes in string tables\n",$strings;
317 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
318 $saved, $perc_saved if $saved;
319 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
320 $subsave, $perc_subsaved if $subsave;
321 }
322elsif ($doEnc)
323 {
324 foreach my $name (sort cmp_name keys %encoding)
325 {
326 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
327 output_enc(\*C,$name,$e2u);
328 }
329 }
330elsif ($doUcm)
331 {
332 foreach my $name (sort cmp_name keys %encoding)
333 {
334 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
335 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
336 }
337 }
338
339# writing half meg files and then not checking to see if you just filled the
340# disk is bad, m'kay
341close(C) or die "Error closing '$cname': $!";
342
343# End of the main program.
344
345sub compile_ucm
346{
347 my ($fh,$name) = @_;
348 my $e2u = {};
349 my $u2e = {};
350 my $cs;
351 my %attr;
352 while (<$fh>)
353 {
354 s/#.*$//;
355 last if /^\s*CHARMAP\s*$/i;
356 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
357 {
358 $attr{$1} = $2;
359 }
360 }
361 if (!defined($cs = $attr{'code_set_name'}))
362 {
363 warn "No <code_set_name> in $name\n";
364 }
365 else
366 {
367 $name = $cs unless exists $opt{'n'};
368 }
369 my $erep;
370 my $urep;
371 my $max_el;
372 my $min_el;
373 if (exists $attr{'subchar'})
374 {
b2704119 375 #my @byte;
376 #$attr{'subchar'} =~ /^\s*/cg;
377 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
378 #$erep = join('',map(chr(hex($_)),@byte));
379 $erep = $attr{'subchar'};
380 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
67d7b5ef 381 }
382 print "Reading $name ($cs)\n";
383 my $nfb = 0;
384 my $hfb = 0;
385 while (<$fh>)
386 {
387 s/#.*$//;
388 last if /^\s*END\s+CHARMAP\s*$/i;
389 next if /^\s*$/;
a999c27c 390 my (@uni, @byte) = ();
391 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
392 or die "Bad line: $_";
393 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
394 push @uni, map { substr($_, 1) } split(/\+/, $1);
395 }
396 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
397 push @byte, $1;
398 }
399 if (@uni)
67d7b5ef 400 {
a999c27c 401 my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef 402 my $ech = join('',map(chr(hex($_)),@byte));
403 my $el = length($ech);
404 $max_el = $el if (!defined($max_el) || $el > $max_el);
405 $min_el = $el if (!defined($min_el) || $el < $min_el);
406 if (length($fb))
407 {
408 $fb = substr($fb,1);
409 $hfb++;
410 }
411 else
412 {
413 $nfb++;
414 $fb = '0';
415 }
416 # $fb is fallback flag
417 # 0 - round trip safe
418 # 1 - fallback for unicode -> enc
419 # 2 - skip sub-char mapping
420 # 3 - fallback enc -> unicode
421 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
422 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
423 }
424 else
425 {
426 warn $_;
427 }
428 }
429 if ($nfb && $hfb)
430 {
431 die "$nfb entries without fallback, $hfb entries with\n";
432 }
433 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
434}
435
436
437
438sub compile_enc
439{
440 my ($fh,$name) = @_;
441 my $e2u = {};
442 my $u2e = {};
443
444 my $type;
445 while ($type = <$fh>)
446 {
447 last if $type !~ /^\s*#/;
448 }
449 chomp($type);
450 return if $type eq 'E';
451 # Do the hash lookup once, rather than once per function call. 4% speedup.
452 my $type_func = $encode_types{$type};
453 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
454 warn "$type encoded $name\n";
455 my $rep = '';
456 # Save a defined test by setting these to defined values.
457 my $min_el = ~0; # A very big integer
458 my $max_el = 0; # Anything must be longer than 0
459 {
460 my $v = hex($def);
461 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
462 }
463 my $errors;
464 my $seen;
465 # use -Q to silence the seen test. Makefile.PL uses this by default.
466 $seen = {} unless $opt{Q};
467 do
468 {
469 my $line = <$fh>;
470 chomp($line);
471 my $page = hex($line);
472 my $ch = 0;
473 my $i = 16;
474 do
475 {
476 # So why is it 1% faster to leave the my here?
477 my $line = <$fh>;
478 $line =~ s/\r\n$/\n/;
479 die "$.:${line}Line should be exactly 65 characters long including
480 newline (".length($line).")" unless length ($line) == 65;
481 # Split line into groups of 4 hex digits, convert groups to ints
482 # This takes 65.35
483 # map {hex $_} $line =~ /(....)/g
484 # This takes 63.75 (2.5% less time)
485 # unpack "n*", pack "H*", $line
486 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
487 # Doing it as while ($line =~ /(....)/g) took 74.63
488 foreach my $val (unpack "n*", pack "H*", $line)
489 {
490 next if $val == 0xFFFD;
491 my $ech = &$type_func($ch,$page);
492 if ($val || (!$ch && !$page))
493 {
494 my $el = length($ech);
495 $max_el = $el if $el > $max_el;
496 $min_el = $el if $el < $min_el;
497 my $uch = encode_U($val);
498 if ($seen) {
499 # We're doing the test.
500 # We don't need to read this quickly, so storing it as a scalar,
501 # rather than 3 (anon array, plus the 2 scalars it holds) saves
502 # RAM and may make us faster on low RAM systems. [see __END__]
503 if (exists $seen->{$uch})
504 {
505 warn sprintf("U%04X is %02X%02X and %04X\n",
506 $val,$page,$ch,$seen->{$uch});
507 $errors++;
508 }
509 else
510 {
511 $seen->{$uch} = $page << 8 | $ch;
512 }
513 }
514 # Passing 2 extra args each time is 3.6% slower!
515 # Even with having to add $fallback ||= 0 later
516 enter_fb0($e2u,$ech,$uch);
517 enter_fb0($u2e,$uch,$ech);
518 }
519 else
520 {
521 # No character at this position
522 # enter($e2u,$ech,undef,$e2u);
523 }
524 $ch++;
525 }
526 } while --$i;
527 } while --$pages;
528 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
529 if $min_el > $max_el;
530 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
531 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
532}
533
534# my ($a,$s,$d,$t,$fb) = @_;
535sub enter {
536 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
537 # state we shift to after this (multibyte) input character defaults to same
538 # as current state.
539 $next ||= $current;
540 # Making sure it is defined seems to be faster than {no warnings;} in
541 # &process, or passing it in as 0 explicity.
542 # XXX $fallback ||= 0;
543
544 # Start at the beginning and work forwards through the string to zero.
545 # effectively we are removing 1 character from the front each time
546 # but we don't actually edit the string. [this alone seems to be 14% speedup]
547 # Hence -$pos is the length of the remaining string.
548 my $pos = -length $inbytes;
549 while (1) {
550 my $byte = substr $inbytes, $pos, 1;
551 # RAW_NEXT => 0,
552 # RAW_IN_LEN => 1,
553 # RAW_OUT_BYTES => 2,
554 # RAW_FALLBACK => 3,
555 # to unicode an array would seem to be better, because the pages are dense.
556 # from unicode can be very sparse, favouring a hash.
557 # hash using the bytes (all length 1) as keys rather than ord value,
558 # as it's easier to sort these in &process.
559
560 # It's faster to always add $fallback even if it's undef, rather than
561 # choosing between 3 and 4 element array. (hence why we set it defined
562 # above)
563 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
564 # When $pos was -1 we were at the last input character.
565 unless (++$pos) {
566 $do_now->[RAW_OUT_BYTES] = $outbytes;
567 $do_now->[RAW_NEXT] = $next;
568 return;
569 }
570 # Tail recursion. The intermdiate state may not have a name yet.
571 $current = $do_now->[RAW_NEXT];
572 }
573}
574
575# This is purely for optimistation. It's just &enter hard coded for $fallback
576# of 0, using only a 3 entry array ref to save memory for every entry.
577sub enter_fb0 {
578 my ($current,$inbytes,$outbytes,$next) = @_;
579 $next ||= $current;
580
581 my $pos = -length $inbytes;
582 while (1) {
583 my $byte = substr $inbytes, $pos, 1;
584 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
585 unless (++$pos) {
586 $do_now->[RAW_OUT_BYTES] = $outbytes;
587 $do_now->[RAW_NEXT] = $next;
588 return;
589 }
590 $current = $do_now->[RAW_NEXT];
591 }
592}
593
594
595sub outstring
596{
597 my ($fh,$name,$s) = @_;
598 my $sym = $strings{$s};
599 if ($sym)
600 {
601 $saved += length($s);
602 }
603 else
604 {
605 if ($opt{'O'}) {
606 foreach my $o (keys %strings)
607 {
608 next unless (my $i = index($o,$s)) >= 0;
609 $sym = $strings{$o};
610 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
611 # a hexadecimal floating point constant. Silly gcc. Only p
612 # introduces a floating point constant. Put the space in to stop it
613 # getting confused.
614 $sym .= sprintf(" +0x%02x",$i) if ($i);
615 $subsave += length($s);
616 return $strings{$s} = $sym;
617 }
618 }
619 $strings{$s} = $sym = $name;
620 $strings += length($s);
621 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
622 # Maybe we should assert that these are all <256.
623 $definition .= join(',',unpack "C*",$s);
624 # We have a single long line. Split it at convenient commas.
625 $definition =~ s/(.{74,77},)/$1\n/g;
626 print $fh "$definition };\n\n";
627 }
628 return $sym;
629}
630
631sub process
632{
633 my ($name,$a) = @_;
634 $name =~ s/\W+/_/g;
635 $a->{Cname} = $name;
636 my $raw = $a->{Raw};
637 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
638 my @ent;
639 $agg_max_in = 0;
640 foreach my $key (sort keys %$raw) {
641 # RAW_NEXT => 0,
642 # RAW_IN_LEN => 1,
643 # RAW_OUT_BYTES => 2,
644 # RAW_FALLBACK => 3,
645 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
646 # Now we are converting from raw to aggregate, switch from 1 byte strings
647 # to numbers
648 my $b = ord $key;
649 $fallback ||= 0;
650 if ($l &&
651 # If this == fails, we're going to reset $agg_max_in below anyway.
652 $b == ++$agg_max_in &&
653 # References in numeric context give the pointer as an int.
654 $agg_next == $next &&
655 $agg_in_len == $in_len &&
656 $agg_out_len == length $out_bytes &&
657 $agg_fallback == $fallback
658 # && length($l->[AGG_OUT_BYTES]) < 16
659 ) {
660 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
661 # we can aggregate this byte onto the end.
662 $l->[AGG_MAX_IN] = $b;
663 $l->[AGG_OUT_BYTES] .= $out_bytes;
664 } else {
665 # AGG_MIN_IN => 0,
666 # AGG_MAX_IN => 1,
667 # AGG_OUT_BYTES => 2,
668 # AGG_NEXT => 3,
669 # AGG_IN_LEN => 4,
670 # AGG_OUT_LEN => 5,
671 # AGG_FALLBACK => 6,
672 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
673 # (only gains .6% on euc-jp -- is it worth it?)
674 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
675 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
676 $agg_fallback = $fallback];
677 }
678 if (exists $next->{Cname}) {
679 $next->{'Forward'} = 1 if $next != $a;
680 } else {
681 process(sprintf("%s_%02x",$name,$b),$next);
682 }
683 }
684 # encengine.c rules say that last entry must be for 255
685 if ($agg_max_in < 255) {
686 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
687 }
688 $a->{'Entries'} = \@ent;
689}
690
691sub outtable
692{
693 my ($fh,$a) = @_;
694 my $name = $a->{'Cname'};
695 # String tables
696 foreach my $b (@{$a->{'Entries'}})
697 {
698 next unless $b->[AGG_OUT_LEN];
699 my $s = $b->[AGG_MIN_IN];
700 my $e = $b->[AGG_MAX_IN];
701 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
702 }
703 if ($a->{'Forward'})
704 {
705 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
706 }
707 $a->{'Done'} = 1;
708 foreach my $b (@{$a->{'Entries'}})
709 {
710 my ($s,$e,$out,$t,$end,$l) = @$b;
711 outtable($fh,$t) unless $t->{'Done'};
712 }
713 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
714 foreach my $b (@{$a->{'Entries'}})
715 {
716 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
717 $end |= 0x80 if $fb;
718 print $fh "{";
719 if ($l)
720 {
721 printf $fh outstring($fh,'',$out);
722 }
723 else
724 {
725 print $fh "0";
726 }
727 print $fh ",",$t->{Cname};
728 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
729 }
730 print $fh "};\n";
731}
732
733sub output
734{
735 my ($fh,$name,$a) = @_;
736 process($name,$a);
737 # Sub-tables
738 outtable($fh,$a);
739}
740
741sub output_enc
742{
743 my ($fh,$name,$a) = @_;
744 die "Changed - fix me for new structure";
745 foreach my $b (sort keys %$a)
746 {
747 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
748 }
749}
750
751sub decode_U
752{
753 my $s = shift;
754}
755
756my @uname;
757sub char_names
758{
759 my $s = do "unicore/Name.pl";
760 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
761 pos($s) = 0;
762 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
763 {
764 my $name = $3;
765 my $s = hex($1);
766 last if $s >= 0x10000;
767 my $e = length($2) ? hex($2) : $s;
768 for (my $i = $s; $i <= $e; $i++)
769 {
770 $uname[$i] = $name;
771# print sprintf("U%04X $name\n",$i);
772 }
773 }
774}
775
776sub output_ucm_page
777{
778 my ($cmap,$a,$t,$pre) = @_;
779 # warn sprintf("Page %x\n",$pre);
780 my $raw = $t->{Raw};
781 foreach my $key (sort keys %$raw) {
782 # RAW_NEXT => 0,
783 # RAW_IN_LEN => 1,
784 # RAW_OUT_BYTES => 2,
785 # RAW_FALLBACK => 3,
786 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
787 my $u = ord $key;
788 $fallback ||= 0;
789
790 if ($next != $a && $next != $t) {
791 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
792 } elsif (length $out_bytes) {
793 if ($pre) {
794 $u = $pre|($u &0x3f);
795 }
796 my $s = sprintf "<U%04X> ",$u;
797 #foreach my $c (split(//,$out_bytes)) {
798 # $s .= sprintf "\\x%02X",ord($c);
799 #}
800 # 9.5% faster changing that loop to this:
801 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
802 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
803 push(@$cmap,$s);
804 } else {
805 warn join(',',$u, @{$raw->{$key}},$a,$t);
806 }
807 }
808}
809
810sub output_ucm
811{
812 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
813 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
814 print $fh "<code_set_name> \"$name\"\n";
815 char_names();
816 if (defined $min_el)
817 {
818 print $fh "<mb_cur_min> $min_el\n";
819 }
820 if (defined $max_el)
821 {
822 print $fh "<mb_cur_max> $max_el\n";
823 }
824 if (defined $rep)
825 {
826 print $fh "<subchar> ";
827 foreach my $c (split(//,$rep))
828 {
829 printf $fh "\\x%02X",ord($c);
830 }
831 print $fh "\n";
832 }
833 my @cmap;
834 output_ucm_page(\@cmap,$h,$h,0);
835 print $fh "#\nCHARMAP\n";
836 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
837 {
838 print $fh $line;
839 }
840 print $fh "END CHARMAP\n";
841}
842
3ef515df 843use vars qw(
844 $_Enc2xs
845 $_Version
846 $_Inc
b2704119 847 $_E2X
3ef515df 848 $_Name
849 $_TableFiles
850 $_Now
851);
852
b2704119 853sub find_e2x{
854 eval { require File::Find };
855 my (@inc, %e2x_dir);
856 for my $inc (@INC){
857 push @inc, $inc unless $inc eq '.'; #skip current dir
858 }
859 File::Find::find(
860 sub {
861 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
862 $atime,$mtime,$ctime,$blksize,$blocks)
863 = lstat($_) or return;
864 -f _ or return;
865 if (/^.*\.e2x$/o){
866 $e2x_dir{$File::Find::dir} ||= $mtime;
867 }
868 return;
869 }, @inc);
870 warn join("\n", keys %e2x_dir), "\n";
871 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
872 $_E2X = $d;
873 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
874 return $_E2X;
875 }
876}
877
67d7b5ef 878sub make_makefile_pl
879{
880 eval { require Encode; };
881 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
3ef515df 882 # our used for variable expanstion
883 $_Enc2xs = $0;
884 $_Version = $VERSION;
b2704119 885 $_E2X = find_e2x();
3ef515df 886 $_Name = shift;
887 $_TableFiles = join(",", map {qq('$_')} @_);
888 $_Now = scalar localtime();
b2704119 889
aae85ceb 890 eval { require File::Spec; };
3ef515df 891 warn "Generating Makefile.PL\n";
b2704119 892 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
3ef515df 893 warn "Generating $_Name.pm\n";
b2704119 894 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
3ef515df 895 warn "Generating t/$_Name.t\n";
b2704119 896 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
3ef515df 897 warn "Generating README\n";
b2704119 898 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
3ef515df 899 warn "Generating t/$_Name.t\n";
b2704119 900 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
3ef515df 901 exit;
902}
903
aae85ceb 904use vars qw(
905 $_ModLines
906 $_LocalVer
907 );
908
909sub make_configlocal_pm
910{
911 eval { require Encode; };
912 $@ and die "Unable to require Encode: $@\n";
913 eval { require File::Spec; };
914 # our used for variable expanstion
915 my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
916 my %LocalMod = ();
917 for my $d (@INC){
918 my $inc = File::Spec->catfile($d, "Encode");
919 -d $inc or next;
920 opendir my $dh, $inc or die "$inc:$!";
921 warn "Checking $inc...\n";
922 for my $f (grep /\.pm$/o, readdir($dh)){
923 -f File::Spec->catfile($inc, "$f") or next;
924 $INC{"Encode/$f"} and next;
925 warn "require Encode/$f;\n";
926 eval { require "Encode/$f"; };
927 $@ and die "Can't require Encode/$f: $@\n";
928 for my $enc (Encode->encodings()){
929 $in_core{$enc} and next;
930 $Encode::Config::ExtModule{$enc} and next;
931 my $mod = "Encode/$f";
932 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
b2704119 933 $LocalMod{$enc} ||= $mod;
aae85ceb 934 }
935 }
936 }
937 $_ModLines = "";
938 for my $enc (sort keys %LocalMod){
939 $_ModLines .=
940 qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
941 }
b2704119 942 warn $_ModLines;
aae85ceb 943 $_LocalVer = _mkversion();
b2704119 944 $_E2X = find_e2x();
aae85ceb 945 $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
b2704119 946 warn "Writing ", File::Spec->catfile($_Inc,"ConfigLocal.pm"), "\n";
947 _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
aae85ceb 948 File::Spec->catfile($_Inc,"ConfigLocal.pm"));
949 exit;
950}
951
952sub _mkversion{
953 my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
954 $yyyy += 1900, $mo +=1;
955 return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
956}
957
3ef515df 958sub _print_expand{
67d7b5ef 959 eval { require File::Basename; };
960 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
961 File::Basename->import();
3ef515df 962 my ($src, $dst) = @_;
963 open my $in, $src or die "$src : $!";
964 if ((my $d = dirname($dst)) ne '.'){
965 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
966 }
967 open my $out, ">$dst" or die "$!";
968 my $asis = 0;
969 while (<$in>){
970 if (/^#### END_OF_HEADER/){
971 $asis = 1; next;
972 }
973 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
974 print $out $_;
67d7b5ef 975 }
67d7b5ef 976}
67d7b5ef 977__END__
978
979=head1 NAME
980
981enc2xs -- Perl Encode Module Generator
982
983=head1 SYNOPSIS
984
67d7b5ef 985 enc2xs -[options]
aae85ceb 986 enc2xs -M ModName mapfiles...
987 enc2xs -C
67d7b5ef 988
989=head1 DESCRIPTION
990
991F<enc2xs> builds a Perl extension for use by Encode from either
0ab8f81e 992Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
993Besides being used internally during the build process of the Encode
994module, you can use F<enc2xs> to add your own encoding to perl.
995No knowledge of XS is necessary.
67d7b5ef 996
997=head1 Quick Guide
998
0ab8f81e 999If you want to know as little about Perl as possible but need to
67d7b5ef 1000add a new encoding, just read this chapter and forget the rest.
1001
1002=over 4
1003
1004=item 0.
1005
0ab8f81e 1006Have a .ucm file ready. You can get it from somewhere or you can write
1007your own from scratch or you can grab one from the Encode distribution
1008and customize it. For the UCM format, see the next Chapter. In the
1009example below, I'll call my theoretical encoding myascii, defined
1010in I<my.ucm>. C<$> is a shell prompt.
67d7b5ef 1011
1012 $ ls -F
1013 my.ucm
1014
1015=item 1.
1016
1017Issue a command as follows;
1018
1019 $ enc2xs -M My my.ucm
3ef515df 1020 generating Makefile.PL
1021 generating My.pm
1022 generating README
1023 generating Changes
67d7b5ef 1024
1025Now take a look at your current directory. It should look like this.
1026
1027 $ ls -F
1028 Makefile.PL My.pm my.ucm t/
1029
0ab8f81e 1030The following files were created.
67d7b5ef 1031
0ab8f81e 1032 Makefile.PL - MakeMaker script
1033 My.pm - Encode submodule
1034 t/My.t - test file
1035
1036=over 4
67d7b5ef 1037
037b88d6 1038=item 1.1.
1039
1040If you want *.ucm installed together with the modules, do as follows;
1041
1042 $ mkdir Encode
1043 $ mv *.ucm Encode
1044 $ enc2xs -M My Encode/*ucm
1045
0ab8f81e 1046=back
1047
67d7b5ef 1048=item 2.
1049
1050Edit the files generated. You don't have to if you have no time AND no
1051intention to give it to someone else. But it is a good idea to edit
0ab8f81e 1052the pod and to add more tests.
67d7b5ef 1053
1054=item 3.
1055
0ab8f81e 1056Now issue a command all Perl Mongers love:
67d7b5ef 1057
1058 $ perl5.7.3 Makefile.PL
1059 Writing Makefile for Encode::My
1060
1061=item 4.
1062
1063Now all you have to do is make.
1064
1065 $ make
1066 cp My.pm blib/lib/Encode/My.pm
1067 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1068 -o encode_t.c -f encode_t.fnm
1069 Reading myascii (myascii)
1070 Writing compiled form
1071 128 bytes in string tables
1072 384 bytes (25%) saved spotting duplicates
1073 1 bytes (99.2%) saved using substrings
1074 ....
1075 chmod 644 blib/arch/auto/Encode/My/My.bs
1076 $
1077
0ab8f81e 1078The time it takes varies depending on how fast your machine is and
1079how large your encoding is. Unless you are working on something big
1080like euc-tw, it won't take too long.
67d7b5ef 1081
1082=item 5.
1083
1084You can "make install" already but you should test first.
1085
1086 $ make test
1087 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1088 -e 'use Test::Harness qw(&runtests $verbose); \
1089 $verbose=0; runtests @ARGV;' t/*.t
1090 t/My....ok
1091 All tests successful.
1092 Files=1, Tests=2, 0 wallclock secs
1093 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1094
1095=item 6.
1096
1097If you are content with the test result, just "make install"
1098
aae85ceb 1099=item 7.
1100
0ab8f81e 1101If you want to add your encoding to Encode's demand-loading list
aae85ceb 1102(so you don't have to "use Encode::YourEncoding"), run
1103
1104 enc2xs -C
1105
1106to update Encode::ConfigLocal, a module that controls local settings.
1107After that, "use Encode;" is enough to load your encodings on demand.
1108
67d7b5ef 1109=back
1110
1111=head1 The Unicode Character Map
1112
0ab8f81e 1113Encode uses the Unicode Character Map (UCM) format for source character
1114mappings. This format is used by IBM's ICU package and was adopted
1115by Nick Ing-Simmons for use with the Encode module. Since UCM is
1116more flexible than Tcl's Encoding Map and far more user-friendly,
1117this is the recommended formet for Encode now.
67d7b5ef 1118
0ab8f81e 1119A UCM file looks like this.
67d7b5ef 1120
1121 #
1122 # Comments
1123 #
1124 <code_set_name> "US-ascii" # Required
1125 <code_set_alias> "ascii" # Optional
1126 <mb_cur_min> 1 # Required; usually 1
1127 <mb_cur_max> 1 # Max. # of bytes/char
1128 <subchar> \x3F # Substitution char
1129 #
1130 CHARMAP
1131 <U0000> \x00 |0 # <control>
1132 <U0001> \x01 |0 # <control>
1133 <U0002> \x02 |0 # <control>
1134 ....
1135 <U007C> \x7C |0 # VERTICAL LINE
1136 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1137 <U007E> \x7E |0 # TILDE
1138 <U007F> \x7F |0 # <control>
1139 END CHARMAP
1140
1141=over 4
1142
1143=item *
1144
0ab8f81e 1145Anything that follows C<#> is treated as a comment.
67d7b5ef 1146
1147=item *
1148
0ab8f81e 1149The header section continues until a line containing the word
1150CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1151pair per line. Strings used as values must be quoted. Barewords are
1152treated as numbers. I<\xXX> represents a byte.
67d7b5ef 1153
1154Most of the keywords are self-explanatory. I<subchar> means
1155substitution character, not subcharacter. When you decode a Unicode
1156sequence to this encoding but no matching character is found, the byte
1157sequence defined here will be used. For most cases, the value here is
0ab8f81e 1158\x3F; in ASCII, this is a question mark.
67d7b5ef 1159
1160=item *
1161
1162CHARMAP starts the character map section. Each line has a form as
0ab8f81e 1163follows:
67d7b5ef 1164
1165 <UXXXX> \xXX.. |0 # comment
1166 ^ ^ ^
1167 | | +- Fallback flag
1168 | +-------- Encoded byte sequence
1169 +-------------- Unicode Character ID in hex
1170
0ab8f81e 1171The format is roughly the same as a header section except for the
1172fallback flag: | followed by 0..3. The meaning of the possible
1173values is as follows:
67d7b5ef 1174
0ab8f81e 1175=over 4
67d7b5ef 1176
1177=item |0
1178
0ab8f81e 1179Round trip safe. A character decoded to Unicode encodes back to the
1180same byte sequence. Most characters have this flag.
67d7b5ef 1181
1182=item |1
1183
1184Fallback for unicode -> encoding. When seen, enc2xs adds this
0ab8f81e 1185character for the encode map only.
67d7b5ef 1186
1187=item |2
1188
1189Skip sub-char mapping should there be no code point.
1190
1191=item |3
1192
1193Fallback for encoding -> unicode. When seen, enc2xs adds this
0ab8f81e 1194character for the decode map only.
67d7b5ef 1195
1196=back
1197
1198=item *
1199
1200And finally, END OF CHARMAP ends the section.
1201
1202=back
1203
6d1c0808 1204When you are manually creating a UCM file, you should copy ascii.ucm
0ab8f81e 1205or an existing encoding which is close to yours, rather than write
1206your own from scratch.
67d7b5ef 1207
1208When you do so, make sure you leave at least B<U0000> to B<U0020> as
0ab8f81e 1209is, unless your environment is EBCDIC.
67d7b5ef 1210
1211B<CAVEAT>: not all features in UCM are implemented. For example,
1212icu:state is not used. Because of that, you need to write a perl
0ab8f81e 1213module if you want to support algorithmical encodings, notably
1214the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
67d7b5ef 1215L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1216
6d1c0808 1217=head2 Coping with duplicate mappings
1218
1219When you create a map, you SHOULD make your mappings round-trip safe.
1220That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1221$data> stands for all characters that are marked as C<|0>. Here is
0ab8f81e 1222how to make sure:
6d1c0808 1223
0ab8f81e 1224=over 4
6d1c0808 1225
1226=item *
1227
1228Sort your map in Unicode order.
1229
1230=item *
1231
1232When you have a duplicate entry, mark either one with '|1' or '|3'.
1233
1234=item *
1235
0ab8f81e 1236And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
6d1c0808 1237
1238=back
1239
1240Here is an example from big5-eten.
1241
1242 <U2550> \xF9\xF9 |0
1243 <U2550> \xA2\xA4 |3
1244
1245Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1246this;
1247
1248 E to U U to E
1249 --------------------------------------
1250 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1251 \xA2\xA4 => U2550
1252
1253So it is round-trip safe for \xF9\xF9. But if the line above is upside
1254down, here is what happens.
1255
1256 E to U U to E
1257 --------------------------------------
1258 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1259 (\xF9\xF9 => U2550 is now overwritten!)
1260
1261The Encode package comes with F<ucmlint>, a crude but sufficient
0ab8f81e 1262utility to check the integrity of a UCM file. Check under the
1263Encode/bin directory for this.
6d1c0808 1264
1265
67d7b5ef 1266=head1 Bookmarks
1267
0ab8f81e 1268=over 4
1269
1270=item *
1271
67d7b5ef 1272ICU Home Page
1273L<http://oss.software.ibm.com/icu/>
1274
0ab8f81e 1275=item *
1276
67d7b5ef 1277ICU Character Mapping Tables
1278L<http://oss.software.ibm.com/icu/charset/>
1279
0ab8f81e 1280=item *
1281
67d7b5ef 1282ICU:Conversion Data
1283L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1284
0ab8f81e 1285=back
1286
67d7b5ef 1287=head1 SEE ALSO
1288
1289L<Encode>,
1290L<perlmod>,
1291L<perlpod>
1292
1293=cut
1294
1295# -Q to disable the duplicate codepoint test
1296# -S make mapping errors fatal
1297# -q to remove comments written to output files
1298# -O to enable the (brute force) substring optimiser
1299# -o <output> to specify the output file name (else it's the first arg)
1300# -f <inlist> to give a file with a list of input files (else use the args)
1301# -n <name> to name the encoding (else use the basename of the input file.
1302
1303With %seen holding array refs:
1304
1305 865.66 real 28.80 user 8.79 sys
1306 7904 maximum resident set size
1307 1356 average shared memory size
1308 18566 average unshared data size
1309 229 average unshared stack size
1310 46080 page reclaims
1311 33373 page faults
1312
1313With %seen holding simple scalars:
1314
1315 342.16 real 27.11 user 3.54 sys
1316 8388 maximum resident set size
1317 1394 average shared memory size
1318 14969 average unshared data size
1319 236 average unshared stack size
1320 28159 page reclaims
1321 9839 page faults
1322
1323Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1324how %seen is storing things its seen. So it is pathalogically bad on a 16M
1325RAM machine, but it's going to help even on modern machines.
1326Swapping is bad, m'kay :-)