CPAN.pm sync
[p5sagit/p5-mst-13.2.git] / ext / Encode / bin / enc2xs
CommitLineData
3ef515df 1#!./perl
67d7b5ef 2BEGIN {
a999c27c 3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
67d7b5ef 7}
8use strict;
9use Getopt::Std;
10my @orig_ARGV = @ARGV;
b2704119 11our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef 12
13# These may get re-ordered.
14# RAW is a do_now as inserted by &enter
15# AGG is an aggreagated do_now, as built up by &process
a999c27c 16
67d7b5ef 17use constant {
18 RAW_NEXT => 0,
19 RAW_IN_LEN => 1,
20 RAW_OUT_BYTES => 2,
21 RAW_FALLBACK => 3,
22
23 AGG_MIN_IN => 0,
24 AGG_MAX_IN => 1,
25 AGG_OUT_BYTES => 2,
26 AGG_NEXT => 3,
27 AGG_IN_LEN => 4,
28 AGG_OUT_LEN => 5,
29 AGG_FALLBACK => 6,
30};
a999c27c 31
67d7b5ef 32# (See the algorithm in encengine.c - we're building structures for it)
33
34# There are two sorts of structures.
35# "do_now" (an array, two variants of what needs storing) is whatever we need
36# to do now we've read an input byte.
37# It's housed in a "do_next" (which is how we got to it), and in turn points
38# to a "do_next" which contains all the "do_now"s for the next input byte.
39
40# There will be a "do_next" which is the start state.
41# For a single byte encoding it's the only "do_next" - each "do_now" points
42# back to it, and each "do_now" will cause bytes. There is no state.
43
44# For a multi-byte encoding where all characters in the input are the same
45# length, then there will be a tree of "do_now"->"do_next"->"do_now"
46# branching out from the start state, one step for each input byte.
47# The leaf "do_now"s will all be at the same distance from the start state,
48# only the leaf "do_now"s cause output bytes, and they in turn point back to
49# the start state.
50
51# For an encoding where there are varaible length input byte sequences, you
52# will encounter a leaf "do_now" sooner for the shorter input sequences, but
53# as before the leaves will point back to the start state.
54
55# The system will cope with escape encodings (imagine them as a mostly
56# self-contained tree for each escape state, and cross links between trees
57# at the state-switching characters) but so far no input format defines these.
58
59# The system will also cope with having output "leaves" in the middle of
60# the bifurcating branches, not just at the extremities, but again no
61# input format does this yet.
62
63# There are two variants of the "do_now" structure. The first, smaller variant
64# is generated by &enter as the input file is read. There is one structure
65# for each input byte. Say we are mapping a single byte encoding to a
66# single byte encoding, with "ABCD" going "abcd". There will be
67# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
68
69# &process then walks the tree, building aggregate "do_now" structres for
70# adjacent bytes where possible. The aggregate is for a contiguous range of
71# bytes which each produce the same length of output, each move to the
72# same next state, and each have the same fallback flag.
73# So our 4 RAW "do_now"s above become replaced by a single structure
74# containing:
75# ["A", "D", "abcd", 1, ...]
76# ie, for an input byte $_ in "A".."D", output 1 byte, found as
77# substr ("abcd", (ord $_ - ord "A") * 1, 1)
78# which maps very nicely into pointer arithmetic in C for encengine.c
79
80sub encode_U
81{
82 # UTF-8 encode long hand - only covers part of perl's range
83 ## my $uv = shift;
84 # chr() works in native space so convert value from table
85 # into that space before using chr().
86 my $ch = chr(utf8::unicode_to_native($_[0]));
87 # Now get core perl to encode that the way it likes.
88 utf8::encode($ch);
89 return $ch;
90}
91
92sub encode_S
93{
94 # encode single byte
95 ## my ($ch,$page) = @_; return chr($ch);
96 return chr $_[0];
97}
98
99sub encode_D
100{
101 # encode double byte MS byte first
102 ## my ($ch,$page) = @_; return chr($page).chr($ch);
103 return chr ($_[1]) . chr $_[0];
104}
105
106sub encode_M
107{
108 # encode Multi-byte - single for 0..255 otherwise double
109 ## my ($ch,$page) = @_;
110 ## return &encode_D if $page;
111 ## return &encode_S;
112 return chr ($_[1]) . chr $_[0] if $_[1];
113 return chr $_[0];
114}
115
116my %encode_types = (U => \&encode_U,
117 S => \&encode_S,
118 D => \&encode_D,
119 M => \&encode_M,
120 );
121
122# Win32 does not expand globs on command line
123eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
124
125my %opt;
126# I think these are:
127# -Q to disable the duplicate codepoint test
128# -S make mapping errors fatal
129# -q to remove comments written to output files
130# -O to enable the (brute force) substring optimiser
131# -o <output> to specify the output file name (else it's the first arg)
132# -f <inlist> to give a file with a list of input files (else use the args)
133# -n <name> to name the encoding (else use the basename of the input file.
aae85ceb 134getopts('CM:SQqOo:f:n:',\%opt);
67d7b5ef 135
136$opt{M} and make_makefile_pl($opt{M}, @ARGV);
aae85ceb 137$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
67d7b5ef 138
139# This really should go first, else the die here causes empty (non-erroneous)
140# output files to be written.
141my @encfiles;
142if (exists $opt{'f'}) {
143 # -F is followed by name of file containing list of filenames
144 my $flist = $opt{'f'};
145 open(FLIST,$flist) || die "Cannot open $flist:$!";
146 chomp(@encfiles = <FLIST>);
147 close(FLIST);
148} else {
149 @encfiles = @ARGV;
150}
151
152my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
153chmod(0666,$cname) if -f $cname && !-w $cname;
154open(C,">$cname") || die "Cannot open $cname:$!";
155
156my $dname = $cname;
157my $hname = $cname;
158
159my ($doC,$doEnc,$doUcm,$doPet);
160
161if ($cname =~ /\.(c|xs)$/)
162 {
163 $doC = 1;
e7cbefb8 164 $dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef 165 chmod(0666,$dname) if -f $cname && !-w $dname;
166 open(D,">$dname") || die "Cannot open $dname:$!";
167 $hname =~ s/(\.[^\.]*)?$/.h/;
168 chmod(0666,$hname) if -f $cname && !-w $hname;
169 open(H,">$hname") || die "Cannot open $hname:$!";
170
171 foreach my $fh (\*C,\*D,\*H)
172 {
173 print $fh <<"END" unless $opt{'q'};
174/*
175 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
176 This file was autogenerated by:
177 $^X $0 @orig_ARGV
178*/
179END
180 }
181
182 if ($cname =~ /(\w+)\.xs$/)
183 {
184 print C "#include <EXTERN.h>\n";
185 print C "#include <perl.h>\n";
186 print C "#include <XSUB.h>\n";
187 print C "#define U8 U8\n";
188 }
189 print C "#include \"encode.h\"\n";
190
191 }
192elsif ($cname =~ /\.enc$/)
193 {
194 $doEnc = 1;
195 }
196elsif ($cname =~ /\.ucm$/)
197 {
198 $doUcm = 1;
199 }
200elsif ($cname =~ /\.pet$/)
201 {
202 $doPet = 1;
203 }
204
205my %encoding;
206my %strings;
207my $saved = 0;
208my $subsave = 0;
209my $strings = 0;
210
211sub cmp_name
212{
213 if ($a =~ /^.*-(\d+)/)
214 {
215 my $an = $1;
216 if ($b =~ /^.*-(\d+)/)
217 {
218 my $r = $an <=> $1;
219 return $r if $r;
220 }
221 }
222 return $a cmp $b;
223}
224
225
226foreach my $enc (sort cmp_name @encfiles)
227 {
228 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
229 $name = $opt{'n'} if exists $opt{'n'};
230 if (open(E,$enc))
231 {
232 if ($sfx eq 'enc')
233 {
234 compile_enc(\*E,lc($name));
235 }
236 else
237 {
238 compile_ucm(\*E,lc($name));
239 }
240 }
241 else
242 {
243 warn "Cannot open $enc for $name:$!";
244 }
245 }
246
247if ($doC)
248 {
249 print STDERR "Writing compiled form\n";
250 foreach my $name (sort cmp_name keys %encoding)
251 {
252 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
253 output(\*C,$name.'_utf8',$e2u);
254 output(\*C,'utf8_'.$name,$u2e);
b2704119 255 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
67d7b5ef 256 }
257 foreach my $enc (sort cmp_name keys %encoding)
258 {
b2704119 259 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
260 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
261 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
262 my $replen = 0;
263 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
43a9ef84 264 my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8*)"$rep"),$replen,$min_el,$max_el);
67d7b5ef 265 my $sym = "${enc}_encoding";
266 $sym =~ s/\W+/_/g;
267 print C "encode_t $sym = \n";
268 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
269 }
270
271 foreach my $enc (sort cmp_name keys %encoding)
272 {
273 my $sym = "${enc}_encoding";
274 $sym =~ s/\W+/_/g;
275 print H "extern encode_t $sym;\n";
276 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
277 }
278
279 if ($cname =~ /(\w+)\.xs$/)
280 {
281 my $mod = $1;
282 print C <<'END';
283
284static void
285Encode_XSEncoding(pTHX_ encode_t *enc)
286{
287 dSP;
288 HV *stash = gv_stashpv("Encode::XS", TRUE);
289 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
290 int i = 0;
291 PUSHMARK(sp);
292 XPUSHs(sv);
293 while (enc->name[i])
294 {
295 const char *name = enc->name[i++];
296 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
297 }
298 PUTBACK;
299 call_pv("Encode::define_encoding",G_DISCARD);
300 SvREFCNT_dec(sv);
301}
302
303END
304
305 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
306 print C "BOOT:\n{\n";
307 print C "#include \"$dname\"\n";
308 print C "}\n";
309 }
310 # Close in void context is bad, m'kay
311 close(D) or warn "Error closing '$dname': $!";
312 close(H) or warn "Error closing '$hname': $!";
313
314 my $perc_saved = $strings/($strings + $saved) * 100;
315 my $perc_subsaved = $strings/($strings + $subsave) * 100;
316 printf STDERR "%d bytes in string tables\n",$strings;
317 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
318 $saved, $perc_saved if $saved;
319 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
320 $subsave, $perc_subsaved if $subsave;
321 }
322elsif ($doEnc)
323 {
324 foreach my $name (sort cmp_name keys %encoding)
325 {
326 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
327 output_enc(\*C,$name,$e2u);
328 }
329 }
330elsif ($doUcm)
331 {
332 foreach my $name (sort cmp_name keys %encoding)
333 {
334 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
335 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
336 }
337 }
338
339# writing half meg files and then not checking to see if you just filled the
340# disk is bad, m'kay
341close(C) or die "Error closing '$cname': $!";
342
343# End of the main program.
344
345sub compile_ucm
346{
347 my ($fh,$name) = @_;
348 my $e2u = {};
349 my $u2e = {};
350 my $cs;
351 my %attr;
352 while (<$fh>)
353 {
354 s/#.*$//;
355 last if /^\s*CHARMAP\s*$/i;
356 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
357 {
358 $attr{$1} = $2;
359 }
360 }
361 if (!defined($cs = $attr{'code_set_name'}))
362 {
363 warn "No <code_set_name> in $name\n";
364 }
365 else
366 {
367 $name = $cs unless exists $opt{'n'};
368 }
369 my $erep;
370 my $urep;
371 my $max_el;
372 my $min_el;
373 if (exists $attr{'subchar'})
374 {
b2704119 375 #my @byte;
376 #$attr{'subchar'} =~ /^\s*/cg;
377 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
378 #$erep = join('',map(chr(hex($_)),@byte));
379 $erep = $attr{'subchar'};
380 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
67d7b5ef 381 }
382 print "Reading $name ($cs)\n";
383 my $nfb = 0;
384 my $hfb = 0;
385 while (<$fh>)
386 {
387 s/#.*$//;
388 last if /^\s*END\s+CHARMAP\s*$/i;
389 next if /^\s*$/;
a999c27c 390 my (@uni, @byte) = ();
391 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
392 or die "Bad line: $_";
393 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
394 push @uni, map { substr($_, 1) } split(/\+/, $1);
395 }
396 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
397 push @byte, $1;
398 }
399 if (@uni)
67d7b5ef 400 {
a999c27c 401 my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef 402 my $ech = join('',map(chr(hex($_)),@byte));
403 my $el = length($ech);
404 $max_el = $el if (!defined($max_el) || $el > $max_el);
405 $min_el = $el if (!defined($min_el) || $el < $min_el);
406 if (length($fb))
407 {
408 $fb = substr($fb,1);
409 $hfb++;
410 }
411 else
412 {
413 $nfb++;
414 $fb = '0';
415 }
416 # $fb is fallback flag
417 # 0 - round trip safe
418 # 1 - fallback for unicode -> enc
419 # 2 - skip sub-char mapping
420 # 3 - fallback enc -> unicode
421 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
422 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
423 }
424 else
425 {
426 warn $_;
427 }
428 }
429 if ($nfb && $hfb)
430 {
431 die "$nfb entries without fallback, $hfb entries with\n";
432 }
433 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
434}
435
436
437
438sub compile_enc
439{
440 my ($fh,$name) = @_;
441 my $e2u = {};
442 my $u2e = {};
443
444 my $type;
445 while ($type = <$fh>)
446 {
447 last if $type !~ /^\s*#/;
448 }
449 chomp($type);
450 return if $type eq 'E';
451 # Do the hash lookup once, rather than once per function call. 4% speedup.
452 my $type_func = $encode_types{$type};
453 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
454 warn "$type encoded $name\n";
455 my $rep = '';
456 # Save a defined test by setting these to defined values.
457 my $min_el = ~0; # A very big integer
458 my $max_el = 0; # Anything must be longer than 0
459 {
460 my $v = hex($def);
461 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
462 }
463 my $errors;
464 my $seen;
465 # use -Q to silence the seen test. Makefile.PL uses this by default.
466 $seen = {} unless $opt{Q};
467 do
468 {
469 my $line = <$fh>;
470 chomp($line);
471 my $page = hex($line);
472 my $ch = 0;
473 my $i = 16;
474 do
475 {
476 # So why is it 1% faster to leave the my here?
477 my $line = <$fh>;
478 $line =~ s/\r\n$/\n/;
479 die "$.:${line}Line should be exactly 65 characters long including
480 newline (".length($line).")" unless length ($line) == 65;
481 # Split line into groups of 4 hex digits, convert groups to ints
482 # This takes 65.35
483 # map {hex $_} $line =~ /(....)/g
484 # This takes 63.75 (2.5% less time)
485 # unpack "n*", pack "H*", $line
486 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
487 # Doing it as while ($line =~ /(....)/g) took 74.63
488 foreach my $val (unpack "n*", pack "H*", $line)
489 {
490 next if $val == 0xFFFD;
491 my $ech = &$type_func($ch,$page);
492 if ($val || (!$ch && !$page))
493 {
494 my $el = length($ech);
495 $max_el = $el if $el > $max_el;
496 $min_el = $el if $el < $min_el;
497 my $uch = encode_U($val);
498 if ($seen) {
499 # We're doing the test.
500 # We don't need to read this quickly, so storing it as a scalar,
501 # rather than 3 (anon array, plus the 2 scalars it holds) saves
502 # RAM and may make us faster on low RAM systems. [see __END__]
503 if (exists $seen->{$uch})
504 {
505 warn sprintf("U%04X is %02X%02X and %04X\n",
506 $val,$page,$ch,$seen->{$uch});
507 $errors++;
508 }
509 else
510 {
511 $seen->{$uch} = $page << 8 | $ch;
512 }
513 }
514 # Passing 2 extra args each time is 3.6% slower!
515 # Even with having to add $fallback ||= 0 later
516 enter_fb0($e2u,$ech,$uch);
517 enter_fb0($u2e,$uch,$ech);
518 }
519 else
520 {
521 # No character at this position
522 # enter($e2u,$ech,undef,$e2u);
523 }
524 $ch++;
525 }
526 } while --$i;
527 } while --$pages;
528 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
529 if $min_el > $max_el;
530 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
531 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
532}
533
534# my ($a,$s,$d,$t,$fb) = @_;
535sub enter {
536 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
537 # state we shift to after this (multibyte) input character defaults to same
538 # as current state.
539 $next ||= $current;
540 # Making sure it is defined seems to be faster than {no warnings;} in
541 # &process, or passing it in as 0 explicity.
542 # XXX $fallback ||= 0;
543
544 # Start at the beginning and work forwards through the string to zero.
545 # effectively we are removing 1 character from the front each time
546 # but we don't actually edit the string. [this alone seems to be 14% speedup]
547 # Hence -$pos is the length of the remaining string.
548 my $pos = -length $inbytes;
549 while (1) {
550 my $byte = substr $inbytes, $pos, 1;
551 # RAW_NEXT => 0,
552 # RAW_IN_LEN => 1,
553 # RAW_OUT_BYTES => 2,
554 # RAW_FALLBACK => 3,
555 # to unicode an array would seem to be better, because the pages are dense.
556 # from unicode can be very sparse, favouring a hash.
557 # hash using the bytes (all length 1) as keys rather than ord value,
558 # as it's easier to sort these in &process.
559
560 # It's faster to always add $fallback even if it's undef, rather than
561 # choosing between 3 and 4 element array. (hence why we set it defined
562 # above)
563 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
564 # When $pos was -1 we were at the last input character.
565 unless (++$pos) {
566 $do_now->[RAW_OUT_BYTES] = $outbytes;
567 $do_now->[RAW_NEXT] = $next;
568 return;
569 }
570 # Tail recursion. The intermdiate state may not have a name yet.
571 $current = $do_now->[RAW_NEXT];
572 }
573}
574
575# This is purely for optimistation. It's just &enter hard coded for $fallback
576# of 0, using only a 3 entry array ref to save memory for every entry.
577sub enter_fb0 {
578 my ($current,$inbytes,$outbytes,$next) = @_;
579 $next ||= $current;
580
581 my $pos = -length $inbytes;
582 while (1) {
583 my $byte = substr $inbytes, $pos, 1;
584 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
585 unless (++$pos) {
586 $do_now->[RAW_OUT_BYTES] = $outbytes;
587 $do_now->[RAW_NEXT] = $next;
588 return;
589 }
590 $current = $do_now->[RAW_NEXT];
591 }
592}
593
594
595sub outstring
596{
597 my ($fh,$name,$s) = @_;
598 my $sym = $strings{$s};
599 if ($sym)
600 {
601 $saved += length($s);
602 }
603 else
604 {
605 if ($opt{'O'}) {
606 foreach my $o (keys %strings)
607 {
608 next unless (my $i = index($o,$s)) >= 0;
609 $sym = $strings{$o};
610 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
611 # a hexadecimal floating point constant. Silly gcc. Only p
612 # introduces a floating point constant. Put the space in to stop it
613 # getting confused.
614 $sym .= sprintf(" +0x%02x",$i) if ($i);
615 $subsave += length($s);
616 return $strings{$s} = $sym;
617 }
618 }
619 $strings{$s} = $sym = $name;
620 $strings += length($s);
621 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
622 # Maybe we should assert that these are all <256.
623 $definition .= join(',',unpack "C*",$s);
624 # We have a single long line. Split it at convenient commas.
625 $definition =~ s/(.{74,77},)/$1\n/g;
626 print $fh "$definition };\n\n";
627 }
628 return $sym;
629}
630
631sub process
632{
633 my ($name,$a) = @_;
634 $name =~ s/\W+/_/g;
635 $a->{Cname} = $name;
636 my $raw = $a->{Raw};
637 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
638 my @ent;
639 $agg_max_in = 0;
640 foreach my $key (sort keys %$raw) {
641 # RAW_NEXT => 0,
642 # RAW_IN_LEN => 1,
643 # RAW_OUT_BYTES => 2,
644 # RAW_FALLBACK => 3,
645 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
646 # Now we are converting from raw to aggregate, switch from 1 byte strings
647 # to numbers
648 my $b = ord $key;
649 $fallback ||= 0;
650 if ($l &&
651 # If this == fails, we're going to reset $agg_max_in below anyway.
652 $b == ++$agg_max_in &&
653 # References in numeric context give the pointer as an int.
654 $agg_next == $next &&
655 $agg_in_len == $in_len &&
656 $agg_out_len == length $out_bytes &&
657 $agg_fallback == $fallback
658 # && length($l->[AGG_OUT_BYTES]) < 16
659 ) {
660 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
661 # we can aggregate this byte onto the end.
662 $l->[AGG_MAX_IN] = $b;
663 $l->[AGG_OUT_BYTES] .= $out_bytes;
664 } else {
665 # AGG_MIN_IN => 0,
666 # AGG_MAX_IN => 1,
667 # AGG_OUT_BYTES => 2,
668 # AGG_NEXT => 3,
669 # AGG_IN_LEN => 4,
670 # AGG_OUT_LEN => 5,
671 # AGG_FALLBACK => 6,
672 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
673 # (only gains .6% on euc-jp -- is it worth it?)
674 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
675 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
676 $agg_fallback = $fallback];
677 }
678 if (exists $next->{Cname}) {
679 $next->{'Forward'} = 1 if $next != $a;
680 } else {
681 process(sprintf("%s_%02x",$name,$b),$next);
682 }
683 }
684 # encengine.c rules say that last entry must be for 255
685 if ($agg_max_in < 255) {
686 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
687 }
688 $a->{'Entries'} = \@ent;
689}
690
691sub outtable
692{
693 my ($fh,$a) = @_;
694 my $name = $a->{'Cname'};
695 # String tables
696 foreach my $b (@{$a->{'Entries'}})
697 {
698 next unless $b->[AGG_OUT_LEN];
699 my $s = $b->[AGG_MIN_IN];
700 my $e = $b->[AGG_MAX_IN];
701 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
702 }
703 if ($a->{'Forward'})
704 {
705 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
706 }
707 $a->{'Done'} = 1;
708 foreach my $b (@{$a->{'Entries'}})
709 {
710 my ($s,$e,$out,$t,$end,$l) = @$b;
711 outtable($fh,$t) unless $t->{'Done'};
712 }
713 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
714 foreach my $b (@{$a->{'Entries'}})
715 {
716 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
717 $end |= 0x80 if $fb;
718 print $fh "{";
719 if ($l)
720 {
721 printf $fh outstring($fh,'',$out);
722 }
723 else
724 {
725 print $fh "0";
726 }
727 print $fh ",",$t->{Cname};
728 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
729 }
730 print $fh "};\n";
731}
732
733sub output
734{
735 my ($fh,$name,$a) = @_;
736 process($name,$a);
737 # Sub-tables
738 outtable($fh,$a);
739}
740
741sub output_enc
742{
743 my ($fh,$name,$a) = @_;
744 die "Changed - fix me for new structure";
745 foreach my $b (sort keys %$a)
746 {
747 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
748 }
749}
750
751sub decode_U
752{
753 my $s = shift;
754}
755
756my @uname;
757sub char_names
758{
759 my $s = do "unicore/Name.pl";
760 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
761 pos($s) = 0;
762 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
763 {
764 my $name = $3;
765 my $s = hex($1);
766 last if $s >= 0x10000;
767 my $e = length($2) ? hex($2) : $s;
768 for (my $i = $s; $i <= $e; $i++)
769 {
770 $uname[$i] = $name;
771# print sprintf("U%04X $name\n",$i);
772 }
773 }
774}
775
776sub output_ucm_page
777{
778 my ($cmap,$a,$t,$pre) = @_;
779 # warn sprintf("Page %x\n",$pre);
780 my $raw = $t->{Raw};
781 foreach my $key (sort keys %$raw) {
782 # RAW_NEXT => 0,
783 # RAW_IN_LEN => 1,
784 # RAW_OUT_BYTES => 2,
785 # RAW_FALLBACK => 3,
786 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
787 my $u = ord $key;
788 $fallback ||= 0;
789
790 if ($next != $a && $next != $t) {
791 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
792 } elsif (length $out_bytes) {
793 if ($pre) {
794 $u = $pre|($u &0x3f);
795 }
796 my $s = sprintf "<U%04X> ",$u;
797 #foreach my $c (split(//,$out_bytes)) {
798 # $s .= sprintf "\\x%02X",ord($c);
799 #}
800 # 9.5% faster changing that loop to this:
801 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
802 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
803 push(@$cmap,$s);
804 } else {
805 warn join(',',$u, @{$raw->{$key}},$a,$t);
806 }
807 }
808}
809
810sub output_ucm
811{
812 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
813 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
814 print $fh "<code_set_name> \"$name\"\n";
815 char_names();
816 if (defined $min_el)
817 {
818 print $fh "<mb_cur_min> $min_el\n";
819 }
820 if (defined $max_el)
821 {
822 print $fh "<mb_cur_max> $max_el\n";
823 }
824 if (defined $rep)
825 {
826 print $fh "<subchar> ";
827 foreach my $c (split(//,$rep))
828 {
829 printf $fh "\\x%02X",ord($c);
830 }
831 print $fh "\n";
832 }
833 my @cmap;
834 output_ucm_page(\@cmap,$h,$h,0);
835 print $fh "#\nCHARMAP\n";
836 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
837 {
838 print $fh $line;
839 }
840 print $fh "END CHARMAP\n";
841}
842
3ef515df 843use vars qw(
844 $_Enc2xs
845 $_Version
846 $_Inc
b2704119 847 $_E2X
3ef515df 848 $_Name
849 $_TableFiles
850 $_Now
851);
852
b2704119 853sub find_e2x{
854 eval { require File::Find };
855 my (@inc, %e2x_dir);
856 for my $inc (@INC){
857 push @inc, $inc unless $inc eq '.'; #skip current dir
858 }
859 File::Find::find(
860 sub {
861 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
862 $atime,$mtime,$ctime,$blksize,$blocks)
863 = lstat($_) or return;
864 -f _ or return;
865 if (/^.*\.e2x$/o){
866 $e2x_dir{$File::Find::dir} ||= $mtime;
867 }
868 return;
869 }, @inc);
870 warn join("\n", keys %e2x_dir), "\n";
871 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
872 $_E2X = $d;
873 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
874 return $_E2X;
875 }
876}
877
67d7b5ef 878sub make_makefile_pl
879{
880 eval { require Encode; };
881 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
3ef515df 882 # our used for variable expanstion
883 $_Enc2xs = $0;
884 $_Version = $VERSION;
b2704119 885 $_E2X = find_e2x();
3ef515df 886 $_Name = shift;
887 $_TableFiles = join(",", map {qq('$_')} @_);
888 $_Now = scalar localtime();
b2704119 889
aae85ceb 890 eval { require File::Spec; };
3ef515df 891 warn "Generating Makefile.PL\n";
b2704119 892 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
3ef515df 893 warn "Generating $_Name.pm\n";
b2704119 894 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
3ef515df 895 warn "Generating t/$_Name.t\n";
b2704119 896 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
3ef515df 897 warn "Generating README\n";
b2704119 898 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
3ef515df 899 warn "Generating t/$_Name.t\n";
b2704119 900 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
3ef515df 901 exit;
902}
903
aae85ceb 904use vars qw(
905 $_ModLines
906 $_LocalVer
907 );
908
909sub make_configlocal_pm
910{
911 eval { require Encode; };
912 $@ and die "Unable to require Encode: $@\n";
913 eval { require File::Spec; };
914 # our used for variable expanstion
915 my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
916 my %LocalMod = ();
917 for my $d (@INC){
918 my $inc = File::Spec->catfile($d, "Encode");
919 -d $inc or next;
920 opendir my $dh, $inc or die "$inc:$!";
921 warn "Checking $inc...\n";
922 for my $f (grep /\.pm$/o, readdir($dh)){
923 -f File::Spec->catfile($inc, "$f") or next;
924 $INC{"Encode/$f"} and next;
925 warn "require Encode/$f;\n";
926 eval { require "Encode/$f"; };
927 $@ and die "Can't require Encode/$f: $@\n";
928 for my $enc (Encode->encodings()){
929 $in_core{$enc} and next;
930 $Encode::Config::ExtModule{$enc} and next;
931 my $mod = "Encode/$f";
932 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
b2704119 933 $LocalMod{$enc} ||= $mod;
aae85ceb 934 }
935 }
936 }
937 $_ModLines = "";
938 for my $enc (sort keys %LocalMod){
939 $_ModLines .=
940 qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
941 }
b2704119 942 warn $_ModLines;
aae85ceb 943 $_LocalVer = _mkversion();
b2704119 944 $_E2X = find_e2x();
aae85ceb 945 $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
b2704119 946 warn "Writing ", File::Spec->catfile($_Inc,"ConfigLocal.pm"), "\n";
947 _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
aae85ceb 948 File::Spec->catfile($_Inc,"ConfigLocal.pm"));
949 exit;
950}
951
952sub _mkversion{
953 my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
954 $yyyy += 1900, $mo +=1;
955 return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
956}
957
3ef515df 958sub _print_expand{
67d7b5ef 959 eval { require File::Basename; };
960 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
961 File::Basename->import();
3ef515df 962 my ($src, $dst) = @_;
963 open my $in, $src or die "$src : $!";
964 if ((my $d = dirname($dst)) ne '.'){
965 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
966 }
967 open my $out, ">$dst" or die "$!";
968 my $asis = 0;
969 while (<$in>){
970 if (/^#### END_OF_HEADER/){
971 $asis = 1; next;
972 }
973 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
974 print $out $_;
67d7b5ef 975 }
67d7b5ef 976}
67d7b5ef 977__END__
978
979=head1 NAME
980
981enc2xs -- Perl Encode Module Generator
982
983=head1 SYNOPSIS
984
67d7b5ef 985 enc2xs -[options]
aae85ceb 986 enc2xs -M ModName mapfiles...
987 enc2xs -C
67d7b5ef 988
989=head1 DESCRIPTION
990
991F<enc2xs> builds a Perl extension for use by Encode from either
992Unicode Character Mapping files (.ucm) or Tcl Encoding Files
993(.enc) Besides internally used during the build process of Encode
994module, you can use F<enc2xs> to add your own encoding to perl. No
995knowledge on XS is necessary.
996
997=head1 Quick Guide
998
999If what you want to know as little about Perl possible but needs to
1000add a new encoding, just read this chapter and forget the rest.
1001
1002=over 4
1003
1004=item 0.
1005
1006Have a .ucm file ready. You can get it from somewhere or you can
1007write your own from scratch or you can grab one from Encode
1008distribution and customize. For UCM format, see the next Chapter.
1009In the example below, I'll call my theoretical encoding myascii,
1010defined inI<my.ucm>. C<$> is a shell prompt.
1011
1012 $ ls -F
1013 my.ucm
1014
1015=item 1.
1016
1017Issue a command as follows;
1018
1019 $ enc2xs -M My my.ucm
3ef515df 1020 generating Makefile.PL
1021 generating My.pm
1022 generating README
1023 generating Changes
67d7b5ef 1024
1025Now take a look at your current directory. It should look like this.
1026
1027 $ ls -F
1028 Makefile.PL My.pm my.ucm t/
1029
1030The following files are created.
1031
1032 Makefle.PL - MakeMaker script
1033 My.pm - Encode Submodule
1034 t/My.t - test file
1035
037b88d6 1036=item 1.1.
1037
1038If you want *.ucm installed together with the modules, do as follows;
1039
1040 $ mkdir Encode
1041 $ mv *.ucm Encode
1042 $ enc2xs -M My Encode/*ucm
1043
67d7b5ef 1044=item 2.
1045
1046Edit the files generated. You don't have to if you have no time AND no
1047intention to give it to someone else. But it is a good idea to edit
1048pod and add more tests.
1049
1050=item 3.
1051
1052Now issue a command all Perl Mongers love;
1053
1054 $ perl5.7.3 Makefile.PL
1055 Writing Makefile for Encode::My
1056
1057=item 4.
1058
1059Now all you have to do is make.
1060
1061 $ make
1062 cp My.pm blib/lib/Encode/My.pm
1063 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1064 -o encode_t.c -f encode_t.fnm
1065 Reading myascii (myascii)
1066 Writing compiled form
1067 128 bytes in string tables
1068 384 bytes (25%) saved spotting duplicates
1069 1 bytes (99.2%) saved using substrings
1070 ....
1071 chmod 644 blib/arch/auto/Encode/My/My.bs
1072 $
1073
1074The time it takes varies how fast your machine is and how large your
1075encoding is. Unless you are working on something big like euc-tw, it
1076won't take too long.
1077
1078=item 5.
1079
1080You can "make install" already but you should test first.
1081
1082 $ make test
1083 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1084 -e 'use Test::Harness qw(&runtests $verbose); \
1085 $verbose=0; runtests @ARGV;' t/*.t
1086 t/My....ok
1087 All tests successful.
1088 Files=1, Tests=2, 0 wallclock secs
1089 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1090
1091=item 6.
1092
1093If you are content with the test result, just "make install"
1094
aae85ceb 1095=item 7.
1096
1097If you want to add your encoding to Encode demand-loading list
1098(so you don't have to "use Encode::YourEncoding"), run
1099
1100 enc2xs -C
1101
1102to update Encode::ConfigLocal, a module that controls local settings.
1103After that, "use Encode;" is enough to load your encodings on demand.
1104
67d7b5ef 1105=back
1106
1107=head1 The Unicode Character Map
1108
1109Encode uses The Unicode Character Map (UCM) for source character
1110mappings. This format is used by ICU package of IBM and adopted by
1111Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map
1112and far more user-friendly, This is the recommended formet for
1113Encode now.
1114
1115UCM file looks like this.
1116
1117 #
1118 # Comments
1119 #
1120 <code_set_name> "US-ascii" # Required
1121 <code_set_alias> "ascii" # Optional
1122 <mb_cur_min> 1 # Required; usually 1
1123 <mb_cur_max> 1 # Max. # of bytes/char
1124 <subchar> \x3F # Substitution char
1125 #
1126 CHARMAP
1127 <U0000> \x00 |0 # <control>
1128 <U0001> \x01 |0 # <control>
1129 <U0002> \x02 |0 # <control>
1130 ....
1131 <U007C> \x7C |0 # VERTICAL LINE
1132 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1133 <U007E> \x7E |0 # TILDE
1134 <U007F> \x7F |0 # <control>
1135 END CHARMAP
1136
1137=over 4
1138
1139=item *
1140
1141Anything that follows C<#> is treated as comments.
1142
1143=item *
1144
1145The header section continues until CHARMAP. This section Has a form of
1146I<E<lt>keywordE<gt> value>, one at a line. For a value, strings must
1147be quoted. Barewords are treated as numbers. I<\xXX> represents a
1148byte.
1149
1150Most of the keywords are self-explanatory. I<subchar> means
1151substitution character, not subcharacter. When you decode a Unicode
1152sequence to this encoding but no matching character is found, the byte
1153sequence defined here will be used. For most cases, the value here is
1154\x3F, in ASCII this is a question mark.
1155
1156=item *
1157
1158CHARMAP starts the character map section. Each line has a form as
1159follows;
1160
1161 <UXXXX> \xXX.. |0 # comment
1162 ^ ^ ^
1163 | | +- Fallback flag
1164 | +-------- Encoded byte sequence
1165 +-------------- Unicode Character ID in hex
1166
1167The format is roughly the same as a header section except for fallback
1168flag. It is | followed by 0..3. And their meaning as follows
1169
1170=over 2
1171
1172=item |0
1173
1174Round trip safe. A character decoded to Unicode encodes back to the
1175same byte sequence. most character belong to this.
1176
1177=item |1
1178
1179Fallback for unicode -> encoding. When seen, enc2xs adds this
1180character for encode map only
1181
1182=item |2
1183
1184Skip sub-char mapping should there be no code point.
1185
1186=item |3
1187
1188Fallback for encoding -> unicode. When seen, enc2xs adds this
1189character for decode map only
1190
1191=back
1192
1193=item *
1194
1195And finally, END OF CHARMAP ends the section.
1196
1197=back
1198
1199Needless to say, if you are manually creating a UCM file, you should
1200copy ascii.ucm or existing encoding which is close to yours than write
1201your own from scratch.
1202
1203When you do so, make sure you leave at least B<U0000> to B<U0020> as
1204is, unless your environment is on EBCDIC.
1205
1206B<CAVEAT>: not all features in UCM are implemented. For example,
1207icu:state is not used. Because of that, you need to write a perl
1208module if you want to support algorithmical encodings, notablly
1209ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1210L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1211
1212=head1 Bookmarks
1213
1214ICU Home Page
1215L<http://oss.software.ibm.com/icu/>
1216
1217ICU Character Mapping Tables
1218L<http://oss.software.ibm.com/icu/charset/>
1219
1220ICU:Conversion Data
1221L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1222
1223=head1 SEE ALSO
1224
1225L<Encode>,
1226L<perlmod>,
1227L<perlpod>
1228
1229=cut
1230
1231# -Q to disable the duplicate codepoint test
1232# -S make mapping errors fatal
1233# -q to remove comments written to output files
1234# -O to enable the (brute force) substring optimiser
1235# -o <output> to specify the output file name (else it's the first arg)
1236# -f <inlist> to give a file with a list of input files (else use the args)
1237# -n <name> to name the encoding (else use the basename of the input file.
1238
1239With %seen holding array refs:
1240
1241 865.66 real 28.80 user 8.79 sys
1242 7904 maximum resident set size
1243 1356 average shared memory size
1244 18566 average unshared data size
1245 229 average unshared stack size
1246 46080 page reclaims
1247 33373 page faults
1248
1249With %seen holding simple scalars:
1250
1251 342.16 real 27.11 user 3.54 sys
1252 8388 maximum resident set size
1253 1394 average shared memory size
1254 14969 average unshared data size
1255 236 average unshared stack size
1256 28159 page reclaims
1257 9839 page faults
1258
1259Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1260how %seen is storing things its seen. So it is pathalogically bad on a 16M
1261RAM machine, but it's going to help even on modern machines.
1262Swapping is bad, m'kay :-)