use PERL_CORE=1 to build extensions on VMS
[p5sagit/p5-mst-13.2.git] / ext / Encode / bin / enc2xs
CommitLineData
67d7b5ef 1#!../../../perl -w
2BEGIN {
3 unshift @INC, qw(../../lib ../../../lib ../../../../lib);
4 $ENV{PATH} .= ';../..;../../..;../../../..' if $^O eq 'MSWin32';
5}
6use strict;
7use Getopt::Std;
8my @orig_ARGV = @ARGV;
9our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
10
11
12# These may get re-ordered.
13# RAW is a do_now as inserted by &enter
14# AGG is an aggreagated do_now, as built up by &process
15use constant {
16 RAW_NEXT => 0,
17 RAW_IN_LEN => 1,
18 RAW_OUT_BYTES => 2,
19 RAW_FALLBACK => 3,
20
21 AGG_MIN_IN => 0,
22 AGG_MAX_IN => 1,
23 AGG_OUT_BYTES => 2,
24 AGG_NEXT => 3,
25 AGG_IN_LEN => 4,
26 AGG_OUT_LEN => 5,
27 AGG_FALLBACK => 6,
28};
29# (See the algorithm in encengine.c - we're building structures for it)
30
31# There are two sorts of structures.
32# "do_now" (an array, two variants of what needs storing) is whatever we need
33# to do now we've read an input byte.
34# It's housed in a "do_next" (which is how we got to it), and in turn points
35# to a "do_next" which contains all the "do_now"s for the next input byte.
36
37# There will be a "do_next" which is the start state.
38# For a single byte encoding it's the only "do_next" - each "do_now" points
39# back to it, and each "do_now" will cause bytes. There is no state.
40
41# For a multi-byte encoding where all characters in the input are the same
42# length, then there will be a tree of "do_now"->"do_next"->"do_now"
43# branching out from the start state, one step for each input byte.
44# The leaf "do_now"s will all be at the same distance from the start state,
45# only the leaf "do_now"s cause output bytes, and they in turn point back to
46# the start state.
47
48# For an encoding where there are varaible length input byte sequences, you
49# will encounter a leaf "do_now" sooner for the shorter input sequences, but
50# as before the leaves will point back to the start state.
51
52# The system will cope with escape encodings (imagine them as a mostly
53# self-contained tree for each escape state, and cross links between trees
54# at the state-switching characters) but so far no input format defines these.
55
56# The system will also cope with having output "leaves" in the middle of
57# the bifurcating branches, not just at the extremities, but again no
58# input format does this yet.
59
60# There are two variants of the "do_now" structure. The first, smaller variant
61# is generated by &enter as the input file is read. There is one structure
62# for each input byte. Say we are mapping a single byte encoding to a
63# single byte encoding, with "ABCD" going "abcd". There will be
64# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
65
66# &process then walks the tree, building aggregate "do_now" structres for
67# adjacent bytes where possible. The aggregate is for a contiguous range of
68# bytes which each produce the same length of output, each move to the
69# same next state, and each have the same fallback flag.
70# So our 4 RAW "do_now"s above become replaced by a single structure
71# containing:
72# ["A", "D", "abcd", 1, ...]
73# ie, for an input byte $_ in "A".."D", output 1 byte, found as
74# substr ("abcd", (ord $_ - ord "A") * 1, 1)
75# which maps very nicely into pointer arithmetic in C for encengine.c
76
77sub encode_U
78{
79 # UTF-8 encode long hand - only covers part of perl's range
80 ## my $uv = shift;
81 # chr() works in native space so convert value from table
82 # into that space before using chr().
83 my $ch = chr(utf8::unicode_to_native($_[0]));
84 # Now get core perl to encode that the way it likes.
85 utf8::encode($ch);
86 return $ch;
87}
88
89sub encode_S
90{
91 # encode single byte
92 ## my ($ch,$page) = @_; return chr($ch);
93 return chr $_[0];
94}
95
96sub encode_D
97{
98 # encode double byte MS byte first
99 ## my ($ch,$page) = @_; return chr($page).chr($ch);
100 return chr ($_[1]) . chr $_[0];
101}
102
103sub encode_M
104{
105 # encode Multi-byte - single for 0..255 otherwise double
106 ## my ($ch,$page) = @_;
107 ## return &encode_D if $page;
108 ## return &encode_S;
109 return chr ($_[1]) . chr $_[0] if $_[1];
110 return chr $_[0];
111}
112
113my %encode_types = (U => \&encode_U,
114 S => \&encode_S,
115 D => \&encode_D,
116 M => \&encode_M,
117 );
118
119# Win32 does not expand globs on command line
120eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
121
122my %opt;
123# I think these are:
124# -Q to disable the duplicate codepoint test
125# -S make mapping errors fatal
126# -q to remove comments written to output files
127# -O to enable the (brute force) substring optimiser
128# -o <output> to specify the output file name (else it's the first arg)
129# -f <inlist> to give a file with a list of input files (else use the args)
130# -n <name> to name the encoding (else use the basename of the input file.
131getopts('M:SQqOo:f:n:',\%opt);
132
133$opt{M} and make_makefile_pl($opt{M}, @ARGV);
134
135# This really should go first, else the die here causes empty (non-erroneous)
136# output files to be written.
137my @encfiles;
138if (exists $opt{'f'}) {
139 # -F is followed by name of file containing list of filenames
140 my $flist = $opt{'f'};
141 open(FLIST,$flist) || die "Cannot open $flist:$!";
142 chomp(@encfiles = <FLIST>);
143 close(FLIST);
144} else {
145 @encfiles = @ARGV;
146}
147
148my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
149chmod(0666,$cname) if -f $cname && !-w $cname;
150open(C,">$cname") || die "Cannot open $cname:$!";
151
152my $dname = $cname;
153my $hname = $cname;
154
155my ($doC,$doEnc,$doUcm,$doPet);
156
157if ($cname =~ /\.(c|xs)$/)
158 {
159 $doC = 1;
160 $dname =~ s/(\.[^\.]*)?$/_def.h/;
161 chmod(0666,$dname) if -f $cname && !-w $dname;
162 open(D,">$dname") || die "Cannot open $dname:$!";
163 $hname =~ s/(\.[^\.]*)?$/.h/;
164 chmod(0666,$hname) if -f $cname && !-w $hname;
165 open(H,">$hname") || die "Cannot open $hname:$!";
166
167 foreach my $fh (\*C,\*D,\*H)
168 {
169 print $fh <<"END" unless $opt{'q'};
170/*
171 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
172 This file was autogenerated by:
173 $^X $0 @orig_ARGV
174*/
175END
176 }
177
178 if ($cname =~ /(\w+)\.xs$/)
179 {
180 print C "#include <EXTERN.h>\n";
181 print C "#include <perl.h>\n";
182 print C "#include <XSUB.h>\n";
183 print C "#define U8 U8\n";
184 }
185 print C "#include \"encode.h\"\n";
186
187 }
188elsif ($cname =~ /\.enc$/)
189 {
190 $doEnc = 1;
191 }
192elsif ($cname =~ /\.ucm$/)
193 {
194 $doUcm = 1;
195 }
196elsif ($cname =~ /\.pet$/)
197 {
198 $doPet = 1;
199 }
200
201my %encoding;
202my %strings;
203my $saved = 0;
204my $subsave = 0;
205my $strings = 0;
206
207sub cmp_name
208{
209 if ($a =~ /^.*-(\d+)/)
210 {
211 my $an = $1;
212 if ($b =~ /^.*-(\d+)/)
213 {
214 my $r = $an <=> $1;
215 return $r if $r;
216 }
217 }
218 return $a cmp $b;
219}
220
221
222foreach my $enc (sort cmp_name @encfiles)
223 {
224 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
225 $name = $opt{'n'} if exists $opt{'n'};
226 if (open(E,$enc))
227 {
228 if ($sfx eq 'enc')
229 {
230 compile_enc(\*E,lc($name));
231 }
232 else
233 {
234 compile_ucm(\*E,lc($name));
235 }
236 }
237 else
238 {
239 warn "Cannot open $enc for $name:$!";
240 }
241 }
242
243if ($doC)
244 {
245 print STDERR "Writing compiled form\n";
246 foreach my $name (sort cmp_name keys %encoding)
247 {
248 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
249 output(\*C,$name.'_utf8',$e2u);
250 output(\*C,'utf8_'.$name,$u2e);
251 push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
252 }
253 foreach my $enc (sort cmp_name keys %encoding)
254 {
255 my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
256 my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
257 my $sym = "${enc}_encoding";
258 $sym =~ s/\W+/_/g;
259 print C "encode_t $sym = \n";
260 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
261 }
262
263 foreach my $enc (sort cmp_name keys %encoding)
264 {
265 my $sym = "${enc}_encoding";
266 $sym =~ s/\W+/_/g;
267 print H "extern encode_t $sym;\n";
268 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
269 }
270
271 if ($cname =~ /(\w+)\.xs$/)
272 {
273 my $mod = $1;
274 print C <<'END';
275
276static void
277Encode_XSEncoding(pTHX_ encode_t *enc)
278{
279 dSP;
280 HV *stash = gv_stashpv("Encode::XS", TRUE);
281 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
282 int i = 0;
283 PUSHMARK(sp);
284 XPUSHs(sv);
285 while (enc->name[i])
286 {
287 const char *name = enc->name[i++];
288 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
289 }
290 PUTBACK;
291 call_pv("Encode::define_encoding",G_DISCARD);
292 SvREFCNT_dec(sv);
293}
294
295END
296
297 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
298 print C "BOOT:\n{\n";
299 print C "#include \"$dname\"\n";
300 print C "}\n";
301 }
302 # Close in void context is bad, m'kay
303 close(D) or warn "Error closing '$dname': $!";
304 close(H) or warn "Error closing '$hname': $!";
305
306 my $perc_saved = $strings/($strings + $saved) * 100;
307 my $perc_subsaved = $strings/($strings + $subsave) * 100;
308 printf STDERR "%d bytes in string tables\n",$strings;
309 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
310 $saved, $perc_saved if $saved;
311 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
312 $subsave, $perc_subsaved if $subsave;
313 }
314elsif ($doEnc)
315 {
316 foreach my $name (sort cmp_name keys %encoding)
317 {
318 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
319 output_enc(\*C,$name,$e2u);
320 }
321 }
322elsif ($doUcm)
323 {
324 foreach my $name (sort cmp_name keys %encoding)
325 {
326 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
327 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
328 }
329 }
330
331# writing half meg files and then not checking to see if you just filled the
332# disk is bad, m'kay
333close(C) or die "Error closing '$cname': $!";
334
335# End of the main program.
336
337sub compile_ucm
338{
339 my ($fh,$name) = @_;
340 my $e2u = {};
341 my $u2e = {};
342 my $cs;
343 my %attr;
344 while (<$fh>)
345 {
346 s/#.*$//;
347 last if /^\s*CHARMAP\s*$/i;
348 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
349 {
350 $attr{$1} = $2;
351 }
352 }
353 if (!defined($cs = $attr{'code_set_name'}))
354 {
355 warn "No <code_set_name> in $name\n";
356 }
357 else
358 {
359 $name = $cs unless exists $opt{'n'};
360 }
361 my $erep;
362 my $urep;
363 my $max_el;
364 my $min_el;
365 if (exists $attr{'subchar'})
366 {
367 my @byte;
368 $attr{'subchar'} =~ /^\s*/cg;
369 push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
370 $erep = join('',map(chr(hex($_)),@byte));
371 }
372 print "Reading $name ($cs)\n";
373 my $nfb = 0;
374 my $hfb = 0;
375 while (<$fh>)
376 {
377 s/#.*$//;
378 last if /^\s*END\s+CHARMAP\s*$/i;
379 next if /^\s*$/;
380 my ($u,@byte);
381 my $fb = '';
382 $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
383 push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
384 $fb = $1 if /\G\s*(\|[0-3])/gc;
385 # warn "$_: $u @byte | $fb\n";
386 die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
387 if (defined($u))
388 {
389 my $uch = encode_U(hex($u));
390 my $ech = join('',map(chr(hex($_)),@byte));
391 my $el = length($ech);
392 $max_el = $el if (!defined($max_el) || $el > $max_el);
393 $min_el = $el if (!defined($min_el) || $el < $min_el);
394 if (length($fb))
395 {
396 $fb = substr($fb,1);
397 $hfb++;
398 }
399 else
400 {
401 $nfb++;
402 $fb = '0';
403 }
404 # $fb is fallback flag
405 # 0 - round trip safe
406 # 1 - fallback for unicode -> enc
407 # 2 - skip sub-char mapping
408 # 3 - fallback enc -> unicode
409 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
410 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
411 }
412 else
413 {
414 warn $_;
415 }
416 }
417 if ($nfb && $hfb)
418 {
419 die "$nfb entries without fallback, $hfb entries with\n";
420 }
421 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
422}
423
424
425
426sub compile_enc
427{
428 my ($fh,$name) = @_;
429 my $e2u = {};
430 my $u2e = {};
431
432 my $type;
433 while ($type = <$fh>)
434 {
435 last if $type !~ /^\s*#/;
436 }
437 chomp($type);
438 return if $type eq 'E';
439 # Do the hash lookup once, rather than once per function call. 4% speedup.
440 my $type_func = $encode_types{$type};
441 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
442 warn "$type encoded $name\n";
443 my $rep = '';
444 # Save a defined test by setting these to defined values.
445 my $min_el = ~0; # A very big integer
446 my $max_el = 0; # Anything must be longer than 0
447 {
448 my $v = hex($def);
449 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
450 }
451 my $errors;
452 my $seen;
453 # use -Q to silence the seen test. Makefile.PL uses this by default.
454 $seen = {} unless $opt{Q};
455 do
456 {
457 my $line = <$fh>;
458 chomp($line);
459 my $page = hex($line);
460 my $ch = 0;
461 my $i = 16;
462 do
463 {
464 # So why is it 1% faster to leave the my here?
465 my $line = <$fh>;
466 $line =~ s/\r\n$/\n/;
467 die "$.:${line}Line should be exactly 65 characters long including
468 newline (".length($line).")" unless length ($line) == 65;
469 # Split line into groups of 4 hex digits, convert groups to ints
470 # This takes 65.35
471 # map {hex $_} $line =~ /(....)/g
472 # This takes 63.75 (2.5% less time)
473 # unpack "n*", pack "H*", $line
474 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
475 # Doing it as while ($line =~ /(....)/g) took 74.63
476 foreach my $val (unpack "n*", pack "H*", $line)
477 {
478 next if $val == 0xFFFD;
479 my $ech = &$type_func($ch,$page);
480 if ($val || (!$ch && !$page))
481 {
482 my $el = length($ech);
483 $max_el = $el if $el > $max_el;
484 $min_el = $el if $el < $min_el;
485 my $uch = encode_U($val);
486 if ($seen) {
487 # We're doing the test.
488 # We don't need to read this quickly, so storing it as a scalar,
489 # rather than 3 (anon array, plus the 2 scalars it holds) saves
490 # RAM and may make us faster on low RAM systems. [see __END__]
491 if (exists $seen->{$uch})
492 {
493 warn sprintf("U%04X is %02X%02X and %04X\n",
494 $val,$page,$ch,$seen->{$uch});
495 $errors++;
496 }
497 else
498 {
499 $seen->{$uch} = $page << 8 | $ch;
500 }
501 }
502 # Passing 2 extra args each time is 3.6% slower!
503 # Even with having to add $fallback ||= 0 later
504 enter_fb0($e2u,$ech,$uch);
505 enter_fb0($u2e,$uch,$ech);
506 }
507 else
508 {
509 # No character at this position
510 # enter($e2u,$ech,undef,$e2u);
511 }
512 $ch++;
513 }
514 } while --$i;
515 } while --$pages;
516 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
517 if $min_el > $max_el;
518 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
519 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
520}
521
522# my ($a,$s,$d,$t,$fb) = @_;
523sub enter {
524 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
525 # state we shift to after this (multibyte) input character defaults to same
526 # as current state.
527 $next ||= $current;
528 # Making sure it is defined seems to be faster than {no warnings;} in
529 # &process, or passing it in as 0 explicity.
530 # XXX $fallback ||= 0;
531
532 # Start at the beginning and work forwards through the string to zero.
533 # effectively we are removing 1 character from the front each time
534 # but we don't actually edit the string. [this alone seems to be 14% speedup]
535 # Hence -$pos is the length of the remaining string.
536 my $pos = -length $inbytes;
537 while (1) {
538 my $byte = substr $inbytes, $pos, 1;
539 # RAW_NEXT => 0,
540 # RAW_IN_LEN => 1,
541 # RAW_OUT_BYTES => 2,
542 # RAW_FALLBACK => 3,
543 # to unicode an array would seem to be better, because the pages are dense.
544 # from unicode can be very sparse, favouring a hash.
545 # hash using the bytes (all length 1) as keys rather than ord value,
546 # as it's easier to sort these in &process.
547
548 # It's faster to always add $fallback even if it's undef, rather than
549 # choosing between 3 and 4 element array. (hence why we set it defined
550 # above)
551 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
552 # When $pos was -1 we were at the last input character.
553 unless (++$pos) {
554 $do_now->[RAW_OUT_BYTES] = $outbytes;
555 $do_now->[RAW_NEXT] = $next;
556 return;
557 }
558 # Tail recursion. The intermdiate state may not have a name yet.
559 $current = $do_now->[RAW_NEXT];
560 }
561}
562
563# This is purely for optimistation. It's just &enter hard coded for $fallback
564# of 0, using only a 3 entry array ref to save memory for every entry.
565sub enter_fb0 {
566 my ($current,$inbytes,$outbytes,$next) = @_;
567 $next ||= $current;
568
569 my $pos = -length $inbytes;
570 while (1) {
571 my $byte = substr $inbytes, $pos, 1;
572 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
573 unless (++$pos) {
574 $do_now->[RAW_OUT_BYTES] = $outbytes;
575 $do_now->[RAW_NEXT] = $next;
576 return;
577 }
578 $current = $do_now->[RAW_NEXT];
579 }
580}
581
582
583sub outstring
584{
585 my ($fh,$name,$s) = @_;
586 my $sym = $strings{$s};
587 if ($sym)
588 {
589 $saved += length($s);
590 }
591 else
592 {
593 if ($opt{'O'}) {
594 foreach my $o (keys %strings)
595 {
596 next unless (my $i = index($o,$s)) >= 0;
597 $sym = $strings{$o};
598 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
599 # a hexadecimal floating point constant. Silly gcc. Only p
600 # introduces a floating point constant. Put the space in to stop it
601 # getting confused.
602 $sym .= sprintf(" +0x%02x",$i) if ($i);
603 $subsave += length($s);
604 return $strings{$s} = $sym;
605 }
606 }
607 $strings{$s} = $sym = $name;
608 $strings += length($s);
609 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
610 # Maybe we should assert that these are all <256.
611 $definition .= join(',',unpack "C*",$s);
612 # We have a single long line. Split it at convenient commas.
613 $definition =~ s/(.{74,77},)/$1\n/g;
614 print $fh "$definition };\n\n";
615 }
616 return $sym;
617}
618
619sub process
620{
621 my ($name,$a) = @_;
622 $name =~ s/\W+/_/g;
623 $a->{Cname} = $name;
624 my $raw = $a->{Raw};
625 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
626 my @ent;
627 $agg_max_in = 0;
628 foreach my $key (sort keys %$raw) {
629 # RAW_NEXT => 0,
630 # RAW_IN_LEN => 1,
631 # RAW_OUT_BYTES => 2,
632 # RAW_FALLBACK => 3,
633 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
634 # Now we are converting from raw to aggregate, switch from 1 byte strings
635 # to numbers
636 my $b = ord $key;
637 $fallback ||= 0;
638 if ($l &&
639 # If this == fails, we're going to reset $agg_max_in below anyway.
640 $b == ++$agg_max_in &&
641 # References in numeric context give the pointer as an int.
642 $agg_next == $next &&
643 $agg_in_len == $in_len &&
644 $agg_out_len == length $out_bytes &&
645 $agg_fallback == $fallback
646 # && length($l->[AGG_OUT_BYTES]) < 16
647 ) {
648 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
649 # we can aggregate this byte onto the end.
650 $l->[AGG_MAX_IN] = $b;
651 $l->[AGG_OUT_BYTES] .= $out_bytes;
652 } else {
653 # AGG_MIN_IN => 0,
654 # AGG_MAX_IN => 1,
655 # AGG_OUT_BYTES => 2,
656 # AGG_NEXT => 3,
657 # AGG_IN_LEN => 4,
658 # AGG_OUT_LEN => 5,
659 # AGG_FALLBACK => 6,
660 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
661 # (only gains .6% on euc-jp -- is it worth it?)
662 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
663 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
664 $agg_fallback = $fallback];
665 }
666 if (exists $next->{Cname}) {
667 $next->{'Forward'} = 1 if $next != $a;
668 } else {
669 process(sprintf("%s_%02x",$name,$b),$next);
670 }
671 }
672 # encengine.c rules say that last entry must be for 255
673 if ($agg_max_in < 255) {
674 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
675 }
676 $a->{'Entries'} = \@ent;
677}
678
679sub outtable
680{
681 my ($fh,$a) = @_;
682 my $name = $a->{'Cname'};
683 # String tables
684 foreach my $b (@{$a->{'Entries'}})
685 {
686 next unless $b->[AGG_OUT_LEN];
687 my $s = $b->[AGG_MIN_IN];
688 my $e = $b->[AGG_MAX_IN];
689 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
690 }
691 if ($a->{'Forward'})
692 {
693 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
694 }
695 $a->{'Done'} = 1;
696 foreach my $b (@{$a->{'Entries'}})
697 {
698 my ($s,$e,$out,$t,$end,$l) = @$b;
699 outtable($fh,$t) unless $t->{'Done'};
700 }
701 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
702 foreach my $b (@{$a->{'Entries'}})
703 {
704 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
705 $end |= 0x80 if $fb;
706 print $fh "{";
707 if ($l)
708 {
709 printf $fh outstring($fh,'',$out);
710 }
711 else
712 {
713 print $fh "0";
714 }
715 print $fh ",",$t->{Cname};
716 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
717 }
718 print $fh "};\n";
719}
720
721sub output
722{
723 my ($fh,$name,$a) = @_;
724 process($name,$a);
725 # Sub-tables
726 outtable($fh,$a);
727}
728
729sub output_enc
730{
731 my ($fh,$name,$a) = @_;
732 die "Changed - fix me for new structure";
733 foreach my $b (sort keys %$a)
734 {
735 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
736 }
737}
738
739sub decode_U
740{
741 my $s = shift;
742}
743
744my @uname;
745sub char_names
746{
747 my $s = do "unicore/Name.pl";
748 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
749 pos($s) = 0;
750 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
751 {
752 my $name = $3;
753 my $s = hex($1);
754 last if $s >= 0x10000;
755 my $e = length($2) ? hex($2) : $s;
756 for (my $i = $s; $i <= $e; $i++)
757 {
758 $uname[$i] = $name;
759# print sprintf("U%04X $name\n",$i);
760 }
761 }
762}
763
764sub output_ucm_page
765{
766 my ($cmap,$a,$t,$pre) = @_;
767 # warn sprintf("Page %x\n",$pre);
768 my $raw = $t->{Raw};
769 foreach my $key (sort keys %$raw) {
770 # RAW_NEXT => 0,
771 # RAW_IN_LEN => 1,
772 # RAW_OUT_BYTES => 2,
773 # RAW_FALLBACK => 3,
774 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
775 my $u = ord $key;
776 $fallback ||= 0;
777
778 if ($next != $a && $next != $t) {
779 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
780 } elsif (length $out_bytes) {
781 if ($pre) {
782 $u = $pre|($u &0x3f);
783 }
784 my $s = sprintf "<U%04X> ",$u;
785 #foreach my $c (split(//,$out_bytes)) {
786 # $s .= sprintf "\\x%02X",ord($c);
787 #}
788 # 9.5% faster changing that loop to this:
789 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
790 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
791 push(@$cmap,$s);
792 } else {
793 warn join(',',$u, @{$raw->{$key}},$a,$t);
794 }
795 }
796}
797
798sub output_ucm
799{
800 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
801 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
802 print $fh "<code_set_name> \"$name\"\n";
803 char_names();
804 if (defined $min_el)
805 {
806 print $fh "<mb_cur_min> $min_el\n";
807 }
808 if (defined $max_el)
809 {
810 print $fh "<mb_cur_max> $max_el\n";
811 }
812 if (defined $rep)
813 {
814 print $fh "<subchar> ";
815 foreach my $c (split(//,$rep))
816 {
817 printf $fh "\\x%02X",ord($c);
818 }
819 print $fh "\n";
820 }
821 my @cmap;
822 output_ucm_page(\@cmap,$h,$h,0);
823 print $fh "#\nCHARMAP\n";
824 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
825 {
826 print $fh $line;
827 }
828 print $fh "END CHARMAP\n";
829}
830
831sub make_makefile_pl
832{
833 eval { require Encode; };
834 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
835 eval { require File::Basename; };
836 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
837 File::Basename->import();
838 my $inc = dirname($INC{"Encode/Internal.pm"});
839 my $name = shift;
840 my $table_files = join(",", map {qq('$_')} @_);
841 my $now = scalar localtime();
842 open my $fh, ">Makefile.PL" or die "$!";
843 print $fh <<"END_OF_HEADER";
844#
845# This file is auto-generated by:
846# $0
847# $now
848#
849use 5.7.2;
850use strict;
851use ExtUtils::MakeMaker;
852
853# Please edit the following to the taste!
854my \$name = '$name';
855my \%tables = (
856 encode_t => [ $table_files ],
857 );
858
859# And leave the rest!
860my \$enc2xs = '$0';
861WriteMakefile(
862 INC => "-I$inc",
863END_OF_HEADER
864
865 print $fh <<'END_OF_MAKEFILE_PL';
866 NAME => 'Encode::'.$name,
867 VERSION_FROM => "$name.pm",
868 OBJECT => '$(O_FILES)',
869 'dist' => {
870 COMPRESS => 'gzip -9f',
871 SUFFIX => 'gz',
872 DIST_DEFAULT => 'all tardist',
873 },
874 MAN3PODS => {},
875 # OS 390 winges about line numbers > 64K ???
876 XSOPT => '-nolinenumbers',
877 );
878
879package MY;
880
881sub post_initialize
882{
883 my ($self) = @_;
884 my %o;
885 my $x = $self->{'OBJ_EXT'};
886 # Add the table O_FILES
887 foreach my $e (keys %tables)
888 {
889 $o{$e.$x} = 1;
890 }
891 $o{"$name$x"} = 1;
892 $self->{'O_FILES'} = [sort keys %o];
893 my @files = ("$name.xs");
894 $self->{'C'} = ["$name.c"];
895 # $self->{'H'} = [$self->catfile($self->updir,'encode.h')];
896 my %xs;
897 foreach my $table (keys %tables) {
898 push (@{$self->{'C'}},"$table.c");
899 # Do NOT add $table.h etc. to H_FILES unless we own up as to how they
900 # get built.
901 foreach my $ext (qw($(OBJ_EXT) .c .h _def.h .fnm)) {
902 push (@files,$table.$ext);
903 }
904 }
905 $self->{'XS'} = { "$name.xs" => "$name.c" };
906 $self->{'clean'}{'FILES'} .= join(' ',@files);
907 open(XS,">$name.xs") || die "Cannot open $name.xs:$!";
908 print XS <<'END';
909#include <EXTERN.h>
910#include <perl.h>
911#include <XSUB.h>
912#define U8 U8
913#include "encode.h"
914END
915 foreach my $table (keys %tables) {
916 print XS qq[#include "${table}.h"\n];
917 }
918 print XS <<"END";
919
920static void
921Encode_XSEncoding(pTHX_ encode_t *enc)
922{
923 dSP;
924 HV *stash = gv_stashpv("Encode::XS", TRUE);
925 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
926 int i = 0;
927 PUSHMARK(sp);
928 XPUSHs(sv);
929 while (enc->name[i])
930 {
931 const char *name = enc->name[i++];
932 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
933 }
934 PUTBACK;
935 call_pv("Encode::define_encoding",G_DISCARD);
936 SvREFCNT_dec(sv);
937}
938
939MODULE = Encode::$name PACKAGE = Encode::$name
940PROTOTYPES: DISABLE
941BOOT:
942{
943END
944 foreach my $table (keys %tables) {
945 print XS qq[#include "${table}_def.h"\n];
946 }
947 print XS "}\n";
948 close(XS);
949 return "# Built $name.xs\n\n";
950}
951
952sub postamble
953{
954 my $self = shift;
955 my $dir = "."; # $self->catdir('Encode');
956 my $str = "# $name\$(OBJ_EXT) depends on .h and _def.h files not .c files - but all written by enc2xs\n";
957 $str .= "$name.c : $name.xs ";
958 foreach my $table (keys %tables)
959 {
960 $str .= " $table.c";
961 }
962 $str .= "\n\n";
963 $str .= "$name\$(OBJ_EXT) : $name.c\n\n";
964
965 foreach my $table (keys %tables)
966 {
967 my $numlines = 1;
968 my $lengthsofar = length($str);
969 my $continuator = '';
970 $str .= "$table.c : Makefile.PL";
971 foreach my $file (@{$tables{$table}})
972 {
973 $str .= $continuator.' '.$self->catfile($dir,$file);
974 if ( length($str)-$lengthsofar > 128*$numlines )
975 {
976 $continuator .= " \\\n\t";
977 $numlines++;
978 } else {
979 $continuator = '';
980 }
981 }
982 $str .= $^O eq 'VMS' # In VMS quote to preserve case
983 ? qq{\n\t\$(PERL) $enc2xs -"Q" -"O" -o \$\@ -f $table.fnm\n\n}
984 : qq{\n\t\$(PERL) $enc2xs -Q -O -o \$\@ -f $table.fnm\n\n};
985 open (FILELIST, ">$table.fnm")
986 || die "Could not open $table.fnm: $!";
987 foreach my $file (@{$tables{$table}})
988 {
989 print FILELIST $self->catfile($dir,$file) . "\n";
990 }
991 close(FILELIST);
992 }
993 return $str;
994}
995END_OF_MAKEFILE_PL
996 close $fh;
997 (my $pm =<<"END_OF_PM") =~ s/^# //gm;
998# package Encode::$name;
999# our \$VERSION = "0.01";
1000#
1001# use Encode;
1002# use XSLoader;
1003# XSLoader::load('Encode::$name', \$VERSION);
1004#
1005# 1;
1006# __END__
1007#
1008# =head1 NAME
1009#
1010# Encode::$name - New Encoding
1011#
1012# =head1 SYNOPSIS
1013#
1014# You got to fill this in!
1015#
1016# =head1 SEE ALSO
1017#
1018# L<Encode>
1019#
1020# =cut
1021END_OF_PM
1022 open $fh, ">$name.pm" or die "$name.pm:$!";
1023 print $fh $pm;
1024 close $fh;
1025 -d 't' or mkdir 't', 0755 or die "mkdir t:$!";
1026 open $fh, ">t/$name.t" or die "t/$name.t:$!";
1027print $fh <<"END_OF_TEST";
1028use strict;
1029# Adjust the number here!
1030use Test::More tests => 2;
1031
1032use_ok('Encode');
1033use_ok('Encode::$name');
1034# Add more test here!
1035END_OF_TEST
1036 close $fh;
1037 exit;
1038}
1039
1040__END__
1041
1042=head1 NAME
1043
1044enc2xs -- Perl Encode Module Generator
1045
1046=head1 SYNOPSIS
1047
1048 enc2xs -M ModName mapfiles...
1049 enc2xs -[options]
1050
1051=head1 DESCRIPTION
1052
1053F<enc2xs> builds a Perl extension for use by Encode from either
1054Unicode Character Mapping files (.ucm) or Tcl Encoding Files
1055(.enc) Besides internally used during the build process of Encode
1056module, you can use F<enc2xs> to add your own encoding to perl. No
1057knowledge on XS is necessary.
1058
1059=head1 Quick Guide
1060
1061If what you want to know as little about Perl possible but needs to
1062add a new encoding, just read this chapter and forget the rest.
1063
1064=over 4
1065
1066=item 0.
1067
1068Have a .ucm file ready. You can get it from somewhere or you can
1069write your own from scratch or you can grab one from Encode
1070distribution and customize. For UCM format, see the next Chapter.
1071In the example below, I'll call my theoretical encoding myascii,
1072defined inI<my.ucm>. C<$> is a shell prompt.
1073
1074 $ ls -F
1075 my.ucm
1076
1077=item 1.
1078
1079Issue a command as follows;
1080
1081 $ enc2xs -M My my.ucm
1082
1083Now take a look at your current directory. It should look like this.
1084
1085 $ ls -F
1086 Makefile.PL My.pm my.ucm t/
1087
1088The following files are created.
1089
1090 Makefle.PL - MakeMaker script
1091 My.pm - Encode Submodule
1092 t/My.t - test file
1093
1094=item 2.
1095
1096Edit the files generated. You don't have to if you have no time AND no
1097intention to give it to someone else. But it is a good idea to edit
1098pod and add more tests.
1099
1100=item 3.
1101
1102Now issue a command all Perl Mongers love;
1103
1104 $ perl5.7.3 Makefile.PL
1105 Writing Makefile for Encode::My
1106
1107=item 4.
1108
1109Now all you have to do is make.
1110
1111 $ make
1112 cp My.pm blib/lib/Encode/My.pm
1113 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1114 -o encode_t.c -f encode_t.fnm
1115 Reading myascii (myascii)
1116 Writing compiled form
1117 128 bytes in string tables
1118 384 bytes (25%) saved spotting duplicates
1119 1 bytes (99.2%) saved using substrings
1120 ....
1121 chmod 644 blib/arch/auto/Encode/My/My.bs
1122 $
1123
1124The time it takes varies how fast your machine is and how large your
1125encoding is. Unless you are working on something big like euc-tw, it
1126won't take too long.
1127
1128=item 5.
1129
1130You can "make install" already but you should test first.
1131
1132 $ make test
1133 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1134 -e 'use Test::Harness qw(&runtests $verbose); \
1135 $verbose=0; runtests @ARGV;' t/*.t
1136 t/My....ok
1137 All tests successful.
1138 Files=1, Tests=2, 0 wallclock secs
1139 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1140
1141=item 6.
1142
1143If you are content with the test result, just "make install"
1144
1145=back
1146
1147=head1 The Unicode Character Map
1148
1149Encode uses The Unicode Character Map (UCM) for source character
1150mappings. This format is used by ICU package of IBM and adopted by
1151Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map
1152and far more user-friendly, This is the recommended formet for
1153Encode now.
1154
1155UCM file looks like this.
1156
1157 #
1158 # Comments
1159 #
1160 <code_set_name> "US-ascii" # Required
1161 <code_set_alias> "ascii" # Optional
1162 <mb_cur_min> 1 # Required; usually 1
1163 <mb_cur_max> 1 # Max. # of bytes/char
1164 <subchar> \x3F # Substitution char
1165 #
1166 CHARMAP
1167 <U0000> \x00 |0 # <control>
1168 <U0001> \x01 |0 # <control>
1169 <U0002> \x02 |0 # <control>
1170 ....
1171 <U007C> \x7C |0 # VERTICAL LINE
1172 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1173 <U007E> \x7E |0 # TILDE
1174 <U007F> \x7F |0 # <control>
1175 END CHARMAP
1176
1177=over 4
1178
1179=item *
1180
1181Anything that follows C<#> is treated as comments.
1182
1183=item *
1184
1185The header section continues until CHARMAP. This section Has a form of
1186I<E<lt>keywordE<gt> value>, one at a line. For a value, strings must
1187be quoted. Barewords are treated as numbers. I<\xXX> represents a
1188byte.
1189
1190Most of the keywords are self-explanatory. I<subchar> means
1191substitution character, not subcharacter. When you decode a Unicode
1192sequence to this encoding but no matching character is found, the byte
1193sequence defined here will be used. For most cases, the value here is
1194\x3F, in ASCII this is a question mark.
1195
1196=item *
1197
1198CHARMAP starts the character map section. Each line has a form as
1199follows;
1200
1201 <UXXXX> \xXX.. |0 # comment
1202 ^ ^ ^
1203 | | +- Fallback flag
1204 | +-------- Encoded byte sequence
1205 +-------------- Unicode Character ID in hex
1206
1207The format is roughly the same as a header section except for fallback
1208flag. It is | followed by 0..3. And their meaning as follows
1209
1210=over 2
1211
1212=item |0
1213
1214Round trip safe. A character decoded to Unicode encodes back to the
1215same byte sequence. most character belong to this.
1216
1217=item |1
1218
1219Fallback for unicode -> encoding. When seen, enc2xs adds this
1220character for encode map only
1221
1222=item |2
1223
1224Skip sub-char mapping should there be no code point.
1225
1226=item |3
1227
1228Fallback for encoding -> unicode. When seen, enc2xs adds this
1229character for decode map only
1230
1231=back
1232
1233=item *
1234
1235And finally, END OF CHARMAP ends the section.
1236
1237=back
1238
1239Needless to say, if you are manually creating a UCM file, you should
1240copy ascii.ucm or existing encoding which is close to yours than write
1241your own from scratch.
1242
1243When you do so, make sure you leave at least B<U0000> to B<U0020> as
1244is, unless your environment is on EBCDIC.
1245
1246B<CAVEAT>: not all features in UCM are implemented. For example,
1247icu:state is not used. Because of that, you need to write a perl
1248module if you want to support algorithmical encodings, notablly
1249ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1250L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1251
1252=head1 Bookmarks
1253
1254ICU Home Page
1255L<http://oss.software.ibm.com/icu/>
1256
1257ICU Character Mapping Tables
1258L<http://oss.software.ibm.com/icu/charset/>
1259
1260ICU:Conversion Data
1261L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1262
1263=head1 SEE ALSO
1264
1265L<Encode>,
1266L<perlmod>,
1267L<perlpod>
1268
1269=cut
1270
1271# -Q to disable the duplicate codepoint test
1272# -S make mapping errors fatal
1273# -q to remove comments written to output files
1274# -O to enable the (brute force) substring optimiser
1275# -o <output> to specify the output file name (else it's the first arg)
1276# -f <inlist> to give a file with a list of input files (else use the args)
1277# -n <name> to name the encoding (else use the basename of the input file.
1278
1279With %seen holding array refs:
1280
1281 865.66 real 28.80 user 8.79 sys
1282 7904 maximum resident set size
1283 1356 average shared memory size
1284 18566 average unshared data size
1285 229 average unshared stack size
1286 46080 page reclaims
1287 33373 page faults
1288
1289With %seen holding simple scalars:
1290
1291 342.16 real 27.11 user 3.54 sys
1292 8388 maximum resident set size
1293 1394 average shared memory size
1294 14969 average unshared data size
1295 236 average unshared stack size
1296 28159 page reclaims
1297 9839 page faults
1298
1299Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1300how %seen is storing things its seen. So it is pathalogically bad on a 16M
1301RAM machine, but it's going to help even on modern machines.
1302Swapping is bad, m'kay :-)