Upgrade to Encode 1.26, from Dan Kogai.
[p5sagit/p5-mst-13.2.git] / ext / Encode / bin / enc2xs
CommitLineData
3ef515df 1#!./perl
67d7b5ef 2BEGIN {
a999c27c 3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
67d7b5ef 7}
8use strict;
9use Getopt::Std;
10my @orig_ARGV = @ARGV;
a999c27c 11our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef 12
13# These may get re-ordered.
14# RAW is a do_now as inserted by &enter
15# AGG is an aggreagated do_now, as built up by &process
a999c27c 16
67d7b5ef 17use constant {
18 RAW_NEXT => 0,
19 RAW_IN_LEN => 1,
20 RAW_OUT_BYTES => 2,
21 RAW_FALLBACK => 3,
22
23 AGG_MIN_IN => 0,
24 AGG_MAX_IN => 1,
25 AGG_OUT_BYTES => 2,
26 AGG_NEXT => 3,
27 AGG_IN_LEN => 4,
28 AGG_OUT_LEN => 5,
29 AGG_FALLBACK => 6,
30};
a999c27c 31
67d7b5ef 32# (See the algorithm in encengine.c - we're building structures for it)
33
34# There are two sorts of structures.
35# "do_now" (an array, two variants of what needs storing) is whatever we need
36# to do now we've read an input byte.
37# It's housed in a "do_next" (which is how we got to it), and in turn points
38# to a "do_next" which contains all the "do_now"s for the next input byte.
39
40# There will be a "do_next" which is the start state.
41# For a single byte encoding it's the only "do_next" - each "do_now" points
42# back to it, and each "do_now" will cause bytes. There is no state.
43
44# For a multi-byte encoding where all characters in the input are the same
45# length, then there will be a tree of "do_now"->"do_next"->"do_now"
46# branching out from the start state, one step for each input byte.
47# The leaf "do_now"s will all be at the same distance from the start state,
48# only the leaf "do_now"s cause output bytes, and they in turn point back to
49# the start state.
50
51# For an encoding where there are varaible length input byte sequences, you
52# will encounter a leaf "do_now" sooner for the shorter input sequences, but
53# as before the leaves will point back to the start state.
54
55# The system will cope with escape encodings (imagine them as a mostly
56# self-contained tree for each escape state, and cross links between trees
57# at the state-switching characters) but so far no input format defines these.
58
59# The system will also cope with having output "leaves" in the middle of
60# the bifurcating branches, not just at the extremities, but again no
61# input format does this yet.
62
63# There are two variants of the "do_now" structure. The first, smaller variant
64# is generated by &enter as the input file is read. There is one structure
65# for each input byte. Say we are mapping a single byte encoding to a
66# single byte encoding, with "ABCD" going "abcd". There will be
67# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
68
69# &process then walks the tree, building aggregate "do_now" structres for
70# adjacent bytes where possible. The aggregate is for a contiguous range of
71# bytes which each produce the same length of output, each move to the
72# same next state, and each have the same fallback flag.
73# So our 4 RAW "do_now"s above become replaced by a single structure
74# containing:
75# ["A", "D", "abcd", 1, ...]
76# ie, for an input byte $_ in "A".."D", output 1 byte, found as
77# substr ("abcd", (ord $_ - ord "A") * 1, 1)
78# which maps very nicely into pointer arithmetic in C for encengine.c
79
80sub encode_U
81{
82 # UTF-8 encode long hand - only covers part of perl's range
83 ## my $uv = shift;
84 # chr() works in native space so convert value from table
85 # into that space before using chr().
86 my $ch = chr(utf8::unicode_to_native($_[0]));
87 # Now get core perl to encode that the way it likes.
88 utf8::encode($ch);
89 return $ch;
90}
91
92sub encode_S
93{
94 # encode single byte
95 ## my ($ch,$page) = @_; return chr($ch);
96 return chr $_[0];
97}
98
99sub encode_D
100{
101 # encode double byte MS byte first
102 ## my ($ch,$page) = @_; return chr($page).chr($ch);
103 return chr ($_[1]) . chr $_[0];
104}
105
106sub encode_M
107{
108 # encode Multi-byte - single for 0..255 otherwise double
109 ## my ($ch,$page) = @_;
110 ## return &encode_D if $page;
111 ## return &encode_S;
112 return chr ($_[1]) . chr $_[0] if $_[1];
113 return chr $_[0];
114}
115
116my %encode_types = (U => \&encode_U,
117 S => \&encode_S,
118 D => \&encode_D,
119 M => \&encode_M,
120 );
121
122# Win32 does not expand globs on command line
123eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
124
125my %opt;
126# I think these are:
127# -Q to disable the duplicate codepoint test
128# -S make mapping errors fatal
129# -q to remove comments written to output files
130# -O to enable the (brute force) substring optimiser
131# -o <output> to specify the output file name (else it's the first arg)
132# -f <inlist> to give a file with a list of input files (else use the args)
133# -n <name> to name the encoding (else use the basename of the input file.
134getopts('M:SQqOo:f:n:',\%opt);
135
136$opt{M} and make_makefile_pl($opt{M}, @ARGV);
137
138# This really should go first, else the die here causes empty (non-erroneous)
139# output files to be written.
140my @encfiles;
141if (exists $opt{'f'}) {
142 # -F is followed by name of file containing list of filenames
143 my $flist = $opt{'f'};
144 open(FLIST,$flist) || die "Cannot open $flist:$!";
145 chomp(@encfiles = <FLIST>);
146 close(FLIST);
147} else {
148 @encfiles = @ARGV;
149}
150
151my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
152chmod(0666,$cname) if -f $cname && !-w $cname;
153open(C,">$cname") || die "Cannot open $cname:$!";
154
155my $dname = $cname;
156my $hname = $cname;
157
158my ($doC,$doEnc,$doUcm,$doPet);
159
160if ($cname =~ /\.(c|xs)$/)
161 {
162 $doC = 1;
e7cbefb8 163 $dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef 164 chmod(0666,$dname) if -f $cname && !-w $dname;
165 open(D,">$dname") || die "Cannot open $dname:$!";
166 $hname =~ s/(\.[^\.]*)?$/.h/;
167 chmod(0666,$hname) if -f $cname && !-w $hname;
168 open(H,">$hname") || die "Cannot open $hname:$!";
169
170 foreach my $fh (\*C,\*D,\*H)
171 {
172 print $fh <<"END" unless $opt{'q'};
173/*
174 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
175 This file was autogenerated by:
176 $^X $0 @orig_ARGV
177*/
178END
179 }
180
181 if ($cname =~ /(\w+)\.xs$/)
182 {
183 print C "#include <EXTERN.h>\n";
184 print C "#include <perl.h>\n";
185 print C "#include <XSUB.h>\n";
186 print C "#define U8 U8\n";
187 }
188 print C "#include \"encode.h\"\n";
189
190 }
191elsif ($cname =~ /\.enc$/)
192 {
193 $doEnc = 1;
194 }
195elsif ($cname =~ /\.ucm$/)
196 {
197 $doUcm = 1;
198 }
199elsif ($cname =~ /\.pet$/)
200 {
201 $doPet = 1;
202 }
203
204my %encoding;
205my %strings;
206my $saved = 0;
207my $subsave = 0;
208my $strings = 0;
209
210sub cmp_name
211{
212 if ($a =~ /^.*-(\d+)/)
213 {
214 my $an = $1;
215 if ($b =~ /^.*-(\d+)/)
216 {
217 my $r = $an <=> $1;
218 return $r if $r;
219 }
220 }
221 return $a cmp $b;
222}
223
224
225foreach my $enc (sort cmp_name @encfiles)
226 {
227 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
228 $name = $opt{'n'} if exists $opt{'n'};
229 if (open(E,$enc))
230 {
231 if ($sfx eq 'enc')
232 {
233 compile_enc(\*E,lc($name));
234 }
235 else
236 {
237 compile_ucm(\*E,lc($name));
238 }
239 }
240 else
241 {
242 warn "Cannot open $enc for $name:$!";
243 }
244 }
245
246if ($doC)
247 {
248 print STDERR "Writing compiled form\n";
249 foreach my $name (sort cmp_name keys %encoding)
250 {
251 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
252 output(\*C,$name.'_utf8',$e2u);
253 output(\*C,'utf8_'.$name,$u2e);
254 push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
255 }
256 foreach my $enc (sort cmp_name keys %encoding)
257 {
258 my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
259 my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
260 my $sym = "${enc}_encoding";
261 $sym =~ s/\W+/_/g;
262 print C "encode_t $sym = \n";
263 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
264 }
265
266 foreach my $enc (sort cmp_name keys %encoding)
267 {
268 my $sym = "${enc}_encoding";
269 $sym =~ s/\W+/_/g;
270 print H "extern encode_t $sym;\n";
271 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
272 }
273
274 if ($cname =~ /(\w+)\.xs$/)
275 {
276 my $mod = $1;
277 print C <<'END';
278
279static void
280Encode_XSEncoding(pTHX_ encode_t *enc)
281{
282 dSP;
283 HV *stash = gv_stashpv("Encode::XS", TRUE);
284 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
285 int i = 0;
286 PUSHMARK(sp);
287 XPUSHs(sv);
288 while (enc->name[i])
289 {
290 const char *name = enc->name[i++];
291 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
292 }
293 PUTBACK;
294 call_pv("Encode::define_encoding",G_DISCARD);
295 SvREFCNT_dec(sv);
296}
297
298END
299
300 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
301 print C "BOOT:\n{\n";
302 print C "#include \"$dname\"\n";
303 print C "}\n";
304 }
305 # Close in void context is bad, m'kay
306 close(D) or warn "Error closing '$dname': $!";
307 close(H) or warn "Error closing '$hname': $!";
308
309 my $perc_saved = $strings/($strings + $saved) * 100;
310 my $perc_subsaved = $strings/($strings + $subsave) * 100;
311 printf STDERR "%d bytes in string tables\n",$strings;
312 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
313 $saved, $perc_saved if $saved;
314 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
315 $subsave, $perc_subsaved if $subsave;
316 }
317elsif ($doEnc)
318 {
319 foreach my $name (sort cmp_name keys %encoding)
320 {
321 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
322 output_enc(\*C,$name,$e2u);
323 }
324 }
325elsif ($doUcm)
326 {
327 foreach my $name (sort cmp_name keys %encoding)
328 {
329 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
330 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
331 }
332 }
333
334# writing half meg files and then not checking to see if you just filled the
335# disk is bad, m'kay
336close(C) or die "Error closing '$cname': $!";
337
338# End of the main program.
339
340sub compile_ucm
341{
342 my ($fh,$name) = @_;
343 my $e2u = {};
344 my $u2e = {};
345 my $cs;
346 my %attr;
347 while (<$fh>)
348 {
349 s/#.*$//;
350 last if /^\s*CHARMAP\s*$/i;
351 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
352 {
353 $attr{$1} = $2;
354 }
355 }
356 if (!defined($cs = $attr{'code_set_name'}))
357 {
358 warn "No <code_set_name> in $name\n";
359 }
360 else
361 {
362 $name = $cs unless exists $opt{'n'};
363 }
364 my $erep;
365 my $urep;
366 my $max_el;
367 my $min_el;
368 if (exists $attr{'subchar'})
369 {
370 my @byte;
371 $attr{'subchar'} =~ /^\s*/cg;
372 push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
373 $erep = join('',map(chr(hex($_)),@byte));
374 }
375 print "Reading $name ($cs)\n";
376 my $nfb = 0;
377 my $hfb = 0;
378 while (<$fh>)
379 {
380 s/#.*$//;
381 last if /^\s*END\s+CHARMAP\s*$/i;
382 next if /^\s*$/;
a999c27c 383 my (@uni, @byte) = ();
384 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
385 or die "Bad line: $_";
386 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
387 push @uni, map { substr($_, 1) } split(/\+/, $1);
388 }
389 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
390 push @byte, $1;
391 }
392 if (@uni)
67d7b5ef 393 {
a999c27c 394 my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef 395 my $ech = join('',map(chr(hex($_)),@byte));
396 my $el = length($ech);
397 $max_el = $el if (!defined($max_el) || $el > $max_el);
398 $min_el = $el if (!defined($min_el) || $el < $min_el);
399 if (length($fb))
400 {
401 $fb = substr($fb,1);
402 $hfb++;
403 }
404 else
405 {
406 $nfb++;
407 $fb = '0';
408 }
409 # $fb is fallback flag
410 # 0 - round trip safe
411 # 1 - fallback for unicode -> enc
412 # 2 - skip sub-char mapping
413 # 3 - fallback enc -> unicode
414 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
415 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
416 }
417 else
418 {
419 warn $_;
420 }
421 }
422 if ($nfb && $hfb)
423 {
424 die "$nfb entries without fallback, $hfb entries with\n";
425 }
426 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
427}
428
429
430
431sub compile_enc
432{
433 my ($fh,$name) = @_;
434 my $e2u = {};
435 my $u2e = {};
436
437 my $type;
438 while ($type = <$fh>)
439 {
440 last if $type !~ /^\s*#/;
441 }
442 chomp($type);
443 return if $type eq 'E';
444 # Do the hash lookup once, rather than once per function call. 4% speedup.
445 my $type_func = $encode_types{$type};
446 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
447 warn "$type encoded $name\n";
448 my $rep = '';
449 # Save a defined test by setting these to defined values.
450 my $min_el = ~0; # A very big integer
451 my $max_el = 0; # Anything must be longer than 0
452 {
453 my $v = hex($def);
454 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
455 }
456 my $errors;
457 my $seen;
458 # use -Q to silence the seen test. Makefile.PL uses this by default.
459 $seen = {} unless $opt{Q};
460 do
461 {
462 my $line = <$fh>;
463 chomp($line);
464 my $page = hex($line);
465 my $ch = 0;
466 my $i = 16;
467 do
468 {
469 # So why is it 1% faster to leave the my here?
470 my $line = <$fh>;
471 $line =~ s/\r\n$/\n/;
472 die "$.:${line}Line should be exactly 65 characters long including
473 newline (".length($line).")" unless length ($line) == 65;
474 # Split line into groups of 4 hex digits, convert groups to ints
475 # This takes 65.35
476 # map {hex $_} $line =~ /(....)/g
477 # This takes 63.75 (2.5% less time)
478 # unpack "n*", pack "H*", $line
479 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
480 # Doing it as while ($line =~ /(....)/g) took 74.63
481 foreach my $val (unpack "n*", pack "H*", $line)
482 {
483 next if $val == 0xFFFD;
484 my $ech = &$type_func($ch,$page);
485 if ($val || (!$ch && !$page))
486 {
487 my $el = length($ech);
488 $max_el = $el if $el > $max_el;
489 $min_el = $el if $el < $min_el;
490 my $uch = encode_U($val);
491 if ($seen) {
492 # We're doing the test.
493 # We don't need to read this quickly, so storing it as a scalar,
494 # rather than 3 (anon array, plus the 2 scalars it holds) saves
495 # RAM and may make us faster on low RAM systems. [see __END__]
496 if (exists $seen->{$uch})
497 {
498 warn sprintf("U%04X is %02X%02X and %04X\n",
499 $val,$page,$ch,$seen->{$uch});
500 $errors++;
501 }
502 else
503 {
504 $seen->{$uch} = $page << 8 | $ch;
505 }
506 }
507 # Passing 2 extra args each time is 3.6% slower!
508 # Even with having to add $fallback ||= 0 later
509 enter_fb0($e2u,$ech,$uch);
510 enter_fb0($u2e,$uch,$ech);
511 }
512 else
513 {
514 # No character at this position
515 # enter($e2u,$ech,undef,$e2u);
516 }
517 $ch++;
518 }
519 } while --$i;
520 } while --$pages;
521 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
522 if $min_el > $max_el;
523 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
524 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
525}
526
527# my ($a,$s,$d,$t,$fb) = @_;
528sub enter {
529 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
530 # state we shift to after this (multibyte) input character defaults to same
531 # as current state.
532 $next ||= $current;
533 # Making sure it is defined seems to be faster than {no warnings;} in
534 # &process, or passing it in as 0 explicity.
535 # XXX $fallback ||= 0;
536
537 # Start at the beginning and work forwards through the string to zero.
538 # effectively we are removing 1 character from the front each time
539 # but we don't actually edit the string. [this alone seems to be 14% speedup]
540 # Hence -$pos is the length of the remaining string.
541 my $pos = -length $inbytes;
542 while (1) {
543 my $byte = substr $inbytes, $pos, 1;
544 # RAW_NEXT => 0,
545 # RAW_IN_LEN => 1,
546 # RAW_OUT_BYTES => 2,
547 # RAW_FALLBACK => 3,
548 # to unicode an array would seem to be better, because the pages are dense.
549 # from unicode can be very sparse, favouring a hash.
550 # hash using the bytes (all length 1) as keys rather than ord value,
551 # as it's easier to sort these in &process.
552
553 # It's faster to always add $fallback even if it's undef, rather than
554 # choosing between 3 and 4 element array. (hence why we set it defined
555 # above)
556 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
557 # When $pos was -1 we were at the last input character.
558 unless (++$pos) {
559 $do_now->[RAW_OUT_BYTES] = $outbytes;
560 $do_now->[RAW_NEXT] = $next;
561 return;
562 }
563 # Tail recursion. The intermdiate state may not have a name yet.
564 $current = $do_now->[RAW_NEXT];
565 }
566}
567
568# This is purely for optimistation. It's just &enter hard coded for $fallback
569# of 0, using only a 3 entry array ref to save memory for every entry.
570sub enter_fb0 {
571 my ($current,$inbytes,$outbytes,$next) = @_;
572 $next ||= $current;
573
574 my $pos = -length $inbytes;
575 while (1) {
576 my $byte = substr $inbytes, $pos, 1;
577 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
578 unless (++$pos) {
579 $do_now->[RAW_OUT_BYTES] = $outbytes;
580 $do_now->[RAW_NEXT] = $next;
581 return;
582 }
583 $current = $do_now->[RAW_NEXT];
584 }
585}
586
587
588sub outstring
589{
590 my ($fh,$name,$s) = @_;
591 my $sym = $strings{$s};
592 if ($sym)
593 {
594 $saved += length($s);
595 }
596 else
597 {
598 if ($opt{'O'}) {
599 foreach my $o (keys %strings)
600 {
601 next unless (my $i = index($o,$s)) >= 0;
602 $sym = $strings{$o};
603 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
604 # a hexadecimal floating point constant. Silly gcc. Only p
605 # introduces a floating point constant. Put the space in to stop it
606 # getting confused.
607 $sym .= sprintf(" +0x%02x",$i) if ($i);
608 $subsave += length($s);
609 return $strings{$s} = $sym;
610 }
611 }
612 $strings{$s} = $sym = $name;
613 $strings += length($s);
614 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
615 # Maybe we should assert that these are all <256.
616 $definition .= join(',',unpack "C*",$s);
617 # We have a single long line. Split it at convenient commas.
618 $definition =~ s/(.{74,77},)/$1\n/g;
619 print $fh "$definition };\n\n";
620 }
621 return $sym;
622}
623
624sub process
625{
626 my ($name,$a) = @_;
627 $name =~ s/\W+/_/g;
628 $a->{Cname} = $name;
629 my $raw = $a->{Raw};
630 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
631 my @ent;
632 $agg_max_in = 0;
633 foreach my $key (sort keys %$raw) {
634 # RAW_NEXT => 0,
635 # RAW_IN_LEN => 1,
636 # RAW_OUT_BYTES => 2,
637 # RAW_FALLBACK => 3,
638 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
639 # Now we are converting from raw to aggregate, switch from 1 byte strings
640 # to numbers
641 my $b = ord $key;
642 $fallback ||= 0;
643 if ($l &&
644 # If this == fails, we're going to reset $agg_max_in below anyway.
645 $b == ++$agg_max_in &&
646 # References in numeric context give the pointer as an int.
647 $agg_next == $next &&
648 $agg_in_len == $in_len &&
649 $agg_out_len == length $out_bytes &&
650 $agg_fallback == $fallback
651 # && length($l->[AGG_OUT_BYTES]) < 16
652 ) {
653 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
654 # we can aggregate this byte onto the end.
655 $l->[AGG_MAX_IN] = $b;
656 $l->[AGG_OUT_BYTES] .= $out_bytes;
657 } else {
658 # AGG_MIN_IN => 0,
659 # AGG_MAX_IN => 1,
660 # AGG_OUT_BYTES => 2,
661 # AGG_NEXT => 3,
662 # AGG_IN_LEN => 4,
663 # AGG_OUT_LEN => 5,
664 # AGG_FALLBACK => 6,
665 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
666 # (only gains .6% on euc-jp -- is it worth it?)
667 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
668 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
669 $agg_fallback = $fallback];
670 }
671 if (exists $next->{Cname}) {
672 $next->{'Forward'} = 1 if $next != $a;
673 } else {
674 process(sprintf("%s_%02x",$name,$b),$next);
675 }
676 }
677 # encengine.c rules say that last entry must be for 255
678 if ($agg_max_in < 255) {
679 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
680 }
681 $a->{'Entries'} = \@ent;
682}
683
684sub outtable
685{
686 my ($fh,$a) = @_;
687 my $name = $a->{'Cname'};
688 # String tables
689 foreach my $b (@{$a->{'Entries'}})
690 {
691 next unless $b->[AGG_OUT_LEN];
692 my $s = $b->[AGG_MIN_IN];
693 my $e = $b->[AGG_MAX_IN];
694 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
695 }
696 if ($a->{'Forward'})
697 {
698 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
699 }
700 $a->{'Done'} = 1;
701 foreach my $b (@{$a->{'Entries'}})
702 {
703 my ($s,$e,$out,$t,$end,$l) = @$b;
704 outtable($fh,$t) unless $t->{'Done'};
705 }
706 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
707 foreach my $b (@{$a->{'Entries'}})
708 {
709 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
710 $end |= 0x80 if $fb;
711 print $fh "{";
712 if ($l)
713 {
714 printf $fh outstring($fh,'',$out);
715 }
716 else
717 {
718 print $fh "0";
719 }
720 print $fh ",",$t->{Cname};
721 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
722 }
723 print $fh "};\n";
724}
725
726sub output
727{
728 my ($fh,$name,$a) = @_;
729 process($name,$a);
730 # Sub-tables
731 outtable($fh,$a);
732}
733
734sub output_enc
735{
736 my ($fh,$name,$a) = @_;
737 die "Changed - fix me for new structure";
738 foreach my $b (sort keys %$a)
739 {
740 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
741 }
742}
743
744sub decode_U
745{
746 my $s = shift;
747}
748
749my @uname;
750sub char_names
751{
752 my $s = do "unicore/Name.pl";
753 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
754 pos($s) = 0;
755 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
756 {
757 my $name = $3;
758 my $s = hex($1);
759 last if $s >= 0x10000;
760 my $e = length($2) ? hex($2) : $s;
761 for (my $i = $s; $i <= $e; $i++)
762 {
763 $uname[$i] = $name;
764# print sprintf("U%04X $name\n",$i);
765 }
766 }
767}
768
769sub output_ucm_page
770{
771 my ($cmap,$a,$t,$pre) = @_;
772 # warn sprintf("Page %x\n",$pre);
773 my $raw = $t->{Raw};
774 foreach my $key (sort keys %$raw) {
775 # RAW_NEXT => 0,
776 # RAW_IN_LEN => 1,
777 # RAW_OUT_BYTES => 2,
778 # RAW_FALLBACK => 3,
779 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
780 my $u = ord $key;
781 $fallback ||= 0;
782
783 if ($next != $a && $next != $t) {
784 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
785 } elsif (length $out_bytes) {
786 if ($pre) {
787 $u = $pre|($u &0x3f);
788 }
789 my $s = sprintf "<U%04X> ",$u;
790 #foreach my $c (split(//,$out_bytes)) {
791 # $s .= sprintf "\\x%02X",ord($c);
792 #}
793 # 9.5% faster changing that loop to this:
794 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
795 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
796 push(@$cmap,$s);
797 } else {
798 warn join(',',$u, @{$raw->{$key}},$a,$t);
799 }
800 }
801}
802
803sub output_ucm
804{
805 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
806 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
807 print $fh "<code_set_name> \"$name\"\n";
808 char_names();
809 if (defined $min_el)
810 {
811 print $fh "<mb_cur_min> $min_el\n";
812 }
813 if (defined $max_el)
814 {
815 print $fh "<mb_cur_max> $max_el\n";
816 }
817 if (defined $rep)
818 {
819 print $fh "<subchar> ";
820 foreach my $c (split(//,$rep))
821 {
822 printf $fh "\\x%02X",ord($c);
823 }
824 print $fh "\n";
825 }
826 my @cmap;
827 output_ucm_page(\@cmap,$h,$h,0);
828 print $fh "#\nCHARMAP\n";
829 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
830 {
831 print $fh $line;
832 }
833 print $fh "END CHARMAP\n";
834}
835
3ef515df 836use vars qw(
837 $_Enc2xs
838 $_Version
839 $_Inc
840 $_Name
841 $_TableFiles
842 $_Now
843);
844
67d7b5ef 845sub make_makefile_pl
846{
847 eval { require Encode; };
848 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
3ef515df 849 # our used for variable expanstion
850 $_Enc2xs = $0;
851 $_Version = $VERSION;
852 $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
853 $_Name = shift;
854 $_TableFiles = join(",", map {qq('$_')} @_);
855 $_Now = scalar localtime();
856 warn "Generating Makefile.PL\n";
857 _print_expand("$_Inc/Makefile_PL.e2x", "Makefile.PL");
858 warn "Generating $_Name.pm\n";
859 _print_expand("$_Inc/_PM.e2x", "$_Name.pm");
860 warn "Generating t/$_Name.t\n";
861 _print_expand("$_Inc/_T.e2x", "t/$_Name.t");
862 warn "Generating README\n";
863 _print_expand("$_Inc/README.e2x", "README");
864 warn "Generating t/$_Name.t\n";
865 _print_expand("$_Inc/Changes.e2x", "Changes");
866 exit;
867}
868
869sub _print_expand{
67d7b5ef 870 eval { require File::Basename; };
871 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
872 File::Basename->import();
3ef515df 873 my ($src, $dst) = @_;
874 open my $in, $src or die "$src : $!";
875 if ((my $d = dirname($dst)) ne '.'){
876 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
877 }
878 open my $out, ">$dst" or die "$!";
879 my $asis = 0;
880 while (<$in>){
881 if (/^#### END_OF_HEADER/){
882 $asis = 1; next;
883 }
884 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
885 print $out $_;
67d7b5ef 886 }
67d7b5ef 887}
67d7b5ef 888__END__
889
890=head1 NAME
891
892enc2xs -- Perl Encode Module Generator
893
894=head1 SYNOPSIS
895
896 enc2xs -M ModName mapfiles...
897 enc2xs -[options]
898
899=head1 DESCRIPTION
900
901F<enc2xs> builds a Perl extension for use by Encode from either
902Unicode Character Mapping files (.ucm) or Tcl Encoding Files
903(.enc) Besides internally used during the build process of Encode
904module, you can use F<enc2xs> to add your own encoding to perl. No
905knowledge on XS is necessary.
906
907=head1 Quick Guide
908
909If what you want to know as little about Perl possible but needs to
910add a new encoding, just read this chapter and forget the rest.
911
912=over 4
913
914=item 0.
915
916Have a .ucm file ready. You can get it from somewhere or you can
917write your own from scratch or you can grab one from Encode
918distribution and customize. For UCM format, see the next Chapter.
919In the example below, I'll call my theoretical encoding myascii,
920defined inI<my.ucm>. C<$> is a shell prompt.
921
922 $ ls -F
923 my.ucm
924
925=item 1.
926
927Issue a command as follows;
928
929 $ enc2xs -M My my.ucm
3ef515df 930 generating Makefile.PL
931 generating My.pm
932 generating README
933 generating Changes
67d7b5ef 934
935Now take a look at your current directory. It should look like this.
936
937 $ ls -F
938 Makefile.PL My.pm my.ucm t/
939
940The following files are created.
941
942 Makefle.PL - MakeMaker script
943 My.pm - Encode Submodule
944 t/My.t - test file
945
037b88d6 946=item 1.1.
947
948If you want *.ucm installed together with the modules, do as follows;
949
950 $ mkdir Encode
951 $ mv *.ucm Encode
952 $ enc2xs -M My Encode/*ucm
953
67d7b5ef 954=item 2.
955
956Edit the files generated. You don't have to if you have no time AND no
957intention to give it to someone else. But it is a good idea to edit
958pod and add more tests.
959
960=item 3.
961
962Now issue a command all Perl Mongers love;
963
964 $ perl5.7.3 Makefile.PL
965 Writing Makefile for Encode::My
966
967=item 4.
968
969Now all you have to do is make.
970
971 $ make
972 cp My.pm blib/lib/Encode/My.pm
973 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
974 -o encode_t.c -f encode_t.fnm
975 Reading myascii (myascii)
976 Writing compiled form
977 128 bytes in string tables
978 384 bytes (25%) saved spotting duplicates
979 1 bytes (99.2%) saved using substrings
980 ....
981 chmod 644 blib/arch/auto/Encode/My/My.bs
982 $
983
984The time it takes varies how fast your machine is and how large your
985encoding is. Unless you are working on something big like euc-tw, it
986won't take too long.
987
988=item 5.
989
990You can "make install" already but you should test first.
991
992 $ make test
993 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
994 -e 'use Test::Harness qw(&runtests $verbose); \
995 $verbose=0; runtests @ARGV;' t/*.t
996 t/My....ok
997 All tests successful.
998 Files=1, Tests=2, 0 wallclock secs
999 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1000
1001=item 6.
1002
1003If you are content with the test result, just "make install"
1004
1005=back
1006
1007=head1 The Unicode Character Map
1008
1009Encode uses The Unicode Character Map (UCM) for source character
1010mappings. This format is used by ICU package of IBM and adopted by
1011Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map
1012and far more user-friendly, This is the recommended formet for
1013Encode now.
1014
1015UCM file looks like this.
1016
1017 #
1018 # Comments
1019 #
1020 <code_set_name> "US-ascii" # Required
1021 <code_set_alias> "ascii" # Optional
1022 <mb_cur_min> 1 # Required; usually 1
1023 <mb_cur_max> 1 # Max. # of bytes/char
1024 <subchar> \x3F # Substitution char
1025 #
1026 CHARMAP
1027 <U0000> \x00 |0 # <control>
1028 <U0001> \x01 |0 # <control>
1029 <U0002> \x02 |0 # <control>
1030 ....
1031 <U007C> \x7C |0 # VERTICAL LINE
1032 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1033 <U007E> \x7E |0 # TILDE
1034 <U007F> \x7F |0 # <control>
1035 END CHARMAP
1036
1037=over 4
1038
1039=item *
1040
1041Anything that follows C<#> is treated as comments.
1042
1043=item *
1044
1045The header section continues until CHARMAP. This section Has a form of
1046I<E<lt>keywordE<gt> value>, one at a line. For a value, strings must
1047be quoted. Barewords are treated as numbers. I<\xXX> represents a
1048byte.
1049
1050Most of the keywords are self-explanatory. I<subchar> means
1051substitution character, not subcharacter. When you decode a Unicode
1052sequence to this encoding but no matching character is found, the byte
1053sequence defined here will be used. For most cases, the value here is
1054\x3F, in ASCII this is a question mark.
1055
1056=item *
1057
1058CHARMAP starts the character map section. Each line has a form as
1059follows;
1060
1061 <UXXXX> \xXX.. |0 # comment
1062 ^ ^ ^
1063 | | +- Fallback flag
1064 | +-------- Encoded byte sequence
1065 +-------------- Unicode Character ID in hex
1066
1067The format is roughly the same as a header section except for fallback
1068flag. It is | followed by 0..3. And their meaning as follows
1069
1070=over 2
1071
1072=item |0
1073
1074Round trip safe. A character decoded to Unicode encodes back to the
1075same byte sequence. most character belong to this.
1076
1077=item |1
1078
1079Fallback for unicode -> encoding. When seen, enc2xs adds this
1080character for encode map only
1081
1082=item |2
1083
1084Skip sub-char mapping should there be no code point.
1085
1086=item |3
1087
1088Fallback for encoding -> unicode. When seen, enc2xs adds this
1089character for decode map only
1090
1091=back
1092
1093=item *
1094
1095And finally, END OF CHARMAP ends the section.
1096
1097=back
1098
1099Needless to say, if you are manually creating a UCM file, you should
1100copy ascii.ucm or existing encoding which is close to yours than write
1101your own from scratch.
1102
1103When you do so, make sure you leave at least B<U0000> to B<U0020> as
1104is, unless your environment is on EBCDIC.
1105
1106B<CAVEAT>: not all features in UCM are implemented. For example,
1107icu:state is not used. Because of that, you need to write a perl
1108module if you want to support algorithmical encodings, notablly
1109ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1110L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1111
1112=head1 Bookmarks
1113
1114ICU Home Page
1115L<http://oss.software.ibm.com/icu/>
1116
1117ICU Character Mapping Tables
1118L<http://oss.software.ibm.com/icu/charset/>
1119
1120ICU:Conversion Data
1121L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1122
1123=head1 SEE ALSO
1124
1125L<Encode>,
1126L<perlmod>,
1127L<perlpod>
1128
1129=cut
1130
1131# -Q to disable the duplicate codepoint test
1132# -S make mapping errors fatal
1133# -q to remove comments written to output files
1134# -O to enable the (brute force) substring optimiser
1135# -o <output> to specify the output file name (else it's the first arg)
1136# -f <inlist> to give a file with a list of input files (else use the args)
1137# -n <name> to name the encoding (else use the basename of the input file.
1138
1139With %seen holding array refs:
1140
1141 865.66 real 28.80 user 8.79 sys
1142 7904 maximum resident set size
1143 1356 average shared memory size
1144 18566 average unshared data size
1145 229 average unshared stack size
1146 46080 page reclaims
1147 33373 page faults
1148
1149With %seen holding simple scalars:
1150
1151 342.16 real 27.11 user 3.54 sys
1152 8388 maximum resident set size
1153 1394 average shared memory size
1154 14969 average unshared data size
1155 236 average unshared stack size
1156 28159 page reclaims
1157 9839 page faults
1158
1159Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1160how %seen is storing things its seen. So it is pathalogically bad on a 16M
1161RAM machine, but it's going to help even on modern machines.
1162Swapping is bad, m'kay :-)