Linux and Solaris hints: C++ vs dlerror
[p5sagit/p5-mst-13.2.git] / ext / Encode / bin / enc2xs
CommitLineData
3ef515df 1#!./perl
67d7b5ef 2BEGIN {
a999c27c 3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
4 # with $ENV{PERL_CORE} set
5 # In case we need it in future...
6 require Config; import Config;
67d7b5ef 7}
8use strict;
b536bf57 9use warnings;
67d7b5ef 10use Getopt::Std;
f49b0b20 11use Config;
67d7b5ef 12my @orig_ARGV = @ARGV;
f49b0b20 13our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
67d7b5ef 14
15# These may get re-ordered.
16# RAW is a do_now as inserted by &enter
17# AGG is an aggreagated do_now, as built up by &process
a999c27c 18
67d7b5ef 19use constant {
20 RAW_NEXT => 0,
21 RAW_IN_LEN => 1,
22 RAW_OUT_BYTES => 2,
23 RAW_FALLBACK => 3,
24
25 AGG_MIN_IN => 0,
26 AGG_MAX_IN => 1,
27 AGG_OUT_BYTES => 2,
28 AGG_NEXT => 3,
29 AGG_IN_LEN => 4,
30 AGG_OUT_LEN => 5,
31 AGG_FALLBACK => 6,
32};
a999c27c 33
67d7b5ef 34# (See the algorithm in encengine.c - we're building structures for it)
35
36# There are two sorts of structures.
37# "do_now" (an array, two variants of what needs storing) is whatever we need
38# to do now we've read an input byte.
39# It's housed in a "do_next" (which is how we got to it), and in turn points
40# to a "do_next" which contains all the "do_now"s for the next input byte.
41
42# There will be a "do_next" which is the start state.
43# For a single byte encoding it's the only "do_next" - each "do_now" points
44# back to it, and each "do_now" will cause bytes. There is no state.
45
46# For a multi-byte encoding where all characters in the input are the same
47# length, then there will be a tree of "do_now"->"do_next"->"do_now"
48# branching out from the start state, one step for each input byte.
49# The leaf "do_now"s will all be at the same distance from the start state,
50# only the leaf "do_now"s cause output bytes, and they in turn point back to
51# the start state.
52
53# For an encoding where there are varaible length input byte sequences, you
54# will encounter a leaf "do_now" sooner for the shorter input sequences, but
55# as before the leaves will point back to the start state.
56
57# The system will cope with escape encodings (imagine them as a mostly
58# self-contained tree for each escape state, and cross links between trees
59# at the state-switching characters) but so far no input format defines these.
60
61# The system will also cope with having output "leaves" in the middle of
62# the bifurcating branches, not just at the extremities, but again no
63# input format does this yet.
64
65# There are two variants of the "do_now" structure. The first, smaller variant
66# is generated by &enter as the input file is read. There is one structure
67# for each input byte. Say we are mapping a single byte encoding to a
68# single byte encoding, with "ABCD" going "abcd". There will be
69# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
70
71# &process then walks the tree, building aggregate "do_now" structres for
72# adjacent bytes where possible. The aggregate is for a contiguous range of
73# bytes which each produce the same length of output, each move to the
74# same next state, and each have the same fallback flag.
75# So our 4 RAW "do_now"s above become replaced by a single structure
76# containing:
77# ["A", "D", "abcd", 1, ...]
78# ie, for an input byte $_ in "A".."D", output 1 byte, found as
79# substr ("abcd", (ord $_ - ord "A") * 1, 1)
80# which maps very nicely into pointer arithmetic in C for encengine.c
81
82sub encode_U
83{
84 # UTF-8 encode long hand - only covers part of perl's range
85 ## my $uv = shift;
86 # chr() works in native space so convert value from table
87 # into that space before using chr().
88 my $ch = chr(utf8::unicode_to_native($_[0]));
89 # Now get core perl to encode that the way it likes.
90 utf8::encode($ch);
91 return $ch;
92}
93
94sub encode_S
95{
96 # encode single byte
97 ## my ($ch,$page) = @_; return chr($ch);
98 return chr $_[0];
99}
100
101sub encode_D
102{
103 # encode double byte MS byte first
104 ## my ($ch,$page) = @_; return chr($page).chr($ch);
105 return chr ($_[1]) . chr $_[0];
106}
107
108sub encode_M
109{
110 # encode Multi-byte - single for 0..255 otherwise double
111 ## my ($ch,$page) = @_;
112 ## return &encode_D if $page;
113 ## return &encode_S;
114 return chr ($_[1]) . chr $_[0] if $_[1];
115 return chr $_[0];
116}
117
118my %encode_types = (U => \&encode_U,
119 S => \&encode_S,
120 D => \&encode_D,
121 M => \&encode_M,
122 );
123
124# Win32 does not expand globs on command line
125eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
126
127my %opt;
128# I think these are:
129# -Q to disable the duplicate codepoint test
130# -S make mapping errors fatal
131# -q to remove comments written to output files
132# -O to enable the (brute force) substring optimiser
133# -o <output> to specify the output file name (else it's the first arg)
134# -f <inlist> to give a file with a list of input files (else use the args)
135# -n <name> to name the encoding (else use the basename of the input file.
aae85ceb 136getopts('CM:SQqOo:f:n:',\%opt);
67d7b5ef 137
138$opt{M} and make_makefile_pl($opt{M}, @ARGV);
aae85ceb 139$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
67d7b5ef 140
141# This really should go first, else the die here causes empty (non-erroneous)
142# output files to be written.
143my @encfiles;
144if (exists $opt{'f'}) {
145 # -F is followed by name of file containing list of filenames
146 my $flist = $opt{'f'};
147 open(FLIST,$flist) || die "Cannot open $flist:$!";
148 chomp(@encfiles = <FLIST>);
149 close(FLIST);
150} else {
151 @encfiles = @ARGV;
152}
153
154my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
155chmod(0666,$cname) if -f $cname && !-w $cname;
156open(C,">$cname") || die "Cannot open $cname:$!";
157
158my $dname = $cname;
159my $hname = $cname;
160
161my ($doC,$doEnc,$doUcm,$doPet);
162
0e4142c9 163if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
67d7b5ef 164 {
165 $doC = 1;
e7cbefb8 166 $dname =~ s/(\.[^\.]*)?$/.exh/;
67d7b5ef 167 chmod(0666,$dname) if -f $cname && !-w $dname;
168 open(D,">$dname") || die "Cannot open $dname:$!";
169 $hname =~ s/(\.[^\.]*)?$/.h/;
170 chmod(0666,$hname) if -f $cname && !-w $hname;
171 open(H,">$hname") || die "Cannot open $hname:$!";
172
173 foreach my $fh (\*C,\*D,\*H)
174 {
175 print $fh <<"END" unless $opt{'q'};
176/*
177 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
178 This file was autogenerated by:
179 $^X $0 @orig_ARGV
f49b0b20 180 enc2xs VERSION $VERSION
67d7b5ef 181*/
182END
183 }
184
185 if ($cname =~ /(\w+)\.xs$/)
186 {
187 print C "#include <EXTERN.h>\n";
188 print C "#include <perl.h>\n";
189 print C "#include <XSUB.h>\n";
190 print C "#define U8 U8\n";
191 }
b536bf57 192 print C "#include \"encode.h\"\n\n";
67d7b5ef 193
194 }
195elsif ($cname =~ /\.enc$/)
196 {
197 $doEnc = 1;
198 }
199elsif ($cname =~ /\.ucm$/)
200 {
201 $doUcm = 1;
202 }
203elsif ($cname =~ /\.pet$/)
204 {
205 $doPet = 1;
206 }
207
208my %encoding;
209my %strings;
b536bf57 210my $string_acc;
211my %strings_in_acc;
212
67d7b5ef 213my $saved = 0;
214my $subsave = 0;
215my $strings = 0;
216
217sub cmp_name
218{
219 if ($a =~ /^.*-(\d+)/)
220 {
221 my $an = $1;
222 if ($b =~ /^.*-(\d+)/)
223 {
224 my $r = $an <=> $1;
225 return $r if $r;
226 }
227 }
228 return $a cmp $b;
229}
230
231
232foreach my $enc (sort cmp_name @encfiles)
233 {
234 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
235 $name = $opt{'n'} if exists $opt{'n'};
236 if (open(E,$enc))
237 {
238 if ($sfx eq 'enc')
239 {
240 compile_enc(\*E,lc($name));
241 }
242 else
243 {
244 compile_ucm(\*E,lc($name));
245 }
246 }
247 else
248 {
249 warn "Cannot open $enc for $name:$!";
250 }
251 }
252
253if ($doC)
254 {
255 print STDERR "Writing compiled form\n";
256 foreach my $name (sort cmp_name keys %encoding)
257 {
258 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
b536bf57 259 process($name.'_utf8',$e2u);
260 addstrings(\*C,$e2u);
261
262 process('utf8_'.$name,$u2e);
263 addstrings(\*C,$u2e);
264 }
265 outbigstring(\*C,"enctable");
266 foreach my $name (sort cmp_name keys %encoding)
267 {
268 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
269 outtable(\*C,$e2u, "enctable");
270 outtable(\*C,$u2e, "enctable");
271
b2704119 272 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
67d7b5ef 273 }
274 foreach my $enc (sort cmp_name keys %encoding)
275 {
b2704119 276 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
277 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
278 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
279 my $replen = 0;
280 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
67d7b5ef 281 my $sym = "${enc}_encoding";
282 $sym =~ s/\W+/_/g;
7dd03145 283 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
d1256cb1 284 $min_el,$max_el);
7dd03145 285 print C "static const U8 ${sym}_rep_character[] = \"$rep\";\n";
286 print C "static const char ${sym}_enc_name[] = \"$enc\";\n\n";
0629a5b3 287 print C "const encode_t $sym = \n";
f0a41339 288 # This is to make null encoding work -- dankogai
289 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
d1256cb1 290 $info[$i] ||= 1;
f0a41339 291 }
292 # end of null tweak -- dankogai
7dd03145 293 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
67d7b5ef 294 }
295
296 foreach my $enc (sort cmp_name keys %encoding)
297 {
298 my $sym = "${enc}_encoding";
299 $sym =~ s/\W+/_/g;
300 print H "extern encode_t $sym;\n";
301 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
302 }
303
304 if ($cname =~ /(\w+)\.xs$/)
305 {
306 my $mod = $1;
307 print C <<'END';
308
309static void
310Encode_XSEncoding(pTHX_ encode_t *enc)
311{
312 dSP;
313 HV *stash = gv_stashpv("Encode::XS", TRUE);
314 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
315 int i = 0;
316 PUSHMARK(sp);
317 XPUSHs(sv);
318 while (enc->name[i])
319 {
320 const char *name = enc->name[i++];
321 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
322 }
323 PUTBACK;
324 call_pv("Encode::define_encoding",G_DISCARD);
325 SvREFCNT_dec(sv);
326}
327
328END
329
330 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
331 print C "BOOT:\n{\n";
332 print C "#include \"$dname\"\n";
333 print C "}\n";
334 }
335 # Close in void context is bad, m'kay
336 close(D) or warn "Error closing '$dname': $!";
337 close(H) or warn "Error closing '$hname': $!";
338
cf9f87ce 339 my $perc_saved = $saved/($strings + $saved) * 100;
340 my $perc_subsaved = $subsave/($strings + $subsave) * 100;
67d7b5ef 341 printf STDERR "%d bytes in string tables\n",$strings;
342 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
343 $saved, $perc_saved if $saved;
344 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
345 $subsave, $perc_subsaved if $subsave;
346 }
347elsif ($doEnc)
348 {
349 foreach my $name (sort cmp_name keys %encoding)
350 {
351 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
352 output_enc(\*C,$name,$e2u);
353 }
354 }
355elsif ($doUcm)
356 {
357 foreach my $name (sort cmp_name keys %encoding)
358 {
359 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
360 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
361 }
362 }
363
364# writing half meg files and then not checking to see if you just filled the
365# disk is bad, m'kay
366close(C) or die "Error closing '$cname': $!";
367
368# End of the main program.
369
370sub compile_ucm
371{
372 my ($fh,$name) = @_;
373 my $e2u = {};
374 my $u2e = {};
375 my $cs;
376 my %attr;
377 while (<$fh>)
378 {
379 s/#.*$//;
380 last if /^\s*CHARMAP\s*$/i;
381 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
382 {
383 $attr{$1} = $2;
384 }
385 }
386 if (!defined($cs = $attr{'code_set_name'}))
387 {
388 warn "No <code_set_name> in $name\n";
389 }
390 else
391 {
392 $name = $cs unless exists $opt{'n'};
393 }
394 my $erep;
395 my $urep;
396 my $max_el;
397 my $min_el;
398 if (exists $attr{'subchar'})
399 {
b2704119 400 #my @byte;
401 #$attr{'subchar'} =~ /^\s*/cg;
402 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
403 #$erep = join('',map(chr(hex($_)),@byte));
404 $erep = $attr{'subchar'};
405 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
67d7b5ef 406 }
407 print "Reading $name ($cs)\n";
408 my $nfb = 0;
409 my $hfb = 0;
410 while (<$fh>)
411 {
412 s/#.*$//;
413 last if /^\s*END\s+CHARMAP\s*$/i;
414 next if /^\s*$/;
a999c27c 415 my (@uni, @byte) = ();
416 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
417 or die "Bad line: $_";
418 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
419 push @uni, map { substr($_, 1) } split(/\+/, $1);
420 }
421 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
422 push @byte, $1;
423 }
424 if (@uni)
67d7b5ef 425 {
a999c27c 426 my $uch = join('', map { encode_U(hex($_)) } @uni );
67d7b5ef 427 my $ech = join('',map(chr(hex($_)),@byte));
428 my $el = length($ech);
429 $max_el = $el if (!defined($max_el) || $el > $max_el);
430 $min_el = $el if (!defined($min_el) || $el < $min_el);
431 if (length($fb))
432 {
433 $fb = substr($fb,1);
434 $hfb++;
435 }
436 else
437 {
438 $nfb++;
439 $fb = '0';
440 }
441 # $fb is fallback flag
442 # 0 - round trip safe
443 # 1 - fallback for unicode -> enc
444 # 2 - skip sub-char mapping
445 # 3 - fallback enc -> unicode
446 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
447 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
448 }
449 else
450 {
451 warn $_;
452 }
453 }
454 if ($nfb && $hfb)
455 {
456 die "$nfb entries without fallback, $hfb entries with\n";
457 }
458 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
459}
460
461
462
463sub compile_enc
464{
465 my ($fh,$name) = @_;
466 my $e2u = {};
467 my $u2e = {};
468
469 my $type;
470 while ($type = <$fh>)
471 {
472 last if $type !~ /^\s*#/;
473 }
474 chomp($type);
475 return if $type eq 'E';
476 # Do the hash lookup once, rather than once per function call. 4% speedup.
477 my $type_func = $encode_types{$type};
478 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
479 warn "$type encoded $name\n";
480 my $rep = '';
481 # Save a defined test by setting these to defined values.
482 my $min_el = ~0; # A very big integer
483 my $max_el = 0; # Anything must be longer than 0
484 {
485 my $v = hex($def);
486 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
487 }
488 my $errors;
489 my $seen;
490 # use -Q to silence the seen test. Makefile.PL uses this by default.
491 $seen = {} unless $opt{Q};
492 do
493 {
494 my $line = <$fh>;
495 chomp($line);
496 my $page = hex($line);
497 my $ch = 0;
498 my $i = 16;
499 do
500 {
501 # So why is it 1% faster to leave the my here?
502 my $line = <$fh>;
503 $line =~ s/\r\n$/\n/;
504 die "$.:${line}Line should be exactly 65 characters long including
505 newline (".length($line).")" unless length ($line) == 65;
506 # Split line into groups of 4 hex digits, convert groups to ints
507 # This takes 65.35
508 # map {hex $_} $line =~ /(....)/g
509 # This takes 63.75 (2.5% less time)
510 # unpack "n*", pack "H*", $line
511 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
512 # Doing it as while ($line =~ /(....)/g) took 74.63
513 foreach my $val (unpack "n*", pack "H*", $line)
514 {
515 next if $val == 0xFFFD;
516 my $ech = &$type_func($ch,$page);
517 if ($val || (!$ch && !$page))
518 {
519 my $el = length($ech);
520 $max_el = $el if $el > $max_el;
521 $min_el = $el if $el < $min_el;
522 my $uch = encode_U($val);
523 if ($seen) {
524 # We're doing the test.
525 # We don't need to read this quickly, so storing it as a scalar,
526 # rather than 3 (anon array, plus the 2 scalars it holds) saves
527 # RAM and may make us faster on low RAM systems. [see __END__]
528 if (exists $seen->{$uch})
529 {
530 warn sprintf("U%04X is %02X%02X and %04X\n",
531 $val,$page,$ch,$seen->{$uch});
532 $errors++;
533 }
534 else
535 {
536 $seen->{$uch} = $page << 8 | $ch;
537 }
538 }
539 # Passing 2 extra args each time is 3.6% slower!
540 # Even with having to add $fallback ||= 0 later
541 enter_fb0($e2u,$ech,$uch);
542 enter_fb0($u2e,$uch,$ech);
543 }
544 else
545 {
546 # No character at this position
547 # enter($e2u,$ech,undef,$e2u);
548 }
549 $ch++;
550 }
551 } while --$i;
552 } while --$pages;
553 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
554 if $min_el > $max_el;
555 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
556 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
557}
558
559# my ($a,$s,$d,$t,$fb) = @_;
560sub enter {
561 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
562 # state we shift to after this (multibyte) input character defaults to same
563 # as current state.
564 $next ||= $current;
565 # Making sure it is defined seems to be faster than {no warnings;} in
566 # &process, or passing it in as 0 explicity.
567 # XXX $fallback ||= 0;
568
569 # Start at the beginning and work forwards through the string to zero.
570 # effectively we are removing 1 character from the front each time
571 # but we don't actually edit the string. [this alone seems to be 14% speedup]
572 # Hence -$pos is the length of the remaining string.
573 my $pos = -length $inbytes;
574 while (1) {
575 my $byte = substr $inbytes, $pos, 1;
576 # RAW_NEXT => 0,
577 # RAW_IN_LEN => 1,
578 # RAW_OUT_BYTES => 2,
579 # RAW_FALLBACK => 3,
580 # to unicode an array would seem to be better, because the pages are dense.
581 # from unicode can be very sparse, favouring a hash.
582 # hash using the bytes (all length 1) as keys rather than ord value,
583 # as it's easier to sort these in &process.
584
585 # It's faster to always add $fallback even if it's undef, rather than
586 # choosing between 3 and 4 element array. (hence why we set it defined
587 # above)
588 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
589 # When $pos was -1 we were at the last input character.
590 unless (++$pos) {
591 $do_now->[RAW_OUT_BYTES] = $outbytes;
592 $do_now->[RAW_NEXT] = $next;
593 return;
594 }
595 # Tail recursion. The intermdiate state may not have a name yet.
596 $current = $do_now->[RAW_NEXT];
597 }
598}
599
600# This is purely for optimistation. It's just &enter hard coded for $fallback
601# of 0, using only a 3 entry array ref to save memory for every entry.
602sub enter_fb0 {
603 my ($current,$inbytes,$outbytes,$next) = @_;
604 $next ||= $current;
605
606 my $pos = -length $inbytes;
607 while (1) {
608 my $byte = substr $inbytes, $pos, 1;
609 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
610 unless (++$pos) {
611 $do_now->[RAW_OUT_BYTES] = $outbytes;
612 $do_now->[RAW_NEXT] = $next;
613 return;
614 }
615 $current = $do_now->[RAW_NEXT];
616 }
617}
618
67d7b5ef 619sub process
620{
621 my ($name,$a) = @_;
622 $name =~ s/\W+/_/g;
623 $a->{Cname} = $name;
624 my $raw = $a->{Raw};
625 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
626 my @ent;
627 $agg_max_in = 0;
628 foreach my $key (sort keys %$raw) {
629 # RAW_NEXT => 0,
630 # RAW_IN_LEN => 1,
631 # RAW_OUT_BYTES => 2,
632 # RAW_FALLBACK => 3,
633 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
634 # Now we are converting from raw to aggregate, switch from 1 byte strings
635 # to numbers
636 my $b = ord $key;
637 $fallback ||= 0;
638 if ($l &&
639 # If this == fails, we're going to reset $agg_max_in below anyway.
640 $b == ++$agg_max_in &&
641 # References in numeric context give the pointer as an int.
642 $agg_next == $next &&
643 $agg_in_len == $in_len &&
644 $agg_out_len == length $out_bytes &&
645 $agg_fallback == $fallback
646 # && length($l->[AGG_OUT_BYTES]) < 16
647 ) {
648 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
649 # we can aggregate this byte onto the end.
650 $l->[AGG_MAX_IN] = $b;
651 $l->[AGG_OUT_BYTES] .= $out_bytes;
652 } else {
653 # AGG_MIN_IN => 0,
654 # AGG_MAX_IN => 1,
655 # AGG_OUT_BYTES => 2,
656 # AGG_NEXT => 3,
657 # AGG_IN_LEN => 4,
658 # AGG_OUT_LEN => 5,
659 # AGG_FALLBACK => 6,
660 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
661 # (only gains .6% on euc-jp -- is it worth it?)
662 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
663 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
664 $agg_fallback = $fallback];
665 }
666 if (exists $next->{Cname}) {
667 $next->{'Forward'} = 1 if $next != $a;
668 } else {
669 process(sprintf("%s_%02x",$name,$b),$next);
670 }
671 }
672 # encengine.c rules say that last entry must be for 255
673 if ($agg_max_in < 255) {
674 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
675 }
676 $a->{'Entries'} = \@ent;
677}
678
b536bf57 679
680sub addstrings
67d7b5ef 681{
682 my ($fh,$a) = @_;
683 my $name = $a->{'Cname'};
684 # String tables
685 foreach my $b (@{$a->{'Entries'}})
686 {
687 next unless $b->[AGG_OUT_LEN];
b536bf57 688 $strings{$b->[AGG_OUT_BYTES]} = undef;
67d7b5ef 689 }
690 if ($a->{'Forward'})
691 {
9f20e2cf 692 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
f49b0b20 693 my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static';
694 my $const = $cpp ? '' : 'const';
695 print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
b536bf57 696 }
697 $a->{'DoneStrings'} = 1;
698 foreach my $b (@{$a->{'Entries'}})
699 {
700 my ($s,$e,$out,$t,$end,$l) = @$b;
701 addstrings($fh,$t) unless $t->{'DoneStrings'};
67d7b5ef 702 }
b536bf57 703}
704
705sub outbigstring
706{
707 my ($fh,$name) = @_;
708
709 $string_acc = '';
710
711 # Make the big string in the string accumulator. Longest first, on the hope
712 # that this makes it more likely that we find the short strings later on.
713 # Not sure if it helps sorting strings of the same length lexcically.
714 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
715 my $index = index $string_acc, $s;
716 if ($index >= 0) {
717 $saved += length($s);
718 $strings_in_acc{$s} = $index;
719 } else {
720 OPTIMISER: {
d1256cb1 721 if ($opt{'O'}) {
722 my $sublength = length $s;
723 while (--$sublength > 0) {
724 # progressively lop characters off the end, to see if the start of
725 # the new string overlaps the end of the accumulator.
726 if (substr ($string_acc, -$sublength)
727 eq substr ($s, 0, $sublength)) {
728 $subsave += $sublength;
729 $strings_in_acc{$s} = length ($string_acc) - $sublength;
730 # append the last bit on the end.
731 $string_acc .= substr ($s, $sublength);
732 last OPTIMISER;
733 }
734 # or if the end of the new string overlaps the start of the
735 # accumulator
736 next unless substr ($string_acc, 0, $sublength)
737 eq substr ($s, -$sublength);
738 # well, the last $sublength characters of the accumulator match.
739 # so as we're prepending to the accumulator, need to shift all our
740 # existing offsets forwards
741 $_ += $sublength foreach values %strings_in_acc;
742 $subsave += $sublength;
743 $strings_in_acc{$s} = 0;
744 # append the first bit on the start.
745 $string_acc = substr ($s, 0, -$sublength) . $string_acc;
746 last OPTIMISER;
747 }
748 }
749 # Optimiser (if it ran) found nothing, so just going have to tack the
750 # whole thing on the end.
751 $strings_in_acc{$s} = length $string_acc;
752 $string_acc .= $s;
b536bf57 753 };
754 }
755 }
756
757 $strings = length $string_acc;
9f20e2cf 758 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
f49b0b20 759 my $var = $cpp ? '' : 'static';
760 my $definition = "\n$var const U8 $name\[$strings] = { " .
b536bf57 761 join(',',unpack "C*",$string_acc);
762 # We have a single long line. Split it at convenient commas.
763 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
764 print $fh substr ($definition, pos $definition), " };\n";
765}
766
767sub findstring {
768 my ($name,$s) = @_;
769 my $offset = $strings_in_acc{$s};
770 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
771 unless defined $offset;
772 "$name + $offset";
773}
774
775sub outtable
776{
777 my ($fh,$a,$bigname) = @_;
778 my $name = $a->{'Cname'};
67d7b5ef 779 $a->{'Done'} = 1;
780 foreach my $b (@{$a->{'Entries'}})
781 {
782 my ($s,$e,$out,$t,$end,$l) = @$b;
b536bf57 783 outtable($fh,$t,$bigname) unless $t->{'Done'};
67d7b5ef 784 }
9f20e2cf 785 my $cpp = ($Config{d_cplusplus} || '') eq 'define';
f49b0b20 786 my $var = $cpp ? '' : 'static';
787 my $const = $cpp ? '' : 'const';
788 print $fh "\n$var $const encpage_t $name\[",
0629a5b3 789 scalar(@{$a->{'Entries'}}), "] = {\n";
67d7b5ef 790 foreach my $b (@{$a->{'Entries'}})
791 {
792 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
d6f02b51 793 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
67d7b5ef 794 print $fh "{";
795 if ($l)
796 {
b536bf57 797 printf $fh findstring($bigname,$out);
67d7b5ef 798 }
799 else
800 {
801 print $fh "0";
802 }
803 print $fh ",",$t->{Cname};
804 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
805 }
806 print $fh "};\n";
807}
808
67d7b5ef 809sub output_enc
810{
811 my ($fh,$name,$a) = @_;
812 die "Changed - fix me for new structure";
813 foreach my $b (sort keys %$a)
814 {
815 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
816 }
817}
818
819sub decode_U
820{
821 my $s = shift;
822}
823
824my @uname;
825sub char_names
826{
827 my $s = do "unicore/Name.pl";
828 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
829 pos($s) = 0;
830 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
831 {
832 my $name = $3;
833 my $s = hex($1);
834 last if $s >= 0x10000;
835 my $e = length($2) ? hex($2) : $s;
836 for (my $i = $s; $i <= $e; $i++)
837 {
838 $uname[$i] = $name;
839# print sprintf("U%04X $name\n",$i);
840 }
841 }
842}
843
844sub output_ucm_page
845{
846 my ($cmap,$a,$t,$pre) = @_;
847 # warn sprintf("Page %x\n",$pre);
848 my $raw = $t->{Raw};
849 foreach my $key (sort keys %$raw) {
850 # RAW_NEXT => 0,
851 # RAW_IN_LEN => 1,
852 # RAW_OUT_BYTES => 2,
853 # RAW_FALLBACK => 3,
854 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
855 my $u = ord $key;
856 $fallback ||= 0;
857
858 if ($next != $a && $next != $t) {
859 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
860 } elsif (length $out_bytes) {
861 if ($pre) {
862 $u = $pre|($u &0x3f);
863 }
864 my $s = sprintf "<U%04X> ",$u;
865 #foreach my $c (split(//,$out_bytes)) {
866 # $s .= sprintf "\\x%02X",ord($c);
867 #}
868 # 9.5% faster changing that loop to this:
869 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
870 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
871 push(@$cmap,$s);
872 } else {
873 warn join(',',$u, @{$raw->{$key}},$a,$t);
874 }
875 }
876}
877
878sub output_ucm
879{
880 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
881 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
882 print $fh "<code_set_name> \"$name\"\n";
883 char_names();
884 if (defined $min_el)
885 {
886 print $fh "<mb_cur_min> $min_el\n";
887 }
888 if (defined $max_el)
889 {
890 print $fh "<mb_cur_max> $max_el\n";
891 }
892 if (defined $rep)
893 {
894 print $fh "<subchar> ";
895 foreach my $c (split(//,$rep))
896 {
897 printf $fh "\\x%02X",ord($c);
898 }
899 print $fh "\n";
900 }
901 my @cmap;
902 output_ucm_page(\@cmap,$h,$h,0);
903 print $fh "#\nCHARMAP\n";
904 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
905 {
906 print $fh $line;
907 }
908 print $fh "END CHARMAP\n";
909}
910
3ef515df 911use vars qw(
912 $_Enc2xs
913 $_Version
914 $_Inc
b2704119 915 $_E2X
3ef515df 916 $_Name
917 $_TableFiles
918 $_Now
919);
920
b2704119 921sub find_e2x{
b536bf57 922 eval { require File::Find; };
b2704119 923 my (@inc, %e2x_dir);
924 for my $inc (@INC){
d1256cb1 925 push @inc, $inc unless $inc eq '.'; #skip current dir
b2704119 926 }
927 File::Find::find(
d1256cb1 928 sub {
929 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
930 $atime,$mtime,$ctime,$blksize,$blocks)
931 = lstat($_) or return;
932 -f _ or return;
933 if (/^.*\.e2x$/o){
934 no warnings 'once';
935 $e2x_dir{$File::Find::dir} ||= $mtime;
936 }
937 return;
938 }, @inc);
b2704119 939 warn join("\n", keys %e2x_dir), "\n";
940 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
d1256cb1 941 $_E2X = $d;
942 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
943 return $_E2X;
b2704119 944 }
945}
946
67d7b5ef 947sub make_makefile_pl
948{
949 eval { require Encode; };
950 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
3ef515df 951 # our used for variable expanstion
952 $_Enc2xs = $0;
953 $_Version = $VERSION;
b2704119 954 $_E2X = find_e2x();
3ef515df 955 $_Name = shift;
956 $_TableFiles = join(",", map {qq('$_')} @_);
957 $_Now = scalar localtime();
b2704119 958
aae85ceb 959 eval { require File::Spec; };
b2704119 960 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
b2704119 961 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
b2704119 962 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
b2704119 963 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
b2704119 964 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
3ef515df 965 exit;
966}
967
aae85ceb 968use vars qw(
d1256cb1 969 $_ModLines
970 $_LocalVer
971 );
aae85ceb 972
656ebd29 973sub make_configlocal_pm {
aae85ceb 974 eval { require Encode; };
975 $@ and die "Unable to require Encode: $@\n";
976 eval { require File::Spec; };
656ebd29 977
aae85ceb 978 # our used for variable expanstion
656ebd29 979 my %in_core = map { $_ => 1 } (
980 'ascii', 'iso-8859-1', 'utf8',
981 'ascii-ctrl', 'null', 'utf-8-strict'
982 );
aae85ceb 983 my %LocalMod = ();
656ebd29 984 # check @enc;
985 use File::Find ();
986 my $wanted = sub{
987 -f $_ or return;
988 $File::Find::name =~ /\A\./ and return;
989 $File::Find::name =~ /\.pm\z/ or return;
990 $File::Find::name =~ m/\bEncode\b/ or return;
991 my $mod = $File::Find::name;
992 $mod =~ s/.*\bEncode\b/Encode/o;
993 $mod =~ s/\.pm\z//o;
994 $mod =~ s,/,::,og;
995 warn qq{ require $mod;\n};
996 eval qq{ require $mod; };
997 $@ and die "Can't require $mod: $@\n";
998 for my $enc ( Encode->encodings() ) {
999 no warnings;
1000 $in_core{$enc} and next;
1001 $Encode::Config::ExtModule{$enc} and next;
1002 $LocalMod{$enc} ||= $mod;
1003 }
1004 };
1005 File::Find::find({wanted => $wanted}, @INC);
aae85ceb 1006 $_ModLines = "";
656ebd29 1007 for my $enc ( sort keys %LocalMod ) {
1008 $_ModLines .=
1009 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
aae85ceb 1010 }
b2704119 1011 warn $_ModLines;
aae85ceb 1012 $_LocalVer = _mkversion();
656ebd29 1013 $_E2X = find_e2x();
1014 $_Inc = $INC{"Encode.pm"};
1015 $_Inc =~ s/\.pm$//o;
1016 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1017 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
aae85ceb 1018 exit;
1019}
1020
1021sub _mkversion{
656ebd29 1022 # v-string is now depreciated; use time() instead;
1023 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1024 #$yyyy += 1900, $mo +=1;
1025 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1026 return time();
aae85ceb 1027}
1028
3ef515df 1029sub _print_expand{
67d7b5ef 1030 eval { require File::Basename; };
1031 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
1032 File::Basename->import();
621b0f8d 1033 my ($src, $dst, $clobber) = @_;
1034 if (!$clobber and -e $dst){
d1256cb1 1035 warn "$dst exists. skipping\n";
1036 return;
621b0f8d 1037 }
1038 warn "Generating $dst...\n";
3ef515df 1039 open my $in, $src or die "$src : $!";
1040 if ((my $d = dirname($dst)) ne '.'){
d1256cb1 1041 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
3ef515df 1042 }
1043 open my $out, ">$dst" or die "$!";
1044 my $asis = 0;
1045 while (<$in>){
d1256cb1 1046 if (/^#### END_OF_HEADER/){
1047 $asis = 1; next;
1048 }
1049 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1050 print $out $_;
67d7b5ef 1051 }
67d7b5ef 1052}
67d7b5ef 1053__END__
1054
1055=head1 NAME
1056
1057enc2xs -- Perl Encode Module Generator
1058
1059=head1 SYNOPSIS
1060
67d7b5ef 1061 enc2xs -[options]
aae85ceb 1062 enc2xs -M ModName mapfiles...
1063 enc2xs -C
67d7b5ef 1064
1065=head1 DESCRIPTION
1066
1067F<enc2xs> builds a Perl extension for use by Encode from either
0ab8f81e 1068Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1069Besides being used internally during the build process of the Encode
1070module, you can use F<enc2xs> to add your own encoding to perl.
1071No knowledge of XS is necessary.
67d7b5ef 1072
1073=head1 Quick Guide
1074
0ab8f81e 1075If you want to know as little about Perl as possible but need to
67d7b5ef 1076add a new encoding, just read this chapter and forget the rest.
1077
1078=over 4
1079
1080=item 0.
1081
0ab8f81e 1082Have a .ucm file ready. You can get it from somewhere or you can write
1083your own from scratch or you can grab one from the Encode distribution
1084and customize it. For the UCM format, see the next Chapter. In the
1085example below, I'll call my theoretical encoding myascii, defined
1086in I<my.ucm>. C<$> is a shell prompt.
67d7b5ef 1087
1088 $ ls -F
1089 my.ucm
1090
1091=item 1.
1092
1093Issue a command as follows;
1094
1095 $ enc2xs -M My my.ucm
3ef515df 1096 generating Makefile.PL
1097 generating My.pm
1098 generating README
1099 generating Changes
67d7b5ef 1100
1101Now take a look at your current directory. It should look like this.
1102
1103 $ ls -F
1104 Makefile.PL My.pm my.ucm t/
1105
0ab8f81e 1106The following files were created.
67d7b5ef 1107
0ab8f81e 1108 Makefile.PL - MakeMaker script
1109 My.pm - Encode submodule
1110 t/My.t - test file
1111
1112=over 4
67d7b5ef 1113
037b88d6 1114=item 1.1.
1115
1116If you want *.ucm installed together with the modules, do as follows;
1117
1118 $ mkdir Encode
1119 $ mv *.ucm Encode
1120 $ enc2xs -M My Encode/*ucm
1121
0ab8f81e 1122=back
1123
67d7b5ef 1124=item 2.
1125
1126Edit the files generated. You don't have to if you have no time AND no
1127intention to give it to someone else. But it is a good idea to edit
0ab8f81e 1128the pod and to add more tests.
67d7b5ef 1129
1130=item 3.
1131
0ab8f81e 1132Now issue a command all Perl Mongers love:
67d7b5ef 1133
9160fdbd 1134 $ perl Makefile.PL
67d7b5ef 1135 Writing Makefile for Encode::My
1136
1137=item 4.
1138
1139Now all you have to do is make.
1140
1141 $ make
1142 cp My.pm blib/lib/Encode/My.pm
1143 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1144 -o encode_t.c -f encode_t.fnm
1145 Reading myascii (myascii)
1146 Writing compiled form
1147 128 bytes in string tables
cf9f87ce 1148 384 bytes (75%) saved spotting duplicates
1149 1 bytes (0.775%) saved using substrings
67d7b5ef 1150 ....
1151 chmod 644 blib/arch/auto/Encode/My/My.bs
1152 $
1153
0ab8f81e 1154The time it takes varies depending on how fast your machine is and
1155how large your encoding is. Unless you are working on something big
1156like euc-tw, it won't take too long.
67d7b5ef 1157
1158=item 5.
1159
1160You can "make install" already but you should test first.
1161
1162 $ make test
1163 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1164 -e 'use Test::Harness qw(&runtests $verbose); \
1165 $verbose=0; runtests @ARGV;' t/*.t
1166 t/My....ok
1167 All tests successful.
1168 Files=1, Tests=2, 0 wallclock secs
1169 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1170
1171=item 6.
1172
1173If you are content with the test result, just "make install"
1174
aae85ceb 1175=item 7.
1176
0ab8f81e 1177If you want to add your encoding to Encode's demand-loading list
aae85ceb 1178(so you don't have to "use Encode::YourEncoding"), run
1179
1180 enc2xs -C
1181
1182to update Encode::ConfigLocal, a module that controls local settings.
1183After that, "use Encode;" is enough to load your encodings on demand.
1184
67d7b5ef 1185=back
1186
1187=head1 The Unicode Character Map
1188
0ab8f81e 1189Encode uses the Unicode Character Map (UCM) format for source character
1190mappings. This format is used by IBM's ICU package and was adopted
1191by Nick Ing-Simmons for use with the Encode module. Since UCM is
1192more flexible than Tcl's Encoding Map and far more user-friendly,
1193this is the recommended formet for Encode now.
67d7b5ef 1194
0ab8f81e 1195A UCM file looks like this.
67d7b5ef 1196
1197 #
1198 # Comments
1199 #
1200 <code_set_name> "US-ascii" # Required
1201 <code_set_alias> "ascii" # Optional
1202 <mb_cur_min> 1 # Required; usually 1
1203 <mb_cur_max> 1 # Max. # of bytes/char
1204 <subchar> \x3F # Substitution char
1205 #
1206 CHARMAP
1207 <U0000> \x00 |0 # <control>
1208 <U0001> \x01 |0 # <control>
1209 <U0002> \x02 |0 # <control>
1210 ....
1211 <U007C> \x7C |0 # VERTICAL LINE
1212 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1213 <U007E> \x7E |0 # TILDE
1214 <U007F> \x7F |0 # <control>
1215 END CHARMAP
1216
1217=over 4
1218
1219=item *
1220
0ab8f81e 1221Anything that follows C<#> is treated as a comment.
67d7b5ef 1222
1223=item *
1224
0ab8f81e 1225The header section continues until a line containing the word
1226CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1227pair per line. Strings used as values must be quoted. Barewords are
1228treated as numbers. I<\xXX> represents a byte.
67d7b5ef 1229
1230Most of the keywords are self-explanatory. I<subchar> means
1231substitution character, not subcharacter. When you decode a Unicode
1232sequence to this encoding but no matching character is found, the byte
1233sequence defined here will be used. For most cases, the value here is
0ab8f81e 1234\x3F; in ASCII, this is a question mark.
67d7b5ef 1235
1236=item *
1237
1238CHARMAP starts the character map section. Each line has a form as
0ab8f81e 1239follows:
67d7b5ef 1240
1241 <UXXXX> \xXX.. |0 # comment
1242 ^ ^ ^
1243 | | +- Fallback flag
1244 | +-------- Encoded byte sequence
1245 +-------------- Unicode Character ID in hex
1246
0ab8f81e 1247The format is roughly the same as a header section except for the
1248fallback flag: | followed by 0..3. The meaning of the possible
1249values is as follows:
67d7b5ef 1250
0ab8f81e 1251=over 4
67d7b5ef 1252
1253=item |0
1254
0ab8f81e 1255Round trip safe. A character decoded to Unicode encodes back to the
1256same byte sequence. Most characters have this flag.
67d7b5ef 1257
1258=item |1
1259
1260Fallback for unicode -> encoding. When seen, enc2xs adds this
0ab8f81e 1261character for the encode map only.
67d7b5ef 1262
1263=item |2
1264
1265Skip sub-char mapping should there be no code point.
1266
1267=item |3
1268
1269Fallback for encoding -> unicode. When seen, enc2xs adds this
0ab8f81e 1270character for the decode map only.
67d7b5ef 1271
1272=back
1273
1274=item *
1275
1276And finally, END OF CHARMAP ends the section.
1277
1278=back
1279
6d1c0808 1280When you are manually creating a UCM file, you should copy ascii.ucm
0ab8f81e 1281or an existing encoding which is close to yours, rather than write
1282your own from scratch.
67d7b5ef 1283
1284When you do so, make sure you leave at least B<U0000> to B<U0020> as
0ab8f81e 1285is, unless your environment is EBCDIC.
67d7b5ef 1286
1287B<CAVEAT>: not all features in UCM are implemented. For example,
1288icu:state is not used. Because of that, you need to write a perl
0ab8f81e 1289module if you want to support algorithmical encodings, notably
1290the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
67d7b5ef 1291L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1292
6d1c0808 1293=head2 Coping with duplicate mappings
1294
1295When you create a map, you SHOULD make your mappings round-trip safe.
1296That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1297$data> stands for all characters that are marked as C<|0>. Here is
0ab8f81e 1298how to make sure:
6d1c0808 1299
0ab8f81e 1300=over 4
6d1c0808 1301
1302=item *
1303
1304Sort your map in Unicode order.
1305
1306=item *
1307
1308When you have a duplicate entry, mark either one with '|1' or '|3'.
1309
1310=item *
1311
0ab8f81e 1312And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
6d1c0808 1313
1314=back
1315
1316Here is an example from big5-eten.
1317
1318 <U2550> \xF9\xF9 |0
1319 <U2550> \xA2\xA4 |3
1320
1321Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1322this;
1323
1324 E to U U to E
1325 --------------------------------------
1326 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1327 \xA2\xA4 => U2550
1328
1329So it is round-trip safe for \xF9\xF9. But if the line above is upside
1330down, here is what happens.
1331
1332 E to U U to E
1333 --------------------------------------
1334 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1335 (\xF9\xF9 => U2550 is now overwritten!)
1336
1337The Encode package comes with F<ucmlint>, a crude but sufficient
0ab8f81e 1338utility to check the integrity of a UCM file. Check under the
1339Encode/bin directory for this.
cf9f87ce 1340
1341When in doubt, you can use F<ucmsort>, yet another utility under
1342Encode/bin directory.
6d1c0808 1343
67d7b5ef 1344=head1 Bookmarks
1345
0ab8f81e 1346=over 4
1347
1348=item *
1349
67d7b5ef 1350ICU Home Page
1351L<http://oss.software.ibm.com/icu/>
1352
0ab8f81e 1353=item *
1354
67d7b5ef 1355ICU Character Mapping Tables
1356L<http://oss.software.ibm.com/icu/charset/>
1357
0ab8f81e 1358=item *
1359
67d7b5ef 1360ICU:Conversion Data
1361L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1362
0ab8f81e 1363=back
1364
67d7b5ef 1365=head1 SEE ALSO
1366
1367L<Encode>,
1368L<perlmod>,
1369L<perlpod>
1370
1371=cut
1372
1373# -Q to disable the duplicate codepoint test
1374# -S make mapping errors fatal
1375# -q to remove comments written to output files
1376# -O to enable the (brute force) substring optimiser
1377# -o <output> to specify the output file name (else it's the first arg)
1378# -f <inlist> to give a file with a list of input files (else use the args)
1379# -n <name> to name the encoding (else use the basename of the input file.
1380
1381With %seen holding array refs:
1382
1383 865.66 real 28.80 user 8.79 sys
1384 7904 maximum resident set size
1385 1356 average shared memory size
1386 18566 average unshared data size
1387 229 average unshared stack size
1388 46080 page reclaims
1389 33373 page faults
1390
1391With %seen holding simple scalars:
1392
1393 342.16 real 27.11 user 3.54 sys
1394 8388 maximum resident set size
1395 1394 average shared memory size
1396 14969 average unshared data size
1397 236 average unshared stack size
1398 28159 page reclaims
1399 9839 page faults
1400
1401Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1402how %seen is storing things its seen. So it is pathalogically bad on a 16M
1403RAM machine, but it's going to help even on modern machines.
1404Swapping is bad, m'kay :-)