X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=ext%2FEncode%2Fbin%2Fenc2xs;h=57d256ecd727be04c817ac63941b583db2018a9b;hb=6e8dae779caa61392088d43ba93888dc57f1df7e;hp=1acb61379e9e6e63accd4242c3b4ab43856c5384;hpb=48e3bbddf569369fe6921f305df6ab7290c91152;p=p5sagit%2Fp5-mst-13.2.git diff --git a/ext/Encode/bin/enc2xs b/ext/Encode/bin/enc2xs index 1acb613..57d256e 100644 --- a/ext/Encode/bin/enc2xs +++ b/ext/Encode/bin/enc2xs @@ -1,17 +1,21 @@ -#!../../../perl -w +#!./perl BEGIN { - unshift @INC, qw(../../lib ../../../lib ../../../../lib); - $ENV{PATH} .= ';../..;../../..;../../../..' if $^O eq 'MSWin32'; + # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's + # with $ENV{PERL_CORE} set + # In case we need it in future... + require Config; import Config; } use strict; +use warnings; use Getopt::Std; +use Config; my @orig_ARGV = @ARGV; -our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; - +our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # These may get re-ordered. # RAW is a do_now as inserted by &enter # AGG is an aggreagated do_now, as built up by &process + use constant { RAW_NEXT => 0, RAW_IN_LEN => 1, @@ -26,6 +30,7 @@ use constant { AGG_OUT_LEN => 5, AGG_FALLBACK => 6, }; + # (See the algorithm in encengine.c - we're building structures for it) # There are two sorts of structures. @@ -128,9 +133,10 @@ my %opt; # -o to specify the output file name (else it's the first arg) # -f to give a file with a list of input files (else use the args) # -n to name the encoding (else use the basename of the input file. -getopts('M:SQqOo:f:n:',\%opt); +getopts('CM:SQqOo:f:n:',\%opt); $opt{M} and make_makefile_pl($opt{M}, @ARGV); +$opt{C} and make_configlocal_pm($opt{C}, @ARGV); # This really should go first, else the die here causes empty (non-erroneous) # output files to be written. @@ -154,10 +160,10 @@ my $hname = $cname; my ($doC,$doEnc,$doUcm,$doPet); -if ($cname =~ /\.(c|xs)$/) +if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined { $doC = 1; - $dname =~ s/(\.[^\.]*)?$/_def.h/; + $dname =~ s/(\.[^\.]*)?$/.exh/; chmod(0666,$dname) if -f $cname && !-w $dname; open(D,">$dname") || die "Cannot open $dname:$!"; $hname =~ s/(\.[^\.]*)?$/.h/; @@ -171,6 +177,7 @@ if ($cname =~ /\.(c|xs)$/) !!!!!!! DO NOT EDIT THIS FILE !!!!!!! This file was autogenerated by: $^X $0 @orig_ARGV + enc2xs VERSION $VERSION */ END } @@ -182,7 +189,7 @@ END print C "#include \n"; print C "#define U8 U8\n"; } - print C "#include \"encode.h\"\n"; + print C "#include \"encode.h\"\n\n"; } elsif ($cname =~ /\.enc$/) @@ -200,6 +207,9 @@ elsif ($cname =~ /\.pet$/) my %encoding; my %strings; +my $string_acc; +my %strings_in_acc; + my $saved = 0; my $subsave = 0; my $strings = 0; @@ -246,18 +256,43 @@ if ($doC) foreach my $name (sort cmp_name keys %encoding) { my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; - output(\*C,$name.'_utf8',$e2u); - output(\*C,'utf8_'.$name,$u2e); - push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); + process($name.'_utf8',$e2u); + addstrings(\*C,$e2u); + + process('utf8_'.$name,$u2e); + addstrings(\*C,$u2e); + } + outbigstring(\*C,"enctable"); + foreach my $name (sort cmp_name keys %encoding) + { + my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; + outtable(\*C,$e2u, "enctable"); + outtable(\*C,$u2e, "enctable"); + + # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); } + my $cpp = ($Config{d_cplusplus} || '') eq 'define'; + my $ext_c = $cpp ? 'extern "C" ' : ""; foreach my $enc (sort cmp_name keys %encoding) { - my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}}; - my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); + # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}}; + my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}}; + #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); + my $replen = 0; + $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g); my $sym = "${enc}_encoding"; $sym =~ s/\W+/_/g; - print C "encode_t $sym = \n"; - print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n"; + my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen, + $min_el,$max_el); + print C "${ext_c}static const U8 ${sym}_rep_character[] = \"$rep\";\n"; + print C "${ext_c}static const char ${sym}_enc_name[] = \"$enc\";\n\n"; + print C "${ext_c}const encode_t $sym = \n"; + # This is to make null encoding work -- dankogai + for (my $i = (scalar @info) - 1; $i >= 0; --$i){ + $info[$i] ||= 1; + } + # end of null tweak -- dankogai + print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n"; } foreach my $enc (sort cmp_name keys %encoding) @@ -303,8 +338,8 @@ END close(D) or warn "Error closing '$dname': $!"; close(H) or warn "Error closing '$hname': $!"; - my $perc_saved = $strings/($strings + $saved) * 100; - my $perc_subsaved = $strings/($strings + $subsave) * 100; + my $perc_saved = $saved/($strings + $saved) * 100; + my $perc_subsaved = $subsave/($strings + $subsave) * 100; printf STDERR "%d bytes in string tables\n",$strings; printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n", $saved, $perc_saved if $saved; @@ -364,10 +399,12 @@ sub compile_ucm my $min_el; if (exists $attr{'subchar'}) { - my @byte; - $attr{'subchar'} =~ /^\s*/cg; - push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; - $erep = join('',map(chr(hex($_)),@byte)); + #my @byte; + #$attr{'subchar'} =~ /^\s*/cg; + #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; + #$erep = join('',map(chr(hex($_)),@byte)); + $erep = $attr{'subchar'}; + $erep =~ s/^\s+//; $erep =~ s/\s+$//; } print "Reading $name ($cs)\n"; my $nfb = 0; @@ -377,16 +414,18 @@ sub compile_ucm s/#.*$//; last if /^\s*END\s+CHARMAP\s*$/i; next if /^\s*$/; - my ($u,@byte); - my $fb = ''; - $u = $1 if (/^\s+/igc); - push(@byte,$1) while /\G\\x([0-9a-f]+)/igc; - $fb = $1 if /\G\s*(\|[0-3])/gc; - # warn "$_: $u @byte | $fb\n"; - die "Bad line:$_" unless /\G\s*(#.*)?$/gc; - if (defined($u)) + my (@uni, @byte) = (); + my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o + or die "Bad line: $_"; + while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){ + push @uni, map { substr($_, 1) } split(/\+/, $1); + } + while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){ + push @byte, $1; + } + if (@uni) { - my $uch = encode_U(hex($u)); + my $uch = join('', map { encode_U(hex($_)) } @uni ); my $ech = join('',map(chr(hex($_)),@byte)); my $el = length($ech); $max_el = $el if (!defined($max_el) || $el > $max_el); @@ -579,43 +618,6 @@ sub enter_fb0 { } } - -sub outstring -{ - my ($fh,$name,$s) = @_; - my $sym = $strings{$s}; - if ($sym) - { - $saved += length($s); - } - else - { - if ($opt{'O'}) { - foreach my $o (keys %strings) - { - next unless (my $i = index($o,$s)) >= 0; - $sym = $strings{$o}; - # gcc things that 0x0e+0x10 (anything with e+) starts to look like - # a hexadecimal floating point constant. Silly gcc. Only p - # introduces a floating point constant. Put the space in to stop it - # getting confused. - $sym .= sprintf(" +0x%02x",$i) if ($i); - $subsave += length($s); - return $strings{$s} = $sym; - } - } - $strings{$s} = $sym = $name; - $strings += length($s); - my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s); - # Maybe we should assert that these are all <256. - $definition .= join(',',unpack "C*",$s); - # We have a single long line. Split it at convenient commas. - $definition =~ s/(.{74,77},)/$1\n/g; - print $fh "$definition };\n\n"; - } - return $sym; -} - sub process { my ($name,$a) = @_; @@ -676,7 +678,8 @@ sub process $a->{'Entries'} = \@ent; } -sub outtable + +sub addstrings { my ($fh,$a) = @_; my $name = $a->{'Cname'}; @@ -684,29 +687,116 @@ sub outtable foreach my $b (@{$a->{'Entries'}}) { next unless $b->[AGG_OUT_LEN]; - my $s = $b->[AGG_MIN_IN]; - my $e = $b->[AGG_MAX_IN]; - outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]); + $strings{$b->[AGG_OUT_BYTES]} = undef; } if ($a->{'Forward'}) { - print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n"; + my $cpp = ($Config{d_cplusplus} || '') eq 'define'; + my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static'; + my $const = $cpp ? '' : 'const'; + print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n"; + } + $a->{'DoneStrings'} = 1; + foreach my $b (@{$a->{'Entries'}}) + { + my ($s,$e,$out,$t,$end,$l) = @$b; + addstrings($fh,$t) unless $t->{'DoneStrings'}; + } +} + +sub outbigstring +{ + my ($fh,$name) = @_; + + $string_acc = ''; + + # Make the big string in the string accumulator. Longest first, on the hope + # that this makes it more likely that we find the short strings later on. + # Not sure if it helps sorting strings of the same length lexcically. + foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) { + my $index = index $string_acc, $s; + if ($index >= 0) { + $saved += length($s); + $strings_in_acc{$s} = $index; + } else { + OPTIMISER: { + if ($opt{'O'}) { + my $sublength = length $s; + while (--$sublength > 0) { + # progressively lop characters off the end, to see if the start of + # the new string overlaps the end of the accumulator. + if (substr ($string_acc, -$sublength) + eq substr ($s, 0, $sublength)) { + $subsave += $sublength; + $strings_in_acc{$s} = length ($string_acc) - $sublength; + # append the last bit on the end. + $string_acc .= substr ($s, $sublength); + last OPTIMISER; + } + # or if the end of the new string overlaps the start of the + # accumulator + next unless substr ($string_acc, 0, $sublength) + eq substr ($s, -$sublength); + # well, the last $sublength characters of the accumulator match. + # so as we're prepending to the accumulator, need to shift all our + # existing offsets forwards + $_ += $sublength foreach values %strings_in_acc; + $subsave += $sublength; + $strings_in_acc{$s} = 0; + # append the first bit on the start. + $string_acc = substr ($s, 0, -$sublength) . $string_acc; + last OPTIMISER; + } + } + # Optimiser (if it ran) found nothing, so just going have to tack the + # whole thing on the end. + $strings_in_acc{$s} = length $string_acc; + $string_acc .= $s; + }; + } } + + $strings = length $string_acc; + my $cpp = ($Config{d_cplusplus} || '') eq 'define'; + my $var = $cpp ? '' : 'static'; + my $definition = "\n$var const U8 $name\[$strings] = { " . + join(',',unpack "C*",$string_acc); + # We have a single long line. Split it at convenient commas. + print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs; + print $fh substr ($definition, pos $definition), " };\n"; +} + +sub findstring { + my ($name,$s) = @_; + my $offset = $strings_in_acc{$s}; + die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator" + unless defined $offset; + "$name + $offset"; +} + +sub outtable +{ + my ($fh,$a,$bigname) = @_; + my $name = $a->{'Cname'}; $a->{'Done'} = 1; foreach my $b (@{$a->{'Entries'}}) { my ($s,$e,$out,$t,$end,$l) = @$b; - outtable($fh,$t) unless $t->{'Done'}; + outtable($fh,$t,$bigname) unless $t->{'Done'}; } - print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n"; + my $cpp = ($Config{d_cplusplus} || '') eq 'define'; + my $var = $cpp ? '' : 'static'; + my $const = $cpp ? '' : 'const'; + print $fh "\n$var $const encpage_t $name\[", + scalar(@{$a->{'Entries'}}), "] = {\n"; foreach my $b (@{$a->{'Entries'}}) { my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; - $end |= 0x80 if $fb; + # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan print $fh "{"; if ($l) { - printf $fh outstring($fh,'',$out); + printf $fh findstring($bigname,$out); } else { @@ -718,14 +808,6 @@ sub outtable print $fh "};\n"; } -sub output -{ - my ($fh,$name,$a) = @_; - process($name,$a); - # Sub-tables - outtable($fh,$a); -} - sub output_enc { my ($fh,$name,$a) = @_; @@ -828,215 +910,148 @@ sub output_ucm print $fh "END CHARMAP\n"; } -sub make_makefile_pl -{ - eval { require Encode; }; - $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n"; - eval { require File::Basename; }; - $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n"; - File::Basename->import(); - my $inc = dirname($INC{"Encode/Internal.pm"}); - my $name = shift; - my $table_files = join(",", map {qq('$_')} @_); - my $now = scalar localtime(); - open my $fh, ">Makefile.PL" or die "$!"; - print $fh <<"END_OF_HEADER"; -# -# This file is auto-generated by: -# $0 -# $now -# -use 5.7.2; -use strict; -use ExtUtils::MakeMaker; - -# Please edit the following to the taste! -my \$name = '$name'; -my \%tables = ( - encode_t => [ $table_files ], - ); - -# And leave the rest! -my \$enc2xs = '$0'; -WriteMakefile( - INC => "-I$inc", -END_OF_HEADER - - print $fh <<'END_OF_MAKEFILE_PL'; - NAME => 'Encode::'.$name, - VERSION_FROM => "$name.pm", - OBJECT => '$(O_FILES)', - 'dist' => { - COMPRESS => 'gzip -9f', - SUFFIX => 'gz', - DIST_DEFAULT => 'all tardist', - }, - MAN3PODS => {}, - # OS 390 winges about line numbers > 64K ??? - XSOPT => '-nolinenumbers', - ); - -package MY; - -sub post_initialize -{ - my ($self) = @_; - my %o; - my $x = $self->{'OBJ_EXT'}; - # Add the table O_FILES - foreach my $e (keys %tables) - { - $o{$e.$x} = 1; +use vars qw( + $_Enc2xs + $_Version + $_Inc + $_E2X + $_Name + $_TableFiles + $_Now +); + +sub find_e2x{ + eval { require File::Find; }; + my (@inc, %e2x_dir); + for my $inc (@INC){ + push @inc, $inc unless $inc eq '.'; #skip current dir } - $o{"$name$x"} = 1; - $self->{'O_FILES'} = [sort keys %o]; - my @files = ("$name.xs"); - $self->{'C'} = ["$name.c"]; - # $self->{'H'} = [$self->catfile($self->updir,'encode.h')]; - my %xs; - foreach my $table (keys %tables) { - push (@{$self->{'C'}},"$table.c"); - # Do NOT add $table.h etc. to H_FILES unless we own up as to how they - # get built. - foreach my $ext (qw($(OBJ_EXT) .c .h _def.h .fnm)) { - push (@files,$table.$ext); - } - } - $self->{'XS'} = { "$name.xs" => "$name.c" }; - $self->{'clean'}{'FILES'} .= join(' ',@files); - open(XS,">$name.xs") || die "Cannot open $name.xs:$!"; - print XS <<'END'; -#include -#include -#include -#define U8 U8 -#include "encode.h" -END - foreach my $table (keys %tables) { - print XS qq[#include "${table}.h"\n]; + File::Find::find( + sub { + my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, + $atime,$mtime,$ctime,$blksize,$blocks) + = lstat($_) or return; + -f _ or return; + if (/^.*\.e2x$/o){ + no warnings 'once'; + $e2x_dir{$File::Find::dir} ||= $mtime; + } + return; + }, @inc); + warn join("\n", keys %e2x_dir), "\n"; + for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){ + $_E2X = $d; + # warn "$_E2X => ", scalar localtime($e2x_dir{$d}); + return $_E2X; } - print XS <<"END"; - -static void -Encode_XSEncoding(pTHX_ encode_t *enc) -{ - dSP; - HV *stash = gv_stashpv("Encode::XS", TRUE); - SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash); - int i = 0; - PUSHMARK(sp); - XPUSHs(sv); - while (enc->name[i]) - { - const char *name = enc->name[i++]; - XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); - } - PUTBACK; - call_pv("Encode::define_encoding",G_DISCARD); - SvREFCNT_dec(sv); } -MODULE = Encode::$name PACKAGE = Encode::$name -PROTOTYPES: DISABLE -BOOT: +sub make_makefile_pl { -END - foreach my $table (keys %tables) { - print XS qq[#include "${table}_def.h"\n]; - } - print XS "}\n"; - close(XS); - return "# Built $name.xs\n\n"; + eval { require Encode; }; + $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n"; + # our used for variable expanstion + $_Enc2xs = $0; + $_Version = $VERSION; + $_E2X = find_e2x(); + $_Name = shift; + $_TableFiles = join(",", map {qq('$_')} @_); + $_Now = scalar localtime(); + + eval { require File::Spec; }; + _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL"); + _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm"); + _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t"); + _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README"); + _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes"); + exit; } -sub postamble -{ - my $self = shift; - my $dir = "."; # $self->catdir('Encode'); - my $str = "# $name\$(OBJ_EXT) depends on .h and _def.h files not .c files - but all written by enc2xs\n"; - $str .= "$name.c : $name.xs "; - foreach my $table (keys %tables) - { - $str .= " $table.c"; - } - $str .= "\n\n"; - $str .= "$name\$(OBJ_EXT) : $name.c\n\n"; +use vars qw( + $_ModLines + $_LocalVer + ); - foreach my $table (keys %tables) - { - my $numlines = 1; - my $lengthsofar = length($str); - my $continuator = ''; - $str .= "$table.c : Makefile.PL"; - foreach my $file (@{$tables{$table}}) - { - $str .= $continuator.' '.$self->catfile($dir,$file); - if ( length($str)-$lengthsofar > 128*$numlines ) - { - $continuator .= " \\\n\t"; - $numlines++; - } else { - $continuator = ''; - } - } - $str .= $^O eq 'VMS' # In VMS quote to preserve case - ? qq{\n\t\$(PERL) $enc2xs -"Q" -"O" -o \$\@ -f $table.fnm\n\n} - : qq{\n\t\$(PERL) $enc2xs -Q -O -o \$\@ -f $table.fnm\n\n}; - open (FILELIST, ">$table.fnm") - || die "Could not open $table.fnm: $!"; - foreach my $file (@{$tables{$table}}) - { - print FILELIST $self->catfile($dir,$file) . "\n"; +sub make_configlocal_pm { + eval { require Encode; }; + $@ and die "Unable to require Encode: $@\n"; + eval { require File::Spec; }; + + # our used for variable expanstion + my %in_core = map { $_ => 1 } ( + 'ascii', 'iso-8859-1', 'utf8', + 'ascii-ctrl', 'null', 'utf-8-strict' + ); + my %LocalMod = (); + # check @enc; + use File::Find (); + my $wanted = sub{ + -f $_ or return; + $File::Find::name =~ /\A\./ and return; + $File::Find::name =~ /\.pm\z/ or return; + $File::Find::name =~ m/\bEncode\b/ or return; + my $mod = $File::Find::name; + $mod =~ s/.*\bEncode\b/Encode/o; + $mod =~ s/\.pm\z//o; + $mod =~ s,/,::,og; + warn qq{ require $mod;\n}; + eval qq{ require $mod; }; + $@ and die "Can't require $mod: $@\n"; + for my $enc ( Encode->encodings() ) { + no warnings; + $in_core{$enc} and next; + $Encode::Config::ExtModule{$enc} and next; + $LocalMod{$enc} ||= $mod; } - close(FILELIST); + }; + File::Find::find({wanted => $wanted}, @INC); + $_ModLines = ""; + for my $enc ( sort keys %LocalMod ) { + $_ModLines .= + qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n); } - return $str; -} -END_OF_MAKEFILE_PL - close $fh; - (my $pm =<<"END_OF_PM") =~ s/^# //gm; -# package Encode::$name; -# our \$VERSION = "0.01"; -# -# use Encode; -# use XSLoader; -# XSLoader::load('Encode::$name', \$VERSION); -# -# 1; -# __END__ -# -# =head1 NAME -# -# Encode::$name - New Encoding -# -# =head1 SYNOPSIS -# -# You got to fill this in! -# -# =head1 SEE ALSO -# -# L -# -# =cut -END_OF_PM - open $fh, ">$name.pm" or die "$name.pm:$!"; - print $fh $pm; - close $fh; - -d 't' or mkdir 't', 0755 or die "mkdir t:$!"; - open $fh, ">t/$name.t" or die "t/$name.t:$!"; -print $fh <<"END_OF_TEST"; -use strict; -# Adjust the number here! -use Test::More tests => 2; - -use_ok('Encode'); -use_ok('Encode::$name'); -# Add more test here! -END_OF_TEST - close $fh; + warn $_ModLines; + $_LocalVer = _mkversion(); + $_E2X = find_e2x(); + $_Inc = $INC{"Encode.pm"}; + $_Inc =~ s/\.pm$//o; + _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ), + File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 ); exit; } +sub _mkversion{ + # v-string is now depreciated; use time() instead; + #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime(); + #$yyyy += 1900, $mo +=1; + #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm); + return time(); +} + +sub _print_expand{ + eval { require File::Basename; }; + $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n"; + File::Basename->import(); + my ($src, $dst, $clobber) = @_; + if (!$clobber and -e $dst){ + warn "$dst exists. skipping\n"; + return; + } + warn "Generating $dst...\n"; + open my $in, $src or die "$src : $!"; + if ((my $d = dirname($dst)) ne '.'){ + -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; + } + open my $out, ">$dst" or die "$!"; + my $asis = 0; + while (<$in>){ + if (/^#### END_OF_HEADER/){ + $asis = 1; next; + } + s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis; + print $out $_; + } +} __END__ =head1 NAME @@ -1045,31 +1060,32 @@ enc2xs -- Perl Encode Module Generator =head1 SYNOPSIS - enc2xs -M ModName mapfiles... enc2xs -[options] + enc2xs -M ModName mapfiles... + enc2xs -C =head1 DESCRIPTION F builds a Perl extension for use by Encode from either -Unicode Character Mapping files (.ucm) or Tcl Encoding Files -(.enc) Besides internally used during the build process of Encode -module, you can use F to add your own encoding to perl. No -knowledge on XS is necessary. +Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc). +Besides being used internally during the build process of the Encode +module, you can use F to add your own encoding to perl. +No knowledge of XS is necessary. =head1 Quick Guide -If what you want to know as little about Perl possible but needs to +If you want to know as little about Perl as possible but need to add a new encoding, just read this chapter and forget the rest. =over 4 =item 0. -Have a .ucm file ready. You can get it from somewhere or you can -write your own from scratch or you can grab one from Encode -distribution and customize. For UCM format, see the next Chapter. -In the example below, I'll call my theoretical encoding myascii, -defined inI. C<$> is a shell prompt. +Have a .ucm file ready. You can get it from somewhere or you can write +your own from scratch or you can grab one from the Encode distribution +and customize it. For the UCM format, see the next Chapter. In the +example below, I'll call my theoretical encoding myascii, defined +in I. C<$> is a shell prompt. $ ls -F my.ucm @@ -1079,29 +1095,45 @@ defined inI. C<$> is a shell prompt. Issue a command as follows; $ enc2xs -M My my.ucm + generating Makefile.PL + generating My.pm + generating README + generating Changes Now take a look at your current directory. It should look like this. $ ls -F Makefile.PL My.pm my.ucm t/ -The following files are created. +The following files were created. + + Makefile.PL - MakeMaker script + My.pm - Encode submodule + t/My.t - test file + +=over 4 + +=item 1.1. - Makefle.PL - MakeMaker script - My.pm - Encode Submodule - t/My.t - test file +If you want *.ucm installed together with the modules, do as follows; + + $ mkdir Encode + $ mv *.ucm Encode + $ enc2xs -M My Encode/*ucm + +=back =item 2. Edit the files generated. You don't have to if you have no time AND no intention to give it to someone else. But it is a good idea to edit -pod and add more tests. +the pod and to add more tests. =item 3. -Now issue a command all Perl Mongers love; +Now issue a command all Perl Mongers love: - $ perl5.7.3 Makefile.PL + $ perl Makefile.PL Writing Makefile for Encode::My =item 4. @@ -1115,15 +1147,15 @@ Now all you have to do is make. Reading myascii (myascii) Writing compiled form 128 bytes in string tables - 384 bytes (25%) saved spotting duplicates - 1 bytes (99.2%) saved using substrings + 384 bytes (75%) saved spotting duplicates + 1 bytes (0.775%) saved using substrings .... chmod 644 blib/arch/auto/Encode/My/My.bs $ -The time it takes varies how fast your machine is and how large your -encoding is. Unless you are working on something big like euc-tw, it -won't take too long. +The time it takes varies depending on how fast your machine is and +how large your encoding is. Unless you are working on something big +like euc-tw, it won't take too long. =item 5. @@ -1142,17 +1174,27 @@ You can "make install" already but you should test first. If you are content with the test result, just "make install" +=item 7. + +If you want to add your encoding to Encode's demand-loading list +(so you don't have to "use Encode::YourEncoding"), run + + enc2xs -C + +to update Encode::ConfigLocal, a module that controls local settings. +After that, "use Encode;" is enough to load your encodings on demand. + =back =head1 The Unicode Character Map -Encode uses The Unicode Character Map (UCM) for source character -mappings. This format is used by ICU package of IBM and adopted by -Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map -and far more user-friendly, This is the recommended formet for -Encode now. +Encode uses the Unicode Character Map (UCM) format for source character +mappings. This format is used by IBM's ICU package and was adopted +by Nick Ing-Simmons for use with the Encode module. Since UCM is +more flexible than Tcl's Encoding Map and far more user-friendly, +this is the recommended formet for Encode now. -UCM file looks like this. +A UCM file looks like this. # # Comments @@ -1178,25 +1220,25 @@ UCM file looks like this. =item * -Anything that follows C<#> is treated as comments. +Anything that follows C<#> is treated as a comment. =item * -The header section continues until CHARMAP. This section Has a form of -IkeywordE value>, one at a line. For a value, strings must -be quoted. Barewords are treated as numbers. I<\xXX> represents a -byte. +The header section continues until a line containing the word +CHARMAP. This section has a form of IkeywordE value>, one +pair per line. Strings used as values must be quoted. Barewords are +treated as numbers. I<\xXX> represents a byte. Most of the keywords are self-explanatory. I means substitution character, not subcharacter. When you decode a Unicode sequence to this encoding but no matching character is found, the byte sequence defined here will be used. For most cases, the value here is -\x3F, in ASCII this is a question mark. +\x3F; in ASCII, this is a question mark. =item * CHARMAP starts the character map section. Each line has a form as -follows; +follows: \xXX.. |0 # comment ^ ^ ^ @@ -1204,20 +1246,21 @@ follows; | +-------- Encoded byte sequence +-------------- Unicode Character ID in hex -The format is roughly the same as a header section except for fallback -flag. It is | followed by 0..3. And their meaning as follows +The format is roughly the same as a header section except for the +fallback flag: | followed by 0..3. The meaning of the possible +values is as follows: -=over 2 +=over 4 =item |0 -Round trip safe. A character decoded to Unicode encodes back to the -same byte sequence. most character belong to this. +Round trip safe. A character decoded to Unicode encodes back to the +same byte sequence. Most characters have this flag. =item |1 Fallback for unicode -> encoding. When seen, enc2xs adds this -character for encode map only +character for the encode map only. =item |2 @@ -1226,7 +1269,7 @@ Skip sub-char mapping should there be no code point. =item |3 Fallback for encoding -> unicode. When seen, enc2xs adds this -character for decode map only +character for the decode map only. =back @@ -1236,30 +1279,91 @@ And finally, END OF CHARMAP ends the section. =back -Needless to say, if you are manually creating a UCM file, you should -copy ascii.ucm or existing encoding which is close to yours than write -your own from scratch. +When you are manually creating a UCM file, you should copy ascii.ucm +or an existing encoding which is close to yours, rather than write +your own from scratch. When you do so, make sure you leave at least B to B as -is, unless your environment is on EBCDIC. +is, unless your environment is EBCDIC. B: not all features in UCM are implemented. For example, icu:state is not used. Because of that, you need to write a perl -module if you want to support algorithmical encodings, notablly -ISO-2022 series. Such modules include L, +module if you want to support algorithmical encodings, notably +the ISO-2022 series. Such modules include L, L, and L. +=head2 Coping with duplicate mappings + +When you create a map, you SHOULD make your mappings round-trip safe. +That is, C stands for all characters that are marked as C<|0>. Here is +how to make sure: + +=over 4 + +=item * + +Sort your map in Unicode order. + +=item * + +When you have a duplicate entry, mark either one with '|1' or '|3'. + +=item * + +And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry. + +=back + +Here is an example from big5-eten. + + \xF9\xF9 |0 + \xA2\xA4 |3 + +Internally Encoding -> Unicode and Unicode -> Encoding Map looks like +this; + + E to U U to E + -------------------------------------- + \xF9\xF9 => U2550 U2550 => \xF9\xF9 + \xA2\xA4 => U2550 + +So it is round-trip safe for \xF9\xF9. But if the line above is upside +down, here is what happens. + + E to U U to E + -------------------------------------- + \xA2\xA4 => U2550 U2550 => \xF9\xF9 + (\xF9\xF9 => U2550 is now overwritten!) + +The Encode package comes with F, a crude but sufficient +utility to check the integrity of a UCM file. Check under the +Encode/bin directory for this. + +When in doubt, you can use F, yet another utility under +Encode/bin directory. + =head1 Bookmarks +=over 4 + +=item * + ICU Home Page L +=item * + ICU Character Mapping Tables L +=item * + ICU:Conversion Data L +=back + =head1 SEE ALSO L,