#!./perl
BEGIN {
- # fiddle with @INC iff I am a part of perl dist
- if ($^X =~ m/\bminiperl$/o){
- warn "Fixing \@INC for perl core.\n";
- unshift @INC, qw(../../lib ../../../lib ../../../../lib);
- $ENV{PATH} .= ';../..;../../..;../../../..' if $^O eq 'MSWin32';
- }
+ # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
+ # with $ENV{PERL_CORE} set
+ # In case we need it in future...
+ require Config; import Config;
}
use strict;
use Getopt::Std;
my @orig_ARGV = @ARGV;
-our $VERSION = do { my @r = (q$Revision: 1.10 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
-
+our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
# AGG is an aggreagated do_now, as built up by &process
+
use constant {
RAW_NEXT => 0,
RAW_IN_LEN => 1,
AGG_OUT_LEN => 5,
AGG_FALLBACK => 6,
};
+
# (See the algorithm in encengine.c - we're building structures for it)
# There are two sorts of structures.
# -o <output> to specify the output file name (else it's the first arg)
# -f <inlist> to give a file with a list of input files (else use the args)
# -n <name> to name the encoding (else use the basename of the input file.
-getopts('M:SQqOo:f:n:',\%opt);
+getopts('CM:SQqOo:f:n:',\%opt);
$opt{M} and make_makefile_pl($opt{M}, @ARGV);
+$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
# This really should go first, else the die here causes empty (non-erroneous)
# output files to be written.
if ($cname =~ /\.(c|xs)$/)
{
$doC = 1;
- $dname =~ s/(\.[^\.]*)?$/_def.h/;
+ $dname =~ s/(\.[^\.]*)?$/.exh/;
chmod(0666,$dname) if -f $cname && !-w $dname;
open(D,">$dname") || die "Cannot open $dname:$!";
$hname =~ s/(\.[^\.]*)?$/.h/;
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
output(\*C,$name.'_utf8',$e2u);
output(\*C,'utf8_'.$name,$u2e);
- push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
+ # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
}
+ # Emit one encode_t initializer per encoding (in cmp_name order) to the
+ # C filehandle.
foreach my $enc (sort cmp_name keys %encoding)
{
- my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
- my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
+ # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
+ my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
+ #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
+ # $rep is kept as escape text (e.g. \x3F), so its byte length is the
+ # number of \xHH escapes, not length($rep).  Each match must consume
+ # all hex digits of an escape: \G anchors the next attempt where the
+ # previous match ended, and a leftover digit there would stop the
+ # count at 1 for every multi-byte sequence.
+ my $replen = 0;
+ $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]+/g);
+ my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
my $sym = "${enc}_encoding";
$sym =~ s/\W+/_/g;
print C "encode_t $sym = \n";
+ # This is to make null encoding work -- dankogai
+ # (any false entry -- 0, '' or undef -- is replaced by 1)
+ for (my $i = (scalar @info) - 1; $i >= 0; --$i){
+ $info[$i] ||= 1;
+ }
+ # end of null tweak -- dankogai
print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
}
my $min_el;
if (exists $attr{'subchar'})
{
- my @byte;
- $attr{'subchar'} =~ /^\s*/cg;
- push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
- $erep = join('',map(chr(hex($_)),@byte));
+ #my @byte;
+ #$attr{'subchar'} =~ /^\s*/cg;
+ #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
+ #$erep = join('',map(chr(hex($_)),@byte));
+ $erep = $attr{'subchar'};
+ $erep =~ s/^\s+//; $erep =~ s/\s+$//;
}
print "Reading $name ($cs)\n";
my $nfb = 0;
s/#.*$//;
last if /^\s*END\s+CHARMAP\s*$/i;
next if /^\s*$/;
- my ($u,@byte);
- my $fb = '';
- $u = $1 if (/^<U([0-9a-f]+)>\s+/igc);
- push(@byte,$1) while /\G\\x([0-9a-f]+)/igc;
- $fb = $1 if /\G\s*(\|[0-3])/gc;
- # warn "$_: $u @byte | $fb\n";
- die "Bad line:$_" unless /\G\s*(#.*)?$/gc;
- if (defined($u))
+ my (@uni, @byte) = ();
+ my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
+ or die "Bad line: $_";
+ while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
+ push @uni, map { substr($_, 1) } split(/\+/, $1);
+ }
+ while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
+ push @byte, $1;
+ }
+ if (@uni)
{
- my $uch = encode_U(hex($u));
+ my $uch = join('', map { encode_U(hex($_)) } @uni );
my $ech = join('',map(chr(hex($_)),@byte));
my $el = length($ech);
$max_el = $el if (!defined($max_el) || $el > $max_el);
}
if ($a->{'Forward'})
{
- print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
+ my $var = $^O eq 'MacOS' ? 'extern' : 'static';
+ print $fh "\n$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
}
$a->{'Done'} = 1;
foreach my $b (@{$a->{'Entries'}})
foreach my $b (@{$a->{'Entries'}})
{
my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
- $end |= 0x80 if $fb;
+ # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
print $fh "{";
if ($l)
{
$_Enc2xs
$_Version
$_Inc
+ $_E2X
$_Name
$_TableFiles
$_Now
);
+# Locate the directory that holds the *.e2x template files.
+#
+# Every @INC entry except '.' is walked with File::Find.  For each
+# directory containing a regular file named *.e2x, the mtime of the first
+# such file seen is recorded.  The candidate directories are ordered by
+# that mtime (ascending); the first one is stored in $_E2X and returned.
+# Returns nothing (undef in scalar context) when no *.e2x file is found.
+# Dies if File::Find cannot be loaded.
+sub find_e2x{
+    eval { require File::Find };
+    $@ and die "File::Find needed.  Are you on miniperl?;\nerror: $@\n";
+    my %e2x_dir;
+    my @inc = grep { $_ ne '.' } @INC;    # skip current dir
+    File::Find::find(
+        sub {
+            # lstat (not stat): the "-f _" test below reuses this buffer,
+            # so a symlink is never treated as a regular file.
+            my @st = lstat($_) or return;
+            -f _ or return;
+            if (/^.*\.e2x$/o){
+                $e2x_dir{$File::Find::dir} ||= $st[9];    # mtime
+            }
+            return;
+        }, @inc);
+    warn join("\n", keys %e2x_dir), "\n";
+    my ($first) = sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir;
+    # Explicit return: falling off the end of a loop would leave the
+    # sub's return value unspecified.
+    return unless defined $first;
+    $_E2X = $first;
+    # warn "$_E2X => ", scalar localtime($e2x_dir{$first});
+    return $_E2X;
+}
+
+# Generate the skeleton of a new Encode extension from the *.e2x templates.
+# The first argument is the module name; the remaining arguments are the
+# table (map) file names.  The package variables set here are the values
+# made available for expansion by _print_expand.  Never returns: the
+# script exits once the files have been written.
sub make_makefile_pl
{
eval { require Encode; };
# our used for variable expanstion
$_Enc2xs = $0;
$_Version = $VERSION;
- $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
+ # Templates are looked up in the directory reported by find_e2x().
+ $_E2X = find_e2x();
$_Name = shift;
$_TableFiles = join(",", map {qq('$_')} @_);
$_Now = scalar localtime();
- warn "Generating Makefile.PL\n";
- _print_expand("$_Inc/Makefile_PL.e2x", "Makefile.PL");
- warn "Generating $_Name.pm\n";
- _print_expand("$_Inc/_PM.e2x", "$_Name.pm");
- warn "Generating t/$_Name.t\n";
- _print_expand("$_Inc/_T.e2x", "t/$_Name.t");
- warn "Generating README\n";
- _print_expand("$_Inc/README.e2x", "README");
- warn "Generating t/$_Name.t\n";
- _print_expand("$_Inc/Changes.e2x", "Changes");
+
+ eval { require File::Spec; };
+ # No clobber flag is passed, so _print_expand skips any target that
+ # already exists; it also emits the "Generating ..." message itself.
+ _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
+ _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
+ _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
+ _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
+ _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
+ exit;
+}
+
+use vars qw(
+ $_ModLines
+ $_LocalVer
+ );
+
+# Regenerate Encode/ConfigLocal.pm from the ConfigLocal_PM.e2x template.
+# Scans every Encode/ directory under @INC, loads each not-yet-loaded .pm
+# file found there, and records which module to associate with each
+# encoding name that is neither built in nor already listed in
+# %Encode::Config::ExtModule.  Takes no arguments of its own (@_ is not
+# read).  Dies if Encode or any scanned module cannot be loaded.  Never
+# returns: the script exits after writing the file.
+sub make_configlocal_pm
+{
+ eval { require Encode; };
+ $@ and die "Unable to require Encode: $@\n";
+ eval { require File::Spec; };
+ # our used for variable expansion
+ my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
+ my %LocalMod = ();
+ for my $d (@INC){
+ my $inc = File::Spec->catfile($d, "Encode");
+ -d $inc or next;
+ opendir my $dh, $inc or die "$inc:$!";
+ warn "Checking $inc...\n";
+ for my $f (grep /\.pm$/o, readdir($dh)){
+ -f File::Spec->catfile($inc, "$f") or next;
+ # Skip modules that are already loaded.
+ $INC{"Encode/$f"} and next;
+ warn "require Encode/$f;\n";
+ eval { require "Encode/$f"; };
+ $@ and die "Can't require Encode/$f: $@\n";
+ # Every encoding name reported after this require that is not yet
+ # accounted for is attributed to the module just loaded; ||= keeps
+ # the first module recorded for a given name.
+ for my $enc (Encode->encodings()){
+ $in_core{$enc} and next;
+ $Encode::Config::ExtModule{$enc} and next;
+ my $mod = "Encode/$f";
+ $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
+ $LocalMod{$enc} ||= $mod;
+ }
+ }
+ }
+ # One assignment statement per locally found encoding, sorted by name.
+ # NOTE(review): the check above reads %Encode::Config::ExtModule while
+ # the generated lines assign to %Encode::ExtModule -- confirm both
+ # refer to the intended hash.
+ $_ModLines = "";
+ for my $enc (sort keys %LocalMod){
+ $_ModLines .=
+ qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
+ }
+ warn $_ModLines;
+ $_LocalVer = _mkversion();
+ $_E2X = find_e2x();
+ # Directory next to the loaded Encode.pm (its path minus ".pm").
+ $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
+ # Third argument (clobber) is true: an existing ConfigLocal.pm is
+ # overwritten rather than skipped.
+ _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
+ File::Spec->catfile($_Inc,"ConfigLocal.pm"),
+ 1);
exit;
}
+# Build a version string from the current local time, formatted as
+# "vYYYY.MMDD.HHMM" (each of the three fields zero-padded to 4 digits).
+sub _mkversion{
+ my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
+ # localtime() reports years since 1900 and zero-based months.
+ $yyyy += 1900, $mo +=1;
+ return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
+}
+
sub _print_expand{
eval { require File::Basename; };
$@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
File::Basename->import();
- my ($src, $dst) = @_;
+ my ($src, $dst, $clobber) = @_;
+ if (!$clobber and -e $dst){
+ warn "$dst exists. skipping\n";
+ return;
+ }
+ warn "Generating $dst...\n";
open my $in, $src or die "$src : $!";
if ((my $d = dirname($dst)) ne '.'){
-d $d or mkdir $d, 0755 or die "mkdir $d : $!";
=head1 SYNOPSIS
- enc2xs -M ModName mapfiles...
enc2xs -[options]
+ enc2xs -M ModName mapfiles...
+ enc2xs -C
=head1 DESCRIPTION
F<enc2xs> builds a Perl extension for use by Encode from either
-Unicode Character Mapping files (.ucm) or Tcl Encoding Files
-(.enc) Besides internally used during the build process of Encode
-module, you can use F<enc2xs> to add your own encoding to perl. No
-knowledge on XS is necessary.
+Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
+Besides being used internally during the build process of the Encode
+module, you can use F<enc2xs> to add your own encoding to perl.
+No knowledge of XS is necessary.
=head1 Quick Guide
-If what you want to know as little about Perl possible but needs to
+If you want to know as little about Perl as possible but need to
add a new encoding, just read this chapter and forget the rest.
=over 4
=item 0.
-Have a .ucm file ready. You can get it from somewhere or you can
-write your own from scratch or you can grab one from Encode
-distribution and customize. For UCM format, see the next Chapter.
-In the example below, I'll call my theoretical encoding myascii,
-defined inI<my.ucm>. C<$> is a shell prompt.
+Have a .ucm file ready. You can get it from somewhere or you can write
+your own from scratch or you can grab one from the Encode distribution
+and customize it. For the UCM format, see the next Chapter. In the
+example below, I'll call my theoretical encoding myascii, defined
+in I<my.ucm>. C<$> is a shell prompt.
$ ls -F
my.ucm
$ ls -F
Makefile.PL My.pm my.ucm t/
-The following files are created.
+The following files were created.
+
+ Makefile.PL - MakeMaker script
+ My.pm - Encode submodule
+ t/My.t - test file
- Makefle.PL - MakeMaker script
- My.pm - Encode Submodule
- t/My.t - test file
+=over 4
=item 1.1.
$ mv *.ucm Encode
$ enc2xs -M My Encode/*ucm
+=back
+
=item 2.
Edit the files generated. You don't have to if you have no time AND no
intention to give it to someone else. But it is a good idea to edit
-pod and add more tests.
+the pod and to add more tests.
=item 3.
-Now issue a command all Perl Mongers love;
+Now issue a command all Perl Mongers love:
- $ perl5.7.3 Makefile.PL
+ $ perl Makefile.PL
Writing Makefile for Encode::My
=item 4.
chmod 644 blib/arch/auto/Encode/My/My.bs
$
-The time it takes varies how fast your machine is and how large your
-encoding is. Unless you are working on something big like euc-tw, it
-won't take too long.
+The time it takes varies depending on how fast your machine is and
+how large your encoding is. Unless you are working on something big
+like euc-tw, it won't take too long.
=item 5.
If you are content with the test result, just "make install"
+=item 7.
+
+If you want to add your encoding to Encode's demand-loading list
+(so you don't have to "use Encode::YourEncoding"), run
+
+ enc2xs -C
+
+to update Encode::ConfigLocal, a module that controls local settings.
+After that, "use Encode;" is enough to load your encodings on demand.
+
=back
=head1 The Unicode Character Map
-Encode uses The Unicode Character Map (UCM) for source character
-mappings. This format is used by ICU package of IBM and adopted by
-Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map
-and far more user-friendly, This is the recommended formet for
-Encode now.
+Encode uses the Unicode Character Map (UCM) format for source character
+mappings. This format is used by IBM's ICU package and was adopted
+by Nick Ing-Simmons for use with the Encode module. Since UCM is
+more flexible than Tcl's Encoding Map and far more user-friendly,
+this is the recommended format for Encode now.
-UCM file looks like this.
+A UCM file looks like this.
#
# Comments
=item *
-Anything that follows C<#> is treated as comments.
+Anything that follows C<#> is treated as a comment.
=item *
-The header section continues until CHARMAP. This section Has a form of
-I<E<lt>keywordE<gt> value>, one at a line. For a value, strings must
-be quoted. Barewords are treated as numbers. I<\xXX> represents a
-byte.
+The header section continues until a line containing the word
+CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
+pair per line. Strings used as values must be quoted. Barewords are
+treated as numbers. I<\xXX> represents a byte.
Most of the keywords are self-explanatory. I<subchar> means
substitution character, not subcharacter. When you decode a Unicode
sequence to this encoding but no matching character is found, the byte
sequence defined here will be used. For most cases, the value here is
-\x3F, in ASCII this is a question mark.
+\x3F; in ASCII, this is a question mark.
=item *
CHARMAP starts the character map section. Each line has a form as
-follows;
+follows:
<UXXXX> \xXX.. |0 # comment
^ ^ ^
| +-------- Encoded byte sequence
+-------------- Unicode Character ID in hex
-The format is roughly the same as a header section except for fallback
-flag. It is | followed by 0..3. And their meaning as follows
+The format is roughly the same as a header section except for the
+fallback flag: | followed by 0..3. The meaning of the possible
+values is as follows:
-=over 2
+=over 4
=item |0
-Round trip safe. A character decoded to Unicode encodes back to the
-same byte sequence. most character belong to this.
+Round trip safe. A character decoded to Unicode encodes back to the
+same byte sequence. Most characters have this flag.
=item |1
Fallback for unicode -> encoding. When seen, enc2xs adds this
-character for encode map only
+character for the encode map only.
=item |2
=item |3
Fallback for encoding -> unicode. When seen, enc2xs adds this
-character for decode map only
+character for the decode map only.
=back
=back
-Needless to say, if you are manually creating a UCM file, you should
-copy ascii.ucm or existing encoding which is close to yours than write
-your own from scratch.
+When you are manually creating a UCM file, you should copy ascii.ucm
+or an existing encoding which is close to yours, rather than write
+your own from scratch.
When you do so, make sure you leave at least B<U0000> to B<U0020> as
-is, unless your environment is on EBCDIC.
+is, unless your environment is EBCDIC.
B<CAVEAT>: not all features in UCM are implemented. For example,
icu:state is not used. Because of that, you need to write a perl
-module if you want to support algorithmical encodings, notablly
-ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
+module if you want to support algorithmical encodings, notably
+the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
+=head2 Coping with duplicate mappings
+
+When you create a map, you SHOULD make your mappings round-trip safe.
+That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
+$data> holds for all characters that are marked as C<|0>. Here is
+how to make sure:
+
+=over 4
+
+=item *
+
+Sort your map in Unicode order.
+
+=item *
+
+When you have a duplicate entry, mark either one with '|1' or '|3'.
+
+=item *
+
+And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
+
+=back
+
+Here is an example from big5-eten.
+
+ <U2550> \xF9\xF9 |0
+ <U2550> \xA2\xA4 |3
+
+Internally, the Encoding -> Unicode and Unicode -> Encoding maps look
+like this:
+
+ E to U U to E
+ --------------------------------------
+ \xF9\xF9 => U2550 U2550 => \xF9\xF9
+ \xA2\xA4 => U2550
+
+So it is round-trip safe for \xF9\xF9. But if the two lines above are
+in the reverse order, here is what happens.
+
+ E to U U to E
+ --------------------------------------
+ \xA2\xA4 => U2550 U2550 => \xF9\xF9
+ (\xF9\xF9 => U2550 is now overwritten!)
+
+The Encode package comes with F<ucmlint>, a crude but sufficient
+utility to check the integrity of a UCM file. Check under the
+Encode/bin directory for this.
+
+
=head1 Bookmarks
+=over 4
+
+=item *
+
ICU Home Page
L<http://oss.software.ibm.com/icu/>
+=item *
+
ICU Character Mapping Tables
L<http://oss.software.ibm.com/icu/charset/>
+=item *
+
ICU:Conversion Data
L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
+=back
+
=head1 SEE ALSO
L<Encode>,