require Config; import Config;
}
use strict;
+use warnings;
use Getopt::Std;
+use Config;
my @orig_ARGV = @ARGV;
-our $VERSION = do { my @r = (q$Revision: 1.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
# -o <output> to specify the output file name (else it's the first arg)
# -f <inlist> to give a file with a list of input files (else use the args)
# -n <name> to name the encoding (else use the basename of the input file.
-getopts('M:SQqOo:f:n:',\%opt);
+getopts('CM:SQqOo:f:n:',\%opt);
$opt{M} and make_makefile_pl($opt{M}, @ARGV);
+$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
# This really should go first, else the die here causes empty (non-erroneous)
# output files to be written.
my ($doC,$doEnc,$doUcm,$doPet);
-if ($cname =~ /\.(c|xs)$/)
+if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
{
$doC = 1;
$dname =~ s/(\.[^\.]*)?$/.exh/;
!!!!!!! DO NOT EDIT THIS FILE !!!!!!!
This file was autogenerated by:
$^X $0 @orig_ARGV
+ enc2xs VERSION $VERSION
*/
END
}
print C "#include <XSUB.h>\n";
print C "#define U8 U8\n";
}
- print C "#include \"encode.h\"\n";
+ print C "#include \"encode.h\"\n\n";
}
elsif ($cname =~ /\.enc$/)
my %encoding;
my %strings;
+my $string_acc;
+my %strings_in_acc;
+
my $saved = 0;
my $subsave = 0;
my $strings = 0;
foreach my $name (sort cmp_name keys %encoding)
{
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
- output(\*C,$name.'_utf8',$e2u);
- output(\*C,'utf8_'.$name,$u2e);
- push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
+ process($name.'_utf8',$e2u);
+ addstrings(\*C,$e2u);
+
+ process('utf8_'.$name,$u2e);
+ addstrings(\*C,$u2e);
+ }
+ outbigstring(\*C,"enctable");
+ foreach my $name (sort cmp_name keys %encoding)
+ {
+ my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
+ outtable(\*C,$e2u, "enctable");
+ outtable(\*C,$u2e, "enctable");
+
+ # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
}
+ my $cpp = ($Config{d_cplusplus} || '') eq 'define';
+ my $ext_c = $cpp ? 'extern "C" ' : "";
foreach my $enc (sort cmp_name keys %encoding)
{
- my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
- my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
+ # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
+ my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
+ #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
+ my $replen = 0;
+ $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
my $sym = "${enc}_encoding";
$sym =~ s/\W+/_/g;
- print C "encode_t $sym = \n";
- print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
+ my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
+ $min_el,$max_el);
+ print C "${ext_c}static const U8 ${sym}_rep_character[] = \"$rep\";\n";
+ print C "${ext_c}static const char ${sym}_enc_name[] = \"$enc\";\n\n";
+ print C "${ext_c}const encode_t $sym = \n";
+ # This is to make null encoding work -- dankogai
+ for (my $i = (scalar @info) - 1; $i >= 0; --$i){
+ $info[$i] ||= 1;
+ }
+ # end of null tweak -- dankogai
+ print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
}
foreach my $enc (sort cmp_name keys %encoding)
close(D) or warn "Error closing '$dname': $!";
close(H) or warn "Error closing '$hname': $!";
- my $perc_saved = $strings/($strings + $saved) * 100;
- my $perc_subsaved = $strings/($strings + $subsave) * 100;
+ my $perc_saved = $saved/($strings + $saved) * 100;
+ my $perc_subsaved = $subsave/($strings + $subsave) * 100;
printf STDERR "%d bytes in string tables\n",$strings;
printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
$saved, $perc_saved if $saved;
my $min_el;
if (exists $attr{'subchar'})
{
- my @byte;
- $attr{'subchar'} =~ /^\s*/cg;
- push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
- $erep = join('',map(chr(hex($_)),@byte));
+ #my @byte;
+ #$attr{'subchar'} =~ /^\s*/cg;
+ #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
+ #$erep = join('',map(chr(hex($_)),@byte));
+ $erep = $attr{'subchar'};
+ $erep =~ s/^\s+//; $erep =~ s/\s+$//;
}
print "Reading $name ($cs)\n";
my $nfb = 0;
}
}
-
-sub outstring
-{
- my ($fh,$name,$s) = @_;
- my $sym = $strings{$s};
- if ($sym)
- {
- $saved += length($s);
- }
- else
- {
- if ($opt{'O'}) {
- foreach my $o (keys %strings)
- {
- next unless (my $i = index($o,$s)) >= 0;
- $sym = $strings{$o};
- # gcc things that 0x0e+0x10 (anything with e+) starts to look like
- # a hexadecimal floating point constant. Silly gcc. Only p
- # introduces a floating point constant. Put the space in to stop it
- # getting confused.
- $sym .= sprintf(" +0x%02x",$i) if ($i);
- $subsave += length($s);
- return $strings{$s} = $sym;
- }
- }
- $strings{$s} = $sym = $name;
- $strings += length($s);
- my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
- # Maybe we should assert that these are all <256.
- $definition .= join(',',unpack "C*",$s);
- # We have a single long line. Split it at convenient commas.
- $definition =~ s/(.{74,77},)/$1\n/g;
- print $fh "$definition };\n\n";
- }
- return $sym;
-}
-
sub process
{
my ($name,$a) = @_;
$a->{'Entries'} = \@ent;
}
-sub outtable
+
+sub addstrings # Register the output bytes of every entry of table $a in %strings, print a forward declaration if $a is marked 'Forward', then recurse into linked tables.
{
my ($fh,$a) = @_;
my $name = $a->{'Cname'};
foreach my $b (@{$a->{'Entries'}})
{
next unless $b->[AGG_OUT_LEN];
- my $s = $b->[AGG_MIN_IN];
- my $e = $b->[AGG_MAX_IN];
- outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
+ $strings{$b->[AGG_OUT_BYTES]} = undef; # registers the string as a key (outbigstring reads only the keys)
}
if ($a->{'Forward'})
{
- print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
+ my $cpp = ($Config{d_cplusplus} || '') eq 'define';
+ my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static'; # 'extern' on MacOS or when d_cplusplus is defined, 'static' otherwise
+ my $const = $cpp ? '' : 'const';
+ print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
+ }
+ $a->{'DoneStrings'} = 1; # mark as visited before recursing so each table is handled once
+ foreach my $b (@{$a->{'Entries'}})
+ {
+ my ($s,$e,$out,$t,$end,$l) = @$b;
+ addstrings($fh,$t) unless $t->{'DoneStrings'}; # $t is the table this entry links to
+ }
+}
+
+sub outbigstring # Merge every key of %strings into one byte string and print it to $fh as the C array $name.
+{
+ my ($fh,$name) = @_; # $fh: output filehandle; $name: C identifier of the U8 array
+
+ $string_acc = '';
+
+ # Make the big string in the string accumulator. Longest first, in the hope
+ # that this makes it more likely that we find the short strings later on.
+ # Not sure if it helps to sort strings of the same length lexically.
+ foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
+ my $index = index $string_acc, $s;
+ if ($index >= 0) {
+ $saved += length($s); # already present as a substring; reuse that position
+ $strings_in_acc{$s} = $index;
+ } else {
+ OPTIMISER: {
+ if ($opt{'O'}) {
+ my $sublength = length $s;
+ while (--$sublength > 0) {
+ # progressively lop characters off the end, to see if the start of
+ # the new string overlaps the end of the accumulator.
+ if (substr ($string_acc, -$sublength)
+ eq substr ($s, 0, $sublength)) {
+ $subsave += $sublength;
+ $strings_in_acc{$s} = length ($string_acc) - $sublength;
+ # append the last bit on the end.
+ $string_acc .= substr ($s, $sublength);
+ last OPTIMISER;
+ }
+ # or if the end of the new string overlaps the start of the
+ # accumulator
+ next unless substr ($string_acc, 0, $sublength)
+ eq substr ($s, -$sublength);
+ # well, the first $sublength characters of the accumulator match.
+ # so as we're prepending to the accumulator, need to shift all our
+ # existing offsets forwards by the length of the part being prepended
+ $_ += length ($s) - $sublength foreach values %strings_in_acc; # not by the overlap: only the non-overlapping head of $s is added
+ $subsave += $sublength;
+ $strings_in_acc{$s} = 0;
+ # prepend the first bit to the start.
+ $string_acc = substr ($s, 0, -$sublength) . $string_acc;
+ last OPTIMISER;
+ }
+ }
+ # Optimiser (if it ran) found nothing, so we're just going to have to tack
+ # the whole thing on the end.
+ $strings_in_acc{$s} = length $string_acc;
+ $string_acc .= $s;
+ };
+ }
}
+
+ $strings = length $string_acc; # total size in bytes of the combined table
+ my $cpp = ($Config{d_cplusplus} || '') eq 'define';
+ my $var = $cpp ? '' : 'static'; # 'static' is omitted when d_cplusplus is defined
+ my $definition = "\n$var const U8 $name\[$strings] = { " .
+ join(',',unpack "C*",$string_acc);
+ # We have a single long line. Split it at convenient commas.
+ print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs; # /c keeps pos() where the last successful match ended
+ print $fh substr ($definition, pos ($definition) || 0), " };\n"; # pos is undef if nothing matched: print the whole definition
+}
+
+sub findstring { # Return the C expression "$name + offset" for string $s; dies if $s has no recorded offset.
+ my ($name,$s) = @_; # $name: C identifier of the big array; $s: the byte string to locate
+ my $offset = $strings_in_acc{$s}; # offset recorded while the accumulator was built
+ die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
+ unless defined $offset;
+ "$name + $offset"; # last expression evaluated is the return value
+}
+
+sub outtable
+{
+ my ($fh,$a,$bigname) = @_;
+ my $name = $a->{'Cname'};
$a->{'Done'} = 1;
foreach my $b (@{$a->{'Entries'}})
{
my ($s,$e,$out,$t,$end,$l) = @$b;
- outtable($fh,$t) unless $t->{'Done'};
+ outtable($fh,$t,$bigname) unless $t->{'Done'};
}
- print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
+ my $cpp = ($Config{d_cplusplus} || '') eq 'define';
+ my $var = $cpp ? '' : 'static';
+ my $const = $cpp ? '' : 'const';
+ print $fh "\n$var $const encpage_t $name\[",
+ scalar(@{$a->{'Entries'}}), "] = {\n";
foreach my $b (@{$a->{'Entries'}})
{
my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
- $end |= 0x80 if $fb;
+ # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
print $fh "{";
if ($l)
{
- printf $fh outstring($fh,'',$out);
+ printf $fh findstring($bigname,$out);
}
else
{
print $fh "};\n";
}
-sub output
-{
- my ($fh,$name,$a) = @_;
- process($name,$a);
- # Sub-tables
- outtable($fh,$a);
-}
-
sub output_enc
{
my ($fh,$name,$a) = @_;
$_Enc2xs
$_Version
$_Inc
+ $_E2X
$_Name
$_TableFiles
$_Now
);
+sub find_e2x{ # Search @INC (except '.') for directories containing *.e2x files; sets the global $_E2X to the chosen one and returns it.
+ eval { require File::Find; };
+ my (@inc, %e2x_dir); # %e2x_dir maps a directory to the mtime of the first .e2x file recorded in it
+ for my $inc (@INC){
+ push @inc, $inc unless $inc eq '.'; #skip current dir
+ }
+ File::Find::find(
+ sub {
+ my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
+ $atime,$mtime,$ctime,$blksize,$blocks)
+ = lstat($_) or return;
+ -f _ or return; # plain files only; "_" reuses the result of the lstat above
+ if (/^.*\.e2x$/o){
+ no warnings 'once';
+ $e2x_dir{$File::Find::dir} ||= $mtime; # keep the first (true) mtime seen for this directory
+ }
+ return;
+ }, @inc);
+ warn join("\n", keys %e2x_dir), "\n"; # prints every candidate directory on STDERR
+ for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
+ $_E2X = $d; # ascending mtime, so the smallest recorded mtime wins -- NOTE(review): confirm oldest-first is intended
+ # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
+ return $_E2X; # returns on the first iteration; the loop body never runs twice
+ }
+}
+
sub make_makefile_pl
{
eval { require Encode; };
# our used for variable expanstion
$_Enc2xs = $0;
$_Version = $VERSION;
- $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
+ $_E2X = find_e2x();
$_Name = shift;
$_TableFiles = join(",", map {qq('$_')} @_);
$_Now = scalar localtime();
- warn "Generating Makefile.PL\n";
- _print_expand("$_Inc/Makefile_PL.e2x", "Makefile.PL");
- warn "Generating $_Name.pm\n";
- _print_expand("$_Inc/_PM.e2x", "$_Name.pm");
- warn "Generating t/$_Name.t\n";
- _print_expand("$_Inc/_T.e2x", "t/$_Name.t");
- warn "Generating README\n";
- _print_expand("$_Inc/README.e2x", "README");
- warn "Generating t/$_Name.t\n";
- _print_expand("$_Inc/Changes.e2x", "Changes");
+
+ eval { require File::Spec; };
+ _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
+ _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
+ _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
+ _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
+ _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
exit;
}
+use vars qw(
+ $_ModLines
+ $_LocalVer
+ );
+
+sub make_configlocal_pm { # Handler for -C: scan @INC for Encode modules, then write ConfigLocal.pm into the directory beside the loaded Encode.pm; exits when done.
+ eval { require Encode; };
+ $@ and die "Unable to require Encode: $@\n";
+ eval { require File::Spec; };
+
+ # our used for variable expansion
+ my %in_core = map { $_ => 1 } ( # encoding names that are never written to the generated list
+ 'ascii', 'iso-8859-1', 'utf8',
+ 'ascii-ctrl', 'null', 'utf-8-strict'
+ );
+ my %LocalMod = (); # encoding name => module that was being loaded when it was first seen
+ # check @enc;
+ use File::Find (); # loaded at compile time, imports nothing
+ my $wanted = sub{
+ -f $_ or return; # plain files only
+ $File::Find::name =~ /\A\./ and return; # skip paths that start with a dot
+ $File::Find::name =~ /\.pm\z/ or return; # only .pm files
+ $File::Find::name =~ m/\bEncode\b/ or return; # whose path contains the word "Encode"
+ my $mod = $File::Find::name;
+ $mod =~ s/.*\bEncode\b/Encode/o; # drop everything before the last "Encode" in the path
+ $mod =~ s/\.pm\z//o;
+ $mod =~ s,/,::,og; # path separators become package separators
+ warn qq{ require $mod;\n};
+ eval qq{ require $mod; }; # presumably loading the module makes its encodings visible to Encode->encodings() -- TODO confirm
+ $@ and die "Can't require $mod: $@\n";
+ for my $enc ( Encode->encodings() ) {
+ no warnings;
+ $in_core{$enc} and next;
+ $Encode::Config::ExtModule{$enc} and next; # already has an entry in %Encode::Config::ExtModule
+ $LocalMod{$enc} ||= $mod; # the first module recorded for an encoding is kept
+ }
+ };
+ File::Find::find({wanted => $wanted}, @INC);
+ $_ModLines = "";
+ for my $enc ( sort keys %LocalMod ) {
+ $_ModLines .=
+ qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n); # one Perl assignment line per local encoding
+ }
+ warn $_ModLines;
+ $_LocalVer = _mkversion();
+ $_E2X = find_e2x();
+ $_Inc = $INC{"Encode.pm"};
+ $_Inc =~ s/\.pm$//o; # path of the loaded Encode.pm minus ".pm", used as the target directory
+ _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
+ File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 ); # 1 = clobber: overwrite an existing ConfigLocal.pm
+ exit;
+}
+
+sub _mkversion{ # Return the current epoch time; the caller stores it in $_LocalVer.
+ # v-strings are now deprecated; use time() instead.
+ #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
+ #$yyyy += 1900, $mo +=1;
+ #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
+ return time();
+}
+
sub _print_expand{
eval { require File::Basename; };
$@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
File::Basename->import();
- my ($src, $dst) = @_;
+ my ($src, $dst, $clobber) = @_; # copy template $src to $dst; $clobber true means overwrite an existing $dst
+ if (!$clobber and -e $dst){
+ warn "$dst exists. skipping\n";
+ return; # leave the existing file untouched
+ }
+ warn "Generating $dst...\n";
open my $in, $src or die "$src : $!";
if ((my $d = dirname($dst)) ne '.'){
- -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
+ -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; # create the target directory if it is missing
}
open my $out, ">$dst" or die "$!";
my $asis = 0;
while (<$in>){
- if (/^#### END_OF_HEADER/){
- $asis = 1; next;
- }
- s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
- print $out $_;
+ if (/^#### END_OF_HEADER/){
+ $asis = 1; next; # the marker line itself is not copied; later lines are copied verbatim
+ }
+ s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis; # replace $_Name_ style placeholders with the value of $_Name (double eval)
+ print $out $_;
}
}
__END__
=head1 SYNOPSIS
- enc2xs -M ModName mapfiles...
enc2xs -[options]
+ enc2xs -M ModName mapfiles...
+ enc2xs -C
=head1 DESCRIPTION
F<enc2xs> builds a Perl extension for use by Encode from either
-Unicode Character Mapping files (.ucm) or Tcl Encoding Files
-(.enc) Besides internally used during the build process of Encode
-module, you can use F<enc2xs> to add your own encoding to perl. No
-knowledge on XS is necessary.
+Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
+Besides being used internally during the build process of the Encode
+module, you can use F<enc2xs> to add your own encoding to perl.
+No knowledge of XS is necessary.
=head1 Quick Guide
-If what you want to know as little about Perl possible but needs to
+If you want to know as little about Perl as possible but need to
add a new encoding, just read this chapter and forget the rest.
=over 4
=item 0.
-Have a .ucm file ready. You can get it from somewhere or you can
-write your own from scratch or you can grab one from Encode
-distribution and customize. For UCM format, see the next Chapter.
-In the example below, I'll call my theoretical encoding myascii,
-defined inI<my.ucm>. C<$> is a shell prompt.
+Have a .ucm file ready. You can get it from somewhere or you can write
+your own from scratch or you can grab one from the Encode distribution
+and customize it. For the UCM format, see the next Chapter. In the
+example below, I'll call my theoretical encoding myascii, defined
+in I<my.ucm>. C<$> is a shell prompt.
$ ls -F
my.ucm
$ ls -F
Makefile.PL My.pm my.ucm t/
-The following files are created.
+The following files were created.
+
+ Makefile.PL - MakeMaker script
+ My.pm - Encode submodule
+ t/My.t - test file
- Makefle.PL - MakeMaker script
- My.pm - Encode Submodule
- t/My.t - test file
+=over 4
=item 1.1.
$ mv *.ucm Encode
$ enc2xs -M My Encode/*ucm
+=back
+
=item 2.
Edit the files generated. You don't have to if you have no time AND no
intention to give it to someone else. But it is a good idea to edit
-pod and add more tests.
+the pod and to add more tests.
=item 3.
-Now issue a command all Perl Mongers love;
+Now issue a command all Perl Mongers love:
- $ perl5.7.3 Makefile.PL
+ $ perl Makefile.PL
Writing Makefile for Encode::My
=item 4.
Reading myascii (myascii)
Writing compiled form
128 bytes in string tables
- 384 bytes (25%) saved spotting duplicates
- 1 bytes (99.2%) saved using substrings
+ 384 bytes (75%) saved spotting duplicates
+ 1 bytes (0.775%) saved using substrings
....
chmod 644 blib/arch/auto/Encode/My/My.bs
$
-The time it takes varies how fast your machine is and how large your
-encoding is. Unless you are working on something big like euc-tw, it
-won't take too long.
+The time it takes varies depending on how fast your machine is and
+how large your encoding is. Unless you are working on something big
+like euc-tw, it won't take too long.
=item 5.
If you are content with the test result, just "make install"
+=item 7.
+
+If you want to add your encoding to Encode's demand-loading list
+(so you don't have to "use Encode::YourEncoding"), run
+
+ enc2xs -C
+
+to update Encode::ConfigLocal, a module that controls local settings.
+After that, "use Encode;" is enough to load your encodings on demand.
+
=back
=head1 The Unicode Character Map
-Encode uses The Unicode Character Map (UCM) for source character
-mappings. This format is used by ICU package of IBM and adopted by
-Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map
-and far more user-friendly, This is the recommended formet for
-Encode now.
+Encode uses the Unicode Character Map (UCM) format for source character
+mappings. This format is used by IBM's ICU package and was adopted
+by Nick Ing-Simmons for use with the Encode module. Since UCM is
+more flexible than Tcl's Encoding Map and far more user-friendly,
+this is the recommended format for Encode now.
-UCM file looks like this.
+A UCM file looks like this.
#
# Comments
=item *
-Anything that follows C<#> is treated as comments.
+Anything that follows C<#> is treated as a comment.
=item *
-The header section continues until CHARMAP. This section Has a form of
-I<E<lt>keywordE<gt> value>, one at a line. For a value, strings must
-be quoted. Barewords are treated as numbers. I<\xXX> represents a
-byte.
+The header section continues until a line containing the word
+CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
+pair per line. Strings used as values must be quoted. Barewords are
+treated as numbers. I<\xXX> represents a byte.
Most of the keywords are self-explanatory. I<subchar> means
substitution character, not subcharacter. When you decode a Unicode
sequence to this encoding but no matching character is found, the byte
sequence defined here will be used. For most cases, the value here is
-\x3F, in ASCII this is a question mark.
+\x3F; in ASCII, this is a question mark.
=item *
CHARMAP starts the character map section. Each line has a form as
-follows;
+follows:
<UXXXX> \xXX.. |0 # comment
^ ^ ^
| +-------- Encoded byte sequence
+-------------- Unicode Character ID in hex
-The format is roughly the same as a header section except for fallback
-flag. It is | followed by 0..3. And their meaning as follows
+The format is roughly the same as a header section except for the
+fallback flag: | followed by 0..3. The meaning of the possible
+values is as follows:
-=over 2
+=over 4
=item |0
-Round trip safe. A character decoded to Unicode encodes back to the
-same byte sequence. most character belong to this.
+Round trip safe. A character decoded to Unicode encodes back to the
+same byte sequence. Most characters have this flag.
=item |1
Fallback for unicode -> encoding. When seen, enc2xs adds this
-character for encode map only
+character for the encode map only.
=item |2
=item |3
Fallback for encoding -> unicode. When seen, enc2xs adds this
-character for decode map only
+character for the decode map only.
=back
=back
-Needless to say, if you are manually creating a UCM file, you should
-copy ascii.ucm or existing encoding which is close to yours than write
-your own from scratch.
+When you are manually creating a UCM file, you should copy ascii.ucm
+or an existing encoding which is close to yours, rather than write
+your own from scratch.
When you do so, make sure you leave at least B<U0000> to B<U0020> as
-is, unless your environment is on EBCDIC.
+is, unless your environment is EBCDIC.
B<CAVEAT>: not all features in UCM are implemented. For example,
icu:state is not used. Because of that, you need to write a perl
-module if you want to support algorithmical encodings, notablly
-ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
+module if you want to support algorithmical encodings, notably
+the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
+=head2 Coping with duplicate mappings
+
+When you create a map, you SHOULD make your mappings round-trip safe.
+That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
+$data> holds for all characters that are marked as C<|0>. Here is
+how to make sure:
+
+=over 4
+
+=item *
+
+Sort your map in Unicode order.
+
+=item *
+
+When you have a duplicate entry, mark either one with '|1' or '|3'.
+
+=item *
+
+And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
+
+=back
+
+Here is an example from big5-eten.
+
+ <U2550> \xF9\xF9 |0
+ <U2550> \xA2\xA4 |3
+
+Internally, the Encoding -> Unicode and Unicode -> Encoding maps look like
+this:
+
+ E to U U to E
+ --------------------------------------
+ \xF9\xF9 => U2550 U2550 => \xF9\xF9
+ \xA2\xA4 => U2550
+
+So it is round-trip safe for \xF9\xF9. But if the two lines above are in
+the opposite order, here is what happens.
+
+ E to U U to E
+ --------------------------------------
+ \xA2\xA4 => U2550 U2550 => \xF9\xF9
+ (\xF9\xF9 => U2550 is now overwritten!)
+
+The Encode package comes with F<ucmlint>, a crude but sufficient
+utility to check the integrity of a UCM file. Check under the
+Encode/bin directory for this.
+
+When in doubt, you can use F<ucmsort>, yet another utility under
+the Encode/bin directory.
+
=head1 Bookmarks
+=over 4
+
+=item *
+
ICU Home Page
L<http://oss.software.ibm.com/icu/>
+=item *
+
ICU Character Mapping Tables
L<http://oss.software.ibm.com/icu/charset/>
+=item *
+
ICU:Conversion Data
L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
+=back
+
=head1 SEE ALSO
L<Encode>,