ext/Encode/lib/Encode/Alias.pm Encode extension
ext/Encode/lib/Encode/CN/HZ.pm Encode extension
ext/Encode/lib/Encode/Details.pod Encode extension
+ext/Encode/lib/Encode/EncFormat.pod Encode extension
ext/Encode/lib/Encode/Encoding.pm Encode extension
ext/Encode/lib/Encode/Internal.pm Encode extension
ext/Encode/lib/Encode/iso10646_1.pm Encode extension
ext/Encode/lib/Encode/JP/Constants.pm Encode extension
ext/Encode/lib/Encode/JP/H2Z.pm Encode extension
ext/Encode/lib/Encode/JP/ISO_2022_JP.pm Encode extension
+ext/Encode/lib/Encode/JP/ISO_2022_JP_1.pm Encode extension
ext/Encode/lib/Encode/JP/JIS.pm Encode extension
ext/Encode/lib/Encode/Supported.pod Encode extension
ext/Encode/lib/Encode/Tcl.pm Encode extension
ext/Encode/lib/Encode/Unicode.pm Encode extension
ext/Encode/lib/Encode/utf8.pm Encode extension
ext/Encode/lib/Encode/XS.pm Encode extension
-ext/Encode/lib/EncodeFormat.pod Encode extension
ext/Encode/Makefile.PL Encode extension makefile writer
ext/Encode/MANIFEST Encode extension
ext/Encode/README Encode extension
die "Encode::CN not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use Encode::CN::HZ;
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
-F<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
+L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
to find why it is implemented that way.
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 0.97 2002/03/23 20:24:42 dankogai Exp dankogai $
+# $Id: Changes,v 0.98 2002/03/24 15:43:37 dankogai Exp dankogai $
#
+0.98 Mon Mar 25 2002
+! lib/Encode/Supported.pod
+ Further pod fixes
++ lib/Encode/JP/ISO_2022_JP_1.pm
+! lib/Encode/JP/ISO_2022_JP.pm
+! lib/Encode/JP/JIS.pm
+! JP/JP.pm
+ Now Encode::JP is more strict on the difference between ISO-2022-JP
+ and ISO-2022-JP-1. See JP/JP.pm for details. I hope this move
+ makes Anton happier :) FYI the previous version implements
+ ISO-2022-JP as ISO-2022-JP-1 since it had X0212 support.
+! lib/Encode/Supported.pod
+ Further pod fixes
+! Encode.xs
+ Avoid core-dump in Encode with PERLIO=mmap by NI-S
+ Message-Id: <20020324104139.1326.7@bactrian.ni-s.u-net.com>
+! CN/CN.pm
+! JP/JP.pm
+! KR/KR.pm
+! TW/TW.pm
+! lib/Encode/Supported.pod
+ pod fixes to replace F<http://...> with L<http://...>,
+ as suggested by Autrijus in:
+ Message-Id: <20020324083943.GA14901@not.autrijus.org>
+! lib/Encode/Supported.pod
+ fixes and enhancements by Anton
+ Message-Id: <10632060120.20020324103753@motor.ru>
+! lib/Encode/Alias.pm
+ > define_alias( qr/^GB[- ]?(\d+)$/i => '"gb$1"' );
+ added. Suggested by Anton then deobfuscated by Autrijus
+ Message-Id: <20020324064455.GA3667@not.autrijus.org>
+! compile
+ Further fix by Nicholas Clark
+ Message-Id: <20020323145840.GD304@Bagpuss.unfortu.net>
+- lib/EncodeFormat.pod
++ lib/Encode/EncFormat.pod
+! MANIFEST
+ File renamed as suggested by Autrijus
+! Encode.pm
+! lib/Encode/Details.pod
+! lib/Encode/Supported.pod Sun Mar 24 13:29:35 2002
+! Encode.pm Sun Mar 24 13:43:47 2002
+ pod fixes by Autrijus.
+ Message-Id: <20020324062804.GA3595@not.autrijus.org>
+ Message-Id: <20020324075627.GB11986@not.autrijus.org>
+! t/Alias.t
+! lib/Encode/Alias.pm
+! Encode.pm
+ now more EBCDIC conscious;
+ %ExtModules on EBCDIC system excludes CJK so that you don't
+ have to worry about the matched alias resulting cloaking.
+ t/Alias.t also revised to reflect changes. Verified by jhi
+ Message-Id: <20020324022929.D22596@alpha.hut.fi>
+
0.97 Sun Mar 24 2002
! CN/CN.pm
! KR/KR.pm
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
require DynaLoader;
use Carp;
+our $ON_EBCDIC = (ord("A") == 193);
use Encode::Alias;
# Make a %Encoding package variable to allow a certain amount of cheating
'posix-bc' => 'Encode/EBCDIC.pm',
symbol => 'Encode/Symbol.pm',
dingbats => 'Encode/Symbol.pm',
- 'euc-cn' => 'Encode/CN.pm',
- gb2312 => 'Encode/CN.pm',
- gb12345 => 'Encode/CN.pm',
- gbk => 'Encode/CN.pm',
- cp936 => 'Encode/CN.pm',
- 'iso-ir-165' => 'Encode/CN.pm',
- 'euc-jp' => 'Encode/JP.pm',
- 'iso-2022-jp' => 'Encode/JP.pm',
- '7bit-jis' => 'Encode/JP.pm',
- shiftjis => 'Encode/JP.pm',
- macjapan => 'Encode/JP.pm',
- cp932 => 'Encode/JP.pm',
- 'euc-kr' => 'Encode/KR.pm',
- ksc5601 => 'Encode/KR.pm',
- cp949 => 'Encode/KR.pm',
- big5 => 'Encode/TW.pm',
- 'big5-hkscs' => 'Encode/TW.pm',
- cp950 => 'Encode/TW.pm',
- gb18030 => 'Encode/HanExtra.pm',
- big5plus => 'Encode/HanExtra.pm',
- 'euc-tw' => 'Encode/HanExtra.pm',
);
for my $k (2..11,13..16){
$ExtModule{"cp$k"} = 'Encode/Byte.pm';
}
+unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env
+%ExtModule =(
+ %ExtModule,
+ 'euc-cn' => 'Encode/CN.pm',
+ gb2312 => 'Encode/CN.pm',
+ gb12345 => 'Encode/CN.pm',
+ gbk => 'Encode/CN.pm',
+ cp936 => 'Encode/CN.pm',
+ 'iso-ir-165' => 'Encode/CN.pm',
+ 'euc-jp' => 'Encode/JP.pm',
+ 'iso-2022-jp' => 'Encode/JP.pm',
+ 'iso-2022-jp-1' => 'Encode/JP.pm',
+ '7bit-jis' => 'Encode/JP.pm',
+ shiftjis => 'Encode/JP.pm',
+ macjapan => 'Encode/JP.pm',
+ cp932 => 'Encode/JP.pm',
+ 'euc-kr' => 'Encode/KR.pm',
+ ksc5601 => 'Encode/KR.pm',
+ cp949 => 'Encode/KR.pm',
+ big5 => 'Encode/TW.pm',
+ 'big5-hkscs' => 'Encode/TW.pm',
+ cp950 => 'Encode/TW.pm',
+ gb18030 => 'Encode/HanExtra.pm',
+ big5plus => 'Encode/HanExtra.pm',
+ 'euc-tw' => 'Encode/HanExtra.pm',
+ );
+}
+
for my $k (qw(centeuro croatian cyrillic dingbats greek
iceland roman rumanian sami
thai turkish ukraine))
and the rest of the system. Perl strings are sequences of B<characters>.
To find more about character encodings, please consult
-L<Encode::Details> . This document focuses on programming references.
+L<Encode::Details>. This document focuses on programming references.
=head1 PERL ENCODING API
=over 4
-=item *
-
- $bytes = encode(ENCODING, $string[, CHECK])
+=item $bytes = encode(ENCODING, $string[, CHECK])
Encodes string from Perl's internal form into I<ENCODING> and returns
a sequence of octets. For CHECK see L</"Handling Malformed Data">.
$octets = encode("utf8", $unicode);
-=item *
-
- $string = decode(ENCODING, $bytes[, CHECK])
+=item $string = decode(ENCODING, $bytes[, CHECK])
Decode sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string. For CHECK see
$utf8 = decode("latin1", $latin1);
-=item *
-
- from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
+=item from_to($string, FROM_ENCODING, TO_ENCODING[, CHECK])
Convert B<in-place> the data between two encodings. How did the data
in $string originally get to be in FROM_ENCODING? Either using
Multiple return values rather than in-place modifications.
-Index into the string could be pos($str) allowing s/\G...//.
+Index into the string could be C<pos($str)> allowing C<s/\G...//>.
=back
=head2 UTF-8 / utf8
The Unicode consortium defines the UTF-8 standard as a way of encoding
-the entire Unicode repertiore as sequences of octets. This encoding is
-expected to become very widespread. Perl can use this form internaly
+the entire Unicode repertoire as sequences of octets. This encoding is
+expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).
=over 4
-=item *
-
- $bytes = encode_utf8($string);
+=item $bytes = encode_utf8($string);
The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.
-=item *
-
- $string = decode_utf8($bytes [,CHECK]);
+=item $string = decode_utf8($bytes [, CHECK]);
The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
@with_jp = Encode->encodings("Encode/JP.pm");
-Note in this case you have to say "Encode/JP.pm instead of Encode::JP.
+Note in this case you have to say C<"Encode/JP.pm"> instead of
+C<"Encode::JP">.
-To find which encodings are suppoted by this package in details,
+To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.
=head2 Defining Aliases
use Encode;
use Encode::Alias;
- define_alias( newName => ENCODING);
+ define_alias(newName => ENCODING);
Allows newName to be used as am alias for ENCODING. ENCODING may be
either the name of an encoding or and encoding object (as above).
=head1 Defining Encodings
use Encode qw(define_alias);
- define_encoding( $object, 'canonicalName' [,alias...]);
+ define_encoding($object, 'canonicalName' [, alias...]);
Causes I<canonicalName> to be associated with I<$object>. The object
should provide the interface described in L<Encode::Encoding>
=over 4
-=item * is_utf8(STRING [, CHECK])
+=item is_utf8(STRING [, CHECK])
[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8. Returns true if successful, false otherwise.
-=item *
-
- _utf8_on(STRING)
+=item _utf8_on(STRING)
[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
B<not> checked for being well-formed UTF-8. Do not use unless you
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.
-=item *
-
- _utf8_off(STRING)
+=item _utf8_off(STRING)
[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
}
}
use Encode;
-our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load('Encode::JP',$VERSION);
use Encode::JP::JIS;
use Encode::JP::ISO_2022_JP;
+use Encode::JP::ISO_2022_JP_1;
1;
__END__
iso-2022-jp ISO-2022-JP
(7bit JIS with all Halfwidth Kana
converted to Fullwidth)
+ iso-2022-jp-1 ISO-2022-JP-1
+ (ISO-2022-JP with JIS X 0212-1990
+ support. See below)
macjapan Mac Japan (Shift JIS + Apple vendor mappings)
cp932 Code Page 932 (Shift JIS + MS/IBM vendor mappings)
--------------------------------------------------------------------
To find how to use this module in detail, see L<Encode>.
+=head1 Note on ISO-2022-JP(-1)?
+
+ISO-2022-JP-1 (RFC2237) is a superset of ISO-2022-JP (RFC1468) which
+adds support for JIS X 0212-1990. That means you can use the same
+code to decode to utf8 but not vice versa.
+
+ $utf8 = decode('iso-2022-jp-1', $stream);
+ $utf8 = decode('iso-2022-jp', $stream);
+
+Yields the same result but
+
+ $with_0212 = encode('iso-2022-jp-1', $utf8);
+
+is now different from
+
+ $without_0212 = encode('iso-2022-jp', $utf8 );
+
+In the latter case, characters that map to 0212 are first converted
+to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu') and then
+fed to the decoding engine. U+FFFD is not used, in order to preserve text
+layout as much as possible.
+
=head1 BUGS
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
-F<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
+L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
to find why it is implemented that way.
die "Encode::KR not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
-F<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
+L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
to find why it is implemented that way.
lib/Encode/JP/Constants.pm Encode extension
lib/Encode/JP/H2Z.pm Encode extension
lib/Encode/JP/ISO_2022_JP.pm Encode extension
+lib/Encode/JP/ISO_2022_JP_1.pm Encode extension
lib/Encode/JP/JIS.pm Encode extension
lib/Encode/Supported.pod Documents supported encodings
lib/Encode/Tcl.pm Encode extension
lib/Encode/iso10646_1.pm Encode extension
lib/Encode/ucs2_le.pm Encode extension
lib/Encode/utf8.pm Encode extension
-lib/EncodeFormat.pod Encode extension
+lib/Encode/EncFormat.pod Encode extension
t/Aliases.t Encode extension test
t/CN.t Encode extension test
t/Encode.t Encode extension test
die "Encode::TW not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 0.97 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
-F<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
+L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
to find why it is implemented that way.
package Encode::Alias;
use strict;
use Encode;
-our $VERSION = do { my @r = (q$Revision: 0.96 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
require Exporter;
my $new;
if (ref($alias) eq 'Regexp' && $_ =~ $alias)
{
+ $DEBUG and warn "eval $val";
$new = eval $val;
# $@ and warn "$val, $@";
}
elsif (ref($alias) eq 'CODE')
{
+ $DEBUG and warn "$alias", "->", "($val)";
$new = $alias->($val);
}
elsif (lc($_) eq lc($alias))
if (defined($new))
{
next if $new eq $_; # avoid (direct) recursion on bugs
+ $DEBUG and warn "$alias, $new";
my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
if ($enc)
{
}
}
}
+ if ($DEBUG){
+ my $name;
+ if (my $e = $Alias{$_}){
+ $name = $e->name;
+ }else{
+ $name = "";
+ }
+ warn "find_alias($class, $_)->name = $name";
+ }
return $Alias{$_};
}
for my $k (@a){
if (ref($alias) eq 'Regexp' && $k =~ $alias)
{
- $DEBUG and warn $k;
+ $DEBUG and warn "delete \$Alias\{$k\}";
delete $Alias{$k};
}
elsif (ref($alias) eq 'CODE')
{
+ $DEBUG and warn "delete \$Alias\{$k\}";
delete $Alias{$alias->($name)};
}
}
}else{
+ $DEBUG and warn "delete \$Alias\{$alias\}";
delete $Alias{$alias};
}
}
define_alias( qr/^macRomanian$/i => '"macRumanian"');
# Standardize on the dashed versions.
- define_alias( qr/^utf8$/i => 'utf-8' );
+ # define_alias( qr/^utf8$/i => 'utf-8' );
define_alias( qr/^koi8r$/i => 'koi8-r' );
define_alias( qr/^koi8u$/i => 'koi8-u' );
-# for Encode::CN
- define_alias( qr/euc.*cn$/i => '"euc-cn"' );
- define_alias( qr/cn.*euc/i => '"euc-cn"' );
-
-# for Encode::JP
- define_alias( qr/euc.*jp$/i => '"euc-jp"' );
- define_alias( qr/jp.*euc/i => '"euc-jp"' );
- define_alias( qr/ujis$/i => '"euc-jp"' );
- define_alias( qr/shift.*jis$/i => '"shiftjis"' );
- define_alias( qr/sjis$/i => '"shiftjis"' );
- define_alias( qr/^jis$/i => '"7bit-jis"' );
-
-# for Encode::KR
- define_alias( qr/euc.*kr$/i => '"euc-kr"' );
- define_alias( qr/kr.*euc/i => '"euc-kr"' );
-
-# for Encode::TW
- define_alias( qr/big-?5$/i => '"big5"' );
- define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' );
+ unless ($Encode::ON_EBCDIC){
+ # for Encode::CN
+ define_alias( qr/euc.*cn$/i => '"euc-cn"' );
+ define_alias( qr/cn.*euc/i => '"euc-cn"' );
+ define_alias( qr/^GB[- ]?(\d+)$/i => '"gb$1"' );
+ # for Encode::JP
+ define_alias( qr/euc.*jp$/i => '"euc-jp"' );
+ define_alias( qr/jp.*euc/i => '"euc-jp"' );
+ define_alias( qr/ujis$/i => '"euc-jp"' );
+ define_alias( qr/shift.*jis$/i => '"shiftjis"' );
+ define_alias( qr/sjis$/i => '"shiftjis"' );
+ define_alias( qr/^jis$/i => '"7bit-jis"' );
+ # for Encode::KR
+ define_alias( qr/euc.*kr$/i => '"euc-kr"' );
+ define_alias( qr/kr.*euc/i => '"euc-kr"' );
+ # for Encode::TW
+ define_alias( qr/big-?5$/i => '"big5"' );
+ define_alias( qr/big5-hk(?:scs)?/i => '"big5-hkscs"' );
+ }
# At last, Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
-
=head1 NAME
-Encode - character encodings
-
-=head1 SYNOPSIS
-
- use Encode;
+Encode::Details - implementation details of Encode.pm
=head1 DESCRIPTION
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).
-Traditionaly computer data has been moved around in 8-bit chunks
+Traditionally computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of
many types - not only strings of characters representing human or
Not really very "encoded" encodings. The Unicode code points
are just represented as 4-octet integers. None the less because
different architectures use different representations of integers
-(so called "endian") there at least two disctinct encodings.
+(so-called "endian") there are at least two distinct encodings.
=item * Multi-byte encodings
UTF-16 KOI8-U ISO-2022-JP-2
-are IANA-registered preferred MIME names but probably shoule
+are IANA-registered preferred MIME names but probably should
be avoided as encoding for web pages due to lack of browser
support.
=head2 UTF-8 / utf8
The Unicode consortium defines the UTF-8 standard as a way of encoding
-the entire Unicode repertiore as sequences of octets. This encoding is
-expected to become very widespread. Perl can use this form internaly
+the entire Unicode repertoire as sequences of octets. This encoding is
+expected to become very widespread. Perl can use this form internally
to represent strings, so conversions to and from this form are
particularly efficient (as octets in memory do not have to change,
just the meta-data that tells Perl how to treat them).
=over 4
-=item *
-
- $bytes = encode_utf8($string);
+=item $bytes = encode_utf8($string);
The characters that comprise string are encoded in Perl's superset of UTF-8
and the resulting octets returned as a sequence of bytes. All possible
characters have a UTF-8 representation so this function cannot fail.
-=item *
-
- $string = decode_utf8($bytes [,CHECK]);
+=item $string = decode_utf8($bytes [, CHECK]);
The sequence of octets represented by $bytes is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
In this case if I<ENCODING> is not a reference it is C<eval>-ed to
-allow C<$1> etc. to be subsituted. The example is one way to names as
+allow C<$1> etc. to be substituted. The example is one way to names as
used in X11 font names to alias the MIME names for the iso-8859-*
family. Note the double quote inside the single quote. If you are
-using regex here, y ou have to do so or it won't work in this case.
+using regex here, you have to do so or it won't work in this case.
=item As a code reference, e.g.:
=over 4
-=item * is_utf8(STRING [, CHECK])
+=item is_utf8(STRING [, CHECK])
[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8. Returns true if successful, false otherwise.
-=item *
-
- _utf8_on(STRING)
+=item _utf8_on(STRING)
[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
B<not> checked for being well-formed UTF-8. Do not use unless you
state of the UTF-8 flag (so please don't test the return value as
I<not> success or failure), or C<undef> if STRING is not a string.
-=item *
-
- _utf8_off(STRING)
+=item _utf8_off(STRING)
[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't test the
L<perlunicode>, L<perlebcdic>, L<perlfunc/open>, L<PerlIO>, L<encoding>,
L<utf8>, the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
-
=cut
-
=head1 NAME
-EncodeFormat - the format of encoding tables of the Encode extension
+Encode::EncFormat - the format of encoding tables of the Encode/*.enc files
=head1 DESCRIPTION
use base 'Encode::Encoding';
use vars qw($VERSION);
-$VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
my $canon = 'iso-2022-jp';
my $obj = bless {name => $canon}, __PACKAGE__;
my ($obj,$str,$chk) = @_;
my $euc = Encode::encode('euc-jp', $str, $chk);
&Encode::JP::H2Z::h2z(\$euc);
- return &Encode::JP::JIS::euc_jis(\$euc);
+ return &Encode::JP::JIS::euc_jis_nox0212(\$euc);
}
1;
--- /dev/null
+package Encode::JP::ISO_2022_JP_1;
+use Encode::JP;
+use Encode::JP::JIS;
+use Encode::JP::H2Z;
+use base 'Encode::Encoding';
+
+use vars qw($VERSION);
+$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+
+my $canon = 'iso-2022-jp-1';
+my $obj = bless {name => $canon}, __PACKAGE__;
+$obj->Define($canon);
+
+sub name { return $_[0]->{name}; }
+
+#
+# decode simply delegates to the '7bit-jis' encoding's decoder
+#
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ return Encode::decode('7bit-jis', $str, $chk);
+}
+
+# iso-2022-jp-1 encode: the text is first encoded as euc-jp, h2z() is
+# applied (x201 Hankaku -> x208 Zenkaku), then euc_jis() builds the result
+
+sub encode
+{
+ my ($obj,$str,$chk) = @_;
+ my $euc = Encode::encode('euc-jp', $str, $chk);
+ &Encode::JP::H2Z::h2z(\$euc);
+ return &Encode::JP::JIS::euc_jis(\$euc); # no euc_jis_nox0212 filtering here
+}
+
+1;
+__END__
use strict;
use vars qw($VERSION);
-$VERSION = do { my @r = (q$Revision: 0.94 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+$VERSION = do { my @r = (q$Revision: 0.98 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# Just for the time being, we implement jis-7bit
# encoding via EUC
$$r_str;
}
+sub euc_jis_nox0212{ # euc_jis() variant that first filters out $RE{EUC_0212} matches
+ my $r_str = shift; # reference to the string; the referent is modified in place
+ $$r_str =~ s/$RE{EUC_0212}/$CHARCODE{UNDEF_EUC}/go; # every match becomes $CHARCODE{UNDEF_EUC}
+ euc_jis($r_str); # last statement, so its value is this sub's return value
+}
+
1;
__END__
As of Perl 5.8.0, at least the following encodings are recognized.
Note that unless otherwise specified, they are all case insensitive
-(via alias) and all occurance of spaces are replaced with '-'. In
+(via alias) and all occurrences of spaces are replaced with '-'. In
other words, "ISO 8859 1" and "iso-8859-1" are identical.
Encodings are categorized and implemented in several different modules
non-ASCII characters.
-----------------------
- iso-8859-1 latin
+ (iso-8859-1 is built-in)
iso-8859-2 latin2
iso-8859-3 latin3
iso-8859-4 latin4
- iso-8859-5 latin
- iso-8859-6 latin
+ iso-8859-5
+ iso-8859-6
iso-8859-7
iso-8859-8
iso-8859-9 latin5
=head2 The CJK: Chinese, Japanese, Korean (Multibyte)
Note Vietnamese is listed above. Also read "Encoding vs Charset"
-below. Also note these are impelemented in distinct module by
-languages, due the the size concerns. See these perldocs also.
+below. Also note these are implemented in distinct modules by
+language, due to size concerns. Please also refer to their
+respective documentation pages.
=over 4
cp932
euc-jp ujis
iso-2022-jp
+ iso-2022-jp-1
macjapan
shiftjis Shift_JIS, sjis
-----------------------
posix-bc
-----------------------
-=item Enocode::Symbols
+=item Encode::Symbols
For symbols and dingbats.
Encoding actually maps charset(s) to stream of bits.
-Note a given encoding contains multiple charsets. For instance,
-euc-jp contains ASCII, JIS X 0201 (Hankaku Kana), JIS X 0208 (Zenkaku
-Kana and Kanji) and JIS X 0212 (Extended Kanji) in a single encoding.
+Note a given encoding may contain multiple charsets and complex CJK
+encodings are usually implemented that way.
+
+For instance, euc-jp contains ASCII, JIS X 0201-1978 (Hankaku Kana),
+JIS X 0208-1997 (Zenkaku Kana and Kanji) and JIS X 0212-1990 (Extended
+Kanji) in a single encoding.
As the name suggests, the Encode module supports encodings, not
individual charsets.
-=head1 Encoding Classification (by Anton Tagunov)
+=head1 Encoding Classification (by Anton Tagunov and Dan Kogai)
+
+This section tries to classify the supported encodings by their
+applicability for information exchange over the Internet and to
+choose the most suitable aliases to name them in the context of
+such communication.
+
+Encoding names
-Encodings
+ US-ASCII UTF-8
+ ISO-8859-* KOI8-R
+ Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1
+ EUC-KR
+ Big5
- US-ASCII UTF-8 KOI8-R ISO-8859-*
- ISO-2022-CN ISO-2022-JP Big5
- EUC-CN EUC-JP EUC-KR
+are L<http://www.iana.org/assignments/character-sets>-registered as
+preferred MIME names and may probably be used over the Internet.
-are <http://www.iana.org/assignments/character-sets>-registered as
-preferred MIME names and may probably be used over the Internet. So is
+C<Shift_JIS> is no longer Microsoft proprietary since it has been
+made official by JIS X 0208-1997. It is probably the most widespread
+encoding for Japanese on the Internet.
- Shift_JIS
+ EUC-CN
-but despite its wide spread it bears the label of being
-Microsft proprietary -- was. Now Shift JIS is official as of
-JIS X 0208-1997.
+has not been registered with IANA (as of March 2002) but
+seems to be supported by major web browsers. (IANA has registered
+this encoding as C<GB2312>, but C<gb2312> currently has a different
+meaning to the C<Encode> module. It will probably become an alias for
+C<EUC-CN> in the future; until then it is safer to avoid using
+C<gb2312> as an encoding name within Perl.)
- UTF-16 KOI8-U
+ UTF-16
+ KOI8-U (http://www.faqs.org/rfcs/rfc2319.html)
-are IANA-registered preferred MIME names but probably
-shoule be avoided as encoding for web pages due to lack of
-browser support.
+are IANA-registered (C<UTF-16> even as a preferred MIME name)
+but probably should be avoided as encoding for web pages due to
+lack of browser support.
- ISO-2022 (http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM)
- ISO-2022-JP-1 (http://www.faqs.org/rfcs/rfc2237.html)
ISO-IR-165 (http://www.faqs.org/rfcs/rfc1345.html)
GBK
VISCII
- GB 12345 (only plains 1 and 2 available)
- GB 18030
- CNS 11643
+ GB 12345
+ GB 18030 (*) (see links below)
+ EUC-TW (*)
are totally valid encodings but not registered at IANA.
+The names under which they are listed here are probably the
+most widely-known names for these encodings and are recommended
+names.
+
+
+=for comment this used to be listed as supported but
- BIG5PLUS
- EUC-JP-0212 (Encode::lib::Encode::Tcl::Extended)
+do not work @15457 when it's clear they will be uncommented
+or deleted - Anton
+ISO-2022 (http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM)
+CNS 11643 (only planes 1 and 2 available)
-are a bit proprietary
+ BIG5PLUS (*)
+
+is a somewhat proprietary name. C<(*)>-marked encodings belong to
+C<Encode::HanExtra>, available from CPAN.
You may probably get some info on CJK encodings at
brief description for most of the mentioned CJK encodings
-
-F<http://www.debian.org.ru/doc/manuals/intro-i18n/ch-codes.html>
+L<http://www.debian.org.ru/doc/manuals/intro-i18n/ch-codes.html>
several years old, but still useful
-
-F<http://www.oreilly.com/people/authors/lunde/cjk_inf.html>
+L<http://www.oreilly.com/people/authors/lunde/cjk_inf.html>
and some in-depth reading for the heroes :-)
-F<http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM> (eq ISO-2022)
+L<http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM> (eq C<ISO-2022>)
+
+gives brief info on C<EUC-CN>, C<GBK> and mostly on C<GB 18030>
+F<ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf>
+
+The nature of information in this section is most fragile and
+error-prone; I<probably> is the most popular adverb :)
+Please feel free to send your comments, disagreements and
+additions to L<...>. (Note, however,
+that the mission of this document is to cover the
+C<Encode>-supported encodings only.)
=head1 See Also
L<Encode>,
L<Encode::Byte>,
-L<Encode::CN>, L<Encode::JP>, L<Encode::KR>, L<Encode::TW>
+L<Encode::CN>, L<Encode::JP>, L<Encode::KR>, L<Encode::TW>,
L<Encode::EBCDIC>, L<Encode::Symbol>
=cut
use strict;
use Encode;
use Encode::Alias;
+my %a2c;
+my $ON_EBCDIC;
BEGIN {
- if (ord("A") == 193) {
- print "1..0 # Skip: EBCDIC\n";
- exit 0;
+ $ON_EBCDIC = ord("A") == 193;
+ @ARGV and $ON_EBCDIC = $ARGV[0] eq 'EBCDIC';
+ $Encode::ON_EBCDIC = $ON_EBCDIC;
+
+ %a2c = (
+ 'ascii' => 'US-ascii',
+ 'cyrillic' => 'iso-8859-5',
+ 'arabic' => 'iso-8859-6',
+ 'greek' => 'iso-8859-7',
+ 'hebrew' => 'iso-8859-8',
+ 'thai' => 'iso-8859-11',
+ 'tis620' => 'iso-8859-11',
+ 'WinLatin1' => 'cp1252',
+ 'WinLatin2' => 'cp1250',
+ 'WinCyrillic' => 'cp1251',
+ 'WinGreek' => 'cp1253',
+ 'WinTurkish' => 'cp1254',
+ 'WinHebrew' => 'cp1255',
+ 'WinArabic' => 'cp1256',
+ 'WinBaltic' => 'cp1257',
+ 'WinVietnamese' => 'cp1258',
+ 'ja_JP.euc' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'x-euc-jp' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'zh_CN.euc' => $ON_EBCDIC ? '' : 'euc-cn',
+ 'x-euc-cn' => $ON_EBCDIC ? '' : 'euc-cn',
+ 'ko_KR.euc' => $ON_EBCDIC ? '' : 'euc-kr',
+ 'x-euc-kr' => $ON_EBCDIC ? '' : 'euc-kr',
+ 'ujis' => $ON_EBCDIC ? '' : 'euc-jp',
+ 'Shift_JIS' => $ON_EBCDIC ? '' : 'shiftjis',
+ 'x-sjis' => $ON_EBCDIC ? '' : 'shiftjis',
+ 'jis' => $ON_EBCDIC ? '' : '7bit-jis',
+ 'big-5' => $ON_EBCDIC ? '' : 'big5',
+ 'zh_TW.Big5' => $ON_EBCDIC ? '' : 'big5',
+ 'big5-hk' => $ON_EBCDIC ? '' : 'big5-hkscs',
+ );
+
+ for my $i (1..11,13..16){
+ $a2c{"ISO 8859 $i"} = "iso-8859-$i";
+ }
+ for my $i (1..10){
+ $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]";
+ }
+ for my $k (keys %Encode::Alias::Winlatin2cp){
+ my $v = $Encode::Alias::Winlatin2cp{$k};
+ $a2c{"Win" . ucfirst($k)} = "cp" . $v;
+ $a2c{"IBM-$v"} = $a2c{"MS-$v"} = "cp" . $v;
}
}
-my %a2c;
-
-BEGIN {
- %a2c = (
- 'ascii' => 'US-ascii',
- 'cyrillic' => 'iso-8859-5',
- 'arabic' => 'iso-8859-6',
- 'greek' => 'iso-8859-7',
- 'hebrew' => 'iso-8859-8',
- 'thai' => 'iso-8859-11',
- 'tis620' => 'iso-8859-11',
- 'ja_JP.euc' => 'euc-jp',
- 'x-euc-jp' => 'euc-jp',
- 'zh_CN.euc' => 'euc-cn',
- 'x-euc-cn' => 'euc-cn',
- 'ko_KR.euc' => 'euc-kr',
- 'x-euc-kr' => 'euc-kr',
- 'ujis' => 'euc-jp',
- 'Shift_JIS' => 'shiftjis',
- 'x-sjis' => 'shiftjis',
- 'jis' => '7bit-jis',
- 'big-5' => 'big5',
- 'zh_TW.Big5' => 'big5',
- 'big5-hk' => 'big5-hkscs',
- 'WinLatin1' => 'cp1252',
- 'WinLatin2' => 'cp1250',
- 'WinCyrillic' => 'cp1251',
- 'WinGreek' => 'cp1253',
- 'WinTurkish' => 'cp1254',
- 'WinHebrew' => 'cp1255',
- 'WinArabic' => 'cp1256',
- 'WinBaltic' => 'cp1257',
- 'WinVietnamese' => 'cp1258',
- );
-
- for my $i (1..11,13..16){
- $a2c{"ISO 8859 $i"} = "iso-8859-$i";
- }
- for my $i (1..10){
- $a2c{"ISO Latin $i"} = "iso-8859-$Encode::Alias::Latin2iso[$i]";
- }
- for my $k (keys %Encode::Alias::Winlatin2cp){
- my $v = $Encode::Alias::Winlatin2cp{$k};
- $a2c{"Win" . ucfirst($k)} = "cp" . $v;
- $a2c{"IBM-$v"} = "cp" . $v;
- $a2c{"MS-$v"} = "cp" . $v;
- }
+if ($ON_EBCDIC){
+ delete @Encode::ExtModule{
+ qw(euc-cn gb2312 gb12345 gbk cp936 iso-ir-165
+ euc-jp iso-2022-jp 7bit-jis shiftjis macjapan cp932
+ euc-kr ksc5601 cp949
+ big5 big5-hkscs cp950
+ gb18030 big5plus euc-tw)
+ };
}
use Test::More tests => (scalar keys %a2c) * 3;
-print "# alias test\n";
+print "# alias test; \$ON_EBCDIC == $ON_EBCDIC\n";
foreach my $a (keys %a2c){
my $e = Encode::find_encoding($a);
# now we override some of the aliases and see if it works fine
-define_alias( qr/shift.*jis$/i => '"macjapan"' );
-define_alias( qr/sjis$/i => '"cp932"' );
+define_alias(ascii => 'WinLatin1',
+ cyrillic => 'WinCyrillic',
+ arabic => 'WinArabic',
+ greek => 'WinGreek',
+ hebrew => 'WinHebrew');
-@a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932);
+@a2c{qw(ascii cyrillic arabic greek hebrew)} =
+ qw(cp1252 cp1251 cp1256 cp1253 cp1255);
+
+unless ($ON_EBCDIC){
+ define_alias( qr/shift.*jis$/i => '"macjapan"',
+ qr/sjis$/i => '"cp932"' );
+ @a2c{qw(Shift_JIS x-sjis)} = qw(macjapan cp932);
+}
print "# alias test with alias overrides\n";