-# To give due honor to those who have made Encode module what is is today,
-# here are easily-from-changelogs-extractable people and their
-# (hopefully) current and preferred email addresses (as of early 2001,
+# To give due honour to those who have made the Encode module what it
+# is today, here are easily-from-changelogs-extractable people and their
+# (hopefully) current and preferred email addresses (as of early 2002,
# if known).
#
# The use of this database for anything else than Encode and/or Perl
# development is strictly forbidden. (Passive distribution with the Perl
-# source code kit or CPAN is naturally allowed.)
+# source code kit or CPAN is, of course, allowed.)
#
# This list is in alphabetical order.
--
Craig A. Berry <craigberry@mac.com>
Dan Kogai <dankogai@dan.co.jp>
Gerrit P. Haase <gp@familiehaase.de>
+Gurusamy Sarathy <gsar@activestate.com>
Jarkko Hietaniemi <jhi@iki.fi>
Jungshik Shin <jshin@mailaps.org>
Laszlo Molnar <ml1050@freemail.hu>
Mark-Jason Dominus <mjd@plover.com>
+Mattia Barbon <mbarbon@dsi.unive.it>
Michael G Schwern <schwern@pobox.com>
Nicholas Clark <nick@ccl4.org>
Nick Ing-Simmons <nick@ing-simmons.net>
package Encode::Byte;
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
=head1 SYNOPSIS
use Encode qw/encode decode/;
- $greek = encode("iso-885-7", $utf8); # loads Encode::Byte implicitly
+ $greek = encode("iso-8859-7", $utf8); # loads Encode::Byte implicitly
$utf8 = decode("iso-8859-7", $greek); # ditto
=head1 ABSTRACT
cp1251 WinCyrillic
cp1252 WinLatin1
cp1253 WinGreek
- cp1254 WinTurkiskh
+ cp1254 WinTurkish
cp1255 WinHebrew
cp1256 WinArabic
cp1257 WinBaltic
die "Encode::CN not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
mean C<euc-cn> encodings. To fix that, C<gb2312> is aliased to C<euc-cn>.
Use C<gb2312-raw> when you really mean it.
-ASCII part (0x00-0x7f) is preserved for all encodings, even though it
-conflicts with mappings by the Unicode Consortium. See
+The ASCII region (0x00-0x7f) is preserved for all encodings, even though
+this conflicts with mappings by the Unicode Consortium. See
L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
-to find why it is implemented that way.
+to find out why it is implemented that way.
=head1 SEE ALSO
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 1.52 2002/04/20 23:43:47 dankogai Exp dankogai $
+# $Id: Changes,v 1.56 2002/04/22 09:48:07 dankogai Exp dankogai $
#
-1.52 $Date: 2002/04/20 23:43:47 $
+$Revision: 1.56 $ $Date: 2002/04/22 09:48:07 $
+! Encode.pm encoding.pm t/perlio.t t/jperl.t
+ New PerlIO::encoding 0.04 compliance met
+
+1.55 2002/04/22 03:43:05
+! Encode.pm Encode.xs Unicode/Unicode.pm
+ needs_lines() defined so Encode::Encoding is no longer needed
+ for perlio
+
+1.54 2002/04/22 02:50:01
+! Encode.pm! Encode.xs! Unicode/Unicode.pm t/perlio.t
+! lib/Encode/Encoding.pm lib/Encode/CN/HZ.pm
+ now perlio_ok is true by default if PerlIO::encoding->VERSION is
+ 0.03 or larger. POD in Encode::Encoding revised to reflect this.
+ Encode::XS and Encode::Unicode now has perlio_ok() method.
+! lib/Encode/Supported.pod
+ s/UP-UX/HP-UX/ by jhi
+! AUTHORS Byte/Byte.pm CN/CN.pm Encode.pm JP/JP.pm KR/KR.pm README
+! Symbol/Symbol.pm TW/TW.pm Unicode/Unicode.pm bin/enc2xs bin/piconv
+! bin/ucmlint encoding.pm lib/Encode/Alias.pm lib/Encode/CN/HZ.pm
+! lib/Encode/Config.pm lib/Encode/Encoder.pm lib/Encode/Encoding.pm
+! lib/Encode/KR/2022_KR.pm lib/Encode/PerlIO.pod
+! lib/Encode/Supported.pod
+ Huge document fixes by Philip.
+! AUTHORS
+! t/JP.t
+ s/compare\(/compare_text\(/o by Sarathy. Adds him to AUTHORS
+ http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
+! t/perlio.t
+ binmode() after "<:encoding" to make Win32 happy, by Mattia.
+ Mattia added to AUTHORS file
+ Message-Id: <3CC3150F.5798.22A05AE@localhost>
+
+1.52 2002/04/20 23:43:47
! t/perlio.t
TODO: is now SKIP:, as NI-XS requested. Also adds more
eraborate failure analysis added.
Typo fixes and improvements by jhi
Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al.
-1.11 $Date: 2002/04/20 23:43:47 $
+1.11 $Date: 2002/04/22 09:48:07 $
+ t/encoding.t
+ t/jperl.t
! MANIFEST
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.52 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.56 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
use XSLoader ();
XSLoader::load 'Encode';
sub perlio_ok{
exists $INC{"PerlIO/encoding.pm"} or return 0;
- my $stash = ref($_[0]);
- $stash ||= ref(find_encoding($_[0]));
- return ($stash eq "Encode::XS" || $stash eq "Encode::Unicode");
+ my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
+ $obj->can("perlio_ok") and return $obj->perlio_ok() unless $@;
+ return 0; # safety net
}
sub define_encoding
$_[1] = '' if $chk;
return $octets;
};
- $Encode::Encoding{utf8} =
+ $Encode::Encoding{utf8} =
bless {Name => "utf8"} => "Encode::utf8";
}
}
-require Encode::Encoding;
-@Encode::XS::ISA = qw(Encode::Encoding);
-
-# This is very dodgy - PerlIO::encoding does "use Encode" and _BEFORE_ it gets a
-# chance to set its VERSION we potentially delete it from %INC so it will be re-loaded
-# NI-S
-eval {
- require PerlIO::encoding;
- unless (PerlIO::encoding->VERSION >= 0.02){
- delete $INC{"PerlIO/encoding.pm"};
- }
-};
-# warn $@ if $@;
-@Encode::XS::ISA = qw(Encode::Encoding);
-
1;
__END__
use Encode;
-
=head2 Table of Contents
-Encode consists of a collection of modules which details are too big
+Encode consists of a collection of modules whose details are too big
to fit in one document. This POD itself explains the top-level APIs
and general topics at a glance. For other topics and more details,
-see the PODs below;
+see the PODs below:
Name Description
--------------------------------------------------------
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).
-Traditionally computer data has been moved around in 8-bit chunks
+Traditionally, computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
-languages but also "binary" data being the machines representation of
+languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.
-When Perl is processing "binary data" the programmer wants Perl to
+When Perl is processing "binary data", the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
-byte has 256 possible values it easily fits in Perl's much larger
+byte has 256 possible values, it easily fits in Perl's much larger
"logical character".
=head2 TERMINOLOGY
=item *
I<octet>: 8 bits of data, with ordinal values 0..255
-(Term for bytes passed to or from a non-Perl context, e.g. disk file.)
+(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
=back
=item $octets = encode(ENCODING, $string[, CHECK])
-Encodes string from Perl's internal form into I<ENCODING> and returns
+Encodes a string from Perl's internal form into I<ENCODING> and returns
a sequence of octets. ENCODING can be either a canonical name or
-alias. For encoding names and aliases, see L</"Defining Aliases">.
-For CHECK see L</"Handling Malformed Data">.
+an alias. For encoding names and aliases, see L</"Defining Aliases">.
+For CHECK, see L</"Handling Malformed Data">.
-For example to convert (internally UTF-8 encoded) Unicode string to
+For example, to convert (internally UTF-8 encoded) Unicode string to
iso-8859-1 (also known as Latin1),
$octets = encode("iso-8859-1", $unicode);
=item $string = decode(ENCODING, $octets[, CHECK])
-Decode sequence of octets assumed to be in I<ENCODING> into Perl's
-internal form and returns the resulting string. as in encode(),
-ENCODING can be either a canonical name or alias. For encoding names
-and aliases, see L</"Defining Aliases">. For CHECK see
+Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
+internal form and returns the resulting string. As in encode(),
+ENCODING can be either a canonical name or an alias. For encoding names
+and aliases, see L</"Defining Aliases">. For CHECK, see
L</"Handling Malformed Data">.
-For example to convert ISO-8859-1 data to UTF-8:
+For example, to convert ISO-8859-1 data to UTF-8:
$utf8 = decode("iso-8859-1", $latin1);
=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
-Convert B<in-place> the data between two encodings.
-For example to convert ISO-8859-1 data to UTF-8:
+Converts B<in-place> data between two encodings.
+For example, to convert ISO-8859-1 data to UTF-8:
from_to($data, "iso-8859-1", "utf-8");
from_to($data, "utf-8", "iso-8859-1");
Note that because the conversion happens in place, the data to be
-converted cannot be a string constant, it must be a scalar variable.
+converted cannot be a string constant; it must be a scalar variable.
-from_to() return the length of the converted string on success, undef
+from_to() returns the length of the converted string on success, undef
otherwise.
=back
=head2 UTF-8 / utf8
-The Unicode consortium defines the UTF-8 standard as a way of encoding
-the entire Unicode repertoire as sequences of octets. This encoding is
-expected to become very widespread. Perl can use this form internally
-to represent strings, so conversions to and from this form are
-particularly efficient (as octets in memory do not have to change,
-just the meta-data that tells Perl how to treat them).
+The Unicode Consortium defines the UTF-8 transformation format as a
+way of encoding the entire Unicode repertoire as sequences of octets.
+This encoding is expected to become very widespread. Perl can use this
+form internally to represent strings, so conversions to and from this
+form are particularly efficient (as octets in memory do not have to
+change, just the meta-data that tells Perl how to treat them).
=over 4
=item $octets = encode_utf8($string);
-The characters that comprise string are encoded in Perl's superset of UTF-8
-and the resulting octets returned as a sequence of bytes. All possible
-characters have a UTF-8 representation so this function cannot fail.
+The characters that comprise $string are encoded in Perl's superset of
+UTF-8 and the resulting octets are returned as a sequence of bytes. All
+possible characters have a UTF-8 representation so this function cannot
+fail.
=item $string = decode_utf8($octets [, CHECK]);
The sequence of octets represented by $octets is decoded from UTF-8
into a sequence of logical characters. Not all sequences of octets
form valid UTF-8 encodings, so it is possible for this call to fail.
-For CHECK see L</"Handling Malformed Data">.
+For CHECK, see L</"Handling Malformed Data">.
=back
@all_encodings = Encode->encodings(":all");
-Or you can give the name of specific module.
+Or you can give the name of a specific module.
@with_jp = Encode->encodings("Encode::JP");
@ebcdic = Encode->encodings("EBCDIC");
-To find which encodings are supported by this package in details,
+To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.
=head2 Defining Aliases
-To add new alias to a given encoding, Use;
+To add a new alias to a given encoding, use:
use Encode;
use Encode::Alias;
Encode::resolve_alias("iso-8859-12") # false; nonexistent
Encode::resolve_alias($name) eq $name # true if $name is canonical
-This resolve_alias() does not need C<use Encode::Alias> and is
-exported via C<use encode qw(resolve_alias)>.
+resolve_alias() does not need C<use Encode::Alias>; it can be
+exported via C<use Encode qw(resolve_alias)>.
-See L<Encode::Alias> on details.
+See L<Encode::Alias> for details.
=head1 Encoding via PerlIO
-If your perl supports I<PerlIO>, you can use PerlIO layer to directly
-decode and encode via filehandle. The following two examples are
-totally identical by functionality.
+If your perl supports I<PerlIO>, you can use a PerlIO layer to decode
+and encode directly via a filehandle. The following two examples
+are totally identical in their functionality.
# via PerlIO
open my $in, "<:encoding(shiftjis)", $infile or die;
while(<>){ print; }
# via from_to
- open my $in, $infile or die;
- open my $out, $outfile or die;
+ open my $in, "<", $infile or die;
+ open my $out, ">", $outfile or die;
while(<>){
- from_to($_, "shiftjis", "euc", 1);
+ from_to($_, "shiftjis", "euc-jp", 1);
}
-Unfortunately, not all encodings are PerlIO-savvy. You can check if
-your encoding is supported by PerlIO by C<perlio_ok> method.
+Unfortunately, there may be encodings are PerlIO-savvy. You can check
+if your encoding is supported by PerlIO by calling the C<perlio_ok>
+method.
+
+ Encode::perlio_ok("hz"); # False
+ find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
+
+ use Encode qw(perlio_ok); # exported upon request
+ perlio_ok("euc-jp")
- Encode::perlio_ok("iso-20220jp"); # false
- find_encoding("iso-2022-jp")->perlio_ok; # false
- use Encode qw(perlio_ok); # exported upon request
- perlio_ok("euc-jp") # true if PerlIO is enabled
+Fortunately, all encodings that come with Encode core are PerlIO-savvy
+except for hz and ISO-2022-kr. See L<Encode::Encoding> for details.
-For gory details, see L<Encode::PerlIO>;
+For gory details, see L<Encode::PerlIO>.
=head1 Handling Malformed Data
=over 4
-THE I<CHECK> argument is used as follows. When you omit it, it is
-identical to I<CHECK> = 0.
+The I<CHECK> argument is used as follows. When you omit it,
+the behaviour is the same as if you had passed a value of 0 for
+I<CHECK>.
=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
-If I<CHECK> is 0, (en|de)code will put I<substitution character> in
-place of the malformed character. for UCM-based encodings,
-E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
-data is supposed to be UTF-8, an optional lexical warning (category
-utf8) is given.
+If I<CHECK> is 0, (en|de)code will put a I<substitution character>
+in place of a malformed character. For UCM-based encodings,
+E<lt>subcharE<gt> will be used. For Unicode, "\x{FFFD}" is used.
+If the data is supposed to be UTF-8, an optional lexical warning
+(category utf8) is given.
=item I<CHECK> = Encode::DIE_ON_ERROR (== 1)
-If I<CHECK> is 1, methods will die immediately with an error
-message. so when I<CHECK> is set, you should trap the fatal error
-with eval{} unless you really want to let it die on error.
+If I<CHECK> is 1, methods will die immediately with an error
+message. Therefore, when I<CHECK> is set to 1, you should trap the
+fatal error with eval{} unless you really want to let it die on error.
=item I<CHECK> = Encode::FB_QUIET
If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
-return processed part on error, with data passed via argument
-overwritten with unprocessed part. This is handy when have to
-repeatedly call because the source data is chopped in the middle for
-some reasons, such as fixed-width buffer. Here is a sample code that
-just does this.
+return the portion of the data that has been processed so far when
+an error occurs. The data argument will be overwritten with
+everything after that point (that is, the unprocessed part of data).
+This is handy when you have to call decode repeatedly in the case
+where your source data may contain partial multi-byte character
+sequences, for example because you are reading with a fixed-width
+buffer. Here is some sample code that does exactly this:
my $data = '';
while(defined(read $fh, $buffer, 256)){
- # buffer may end in partial character so we append
+ # buffer may end in a partial character so we append
$data .= $buffer;
$utf8 .= decode($encoding, $data, ENCODE::FB_QUIET);
- # $data now contains unprocessed partial character
+ # $data now contains the unprocessed partial character
}
=item I<CHECK> = Encode::FB_WARN
-This is the same as above, except it warns on error. Handy when you
-are debugging the mode above.
+This is the same as above, except that it warns on error. Handy when
+you are debugging the mode above.
=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
For encodings that are implemented by Encode::XS, CHECK ==
Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
-When you decode, '\xI<XX>' will be placed where I<XX> is the hex
-representation of the octet that could not be decoded to utf8. And
-when you encode, '\x{I<xxxx>}' will be placed where I<xxxx> is the
-Unicode ID of the character that cannot be found in the character
-repertoire of the encoding.
+When you decode, '\xI<XX>' will be inserted for a malformed character,
+where I<XX> is the hex representation of the octet that could not be
+decoded to utf8. And when you encode, '\x{I<xxxx>}' will be inserted,
+where I<xxxx> is the Unicode ID of the character that cannot be found
+in the character repertoire of the encoding.
=item The bitmask
-These modes are actually set via bitmask. here is how FB_XX are laid
-out. for FB_XX you can import via C<use Encode qw(:fallbacks)> for
-generic bitmask constants, you can import via
- C<use Encode qw(:fallback_all)>.
+These modes are actually set via a bitmask. Here is how the FB_XX
+constants are laid out. You can import the FB_XX constants via
+C<use Encode qw(:fallbacks)>; you can import the generic bitmask
+constants via C<use Encode qw(:fallback_all)>.
FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
DIE_ON_ERR 0x0001 X
LEAVE_SRC 0x0008
PERLQQ 0x0100 X
-=head2 Unemplemented fallback schemes
+=head2 Unimplemented fallback schemes
-In future you will be able to use a code reference to a callback
+In the future, you will be able to use a code reference to a callback
function for the value of I<CHECK> but its API is still undecided.
=head1 Defining Encodings
define_encoding($object, 'canonicalName' [, alias...]);
I<canonicalName> will be associated with I<$object>. The object
-should provide the interface described in L<Encode::Encoding>
+should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
-arguments are taken as aliases for I<$object> as for C<define_alias>.
+arguments are taken as aliases for I<$object>, as for C<define_alias>.
See L<Encode::Encoding> for more details.
=head1 Messing with Perl's Internals
The following API uses parts of Perl's internals in the current
-implementation. As such they are efficient, but may change.
+implementation. As such, they are efficient but may change.
=over 4
=item is_utf8(STRING [, CHECK])
-[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
+[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8. Returns true if successful, false otherwise.
=item _utf8_on(STRING)
-[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is
+[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
B<not> checked for being well-formed UTF-8. Do not use unless you
B<know> that the STRING is well-formed UTF-8. Returns the previous
-state of the UTF-8 flag (so please don't test the return value as
-I<not> success or failure), or C<undef> if STRING is not a string.
+state of the UTF-8 flag (so please don't treat the return value as
+indicating success or failure), or C<undef> if STRING is not a string.
=item _utf8_off(STRING)
-[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously.
-Returns the previous state of the UTF-8 flag (so please don't test the
-return value as I<not> success or failure), or C<undef> if STRING is
+[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
+Returns the previous state of the UTF-8 flag (so please don't treat the
+return value as indicating success or failure), or C<undef> if STRING is
not a string.
=back
=head1 MAINTAINER
This project was originated by Nick Ing-Simmons and later maintained
-by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for full list
+by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full list
of people involved. For any questions, use
E<lt>perl-unicode@perl.orgE<gt> so others can share.
/*
- $Id: Encode.xs,v 1.31 2002/04/20 23:43:47 dankogai Exp dankogai $
+ $Id: Encode.xs,v 1.33 2002/04/22 03:43:05 dankogai Exp $
*/
#define PERL_NO_GET_CONTEXT
#include "XSUB.h"
#define U8 U8
#include "encode.h"
+# define PERLIO_FILENAME "PerlIO/encoding.pm"
/* set 1 or more to profile. t/encoding.t dumps core because of
Perl_warner and PerlIO don't work well */
-#define ENCODE_XS_PROFILE 0
+#define ENCODE_XS_PROFILE 0
/* set 0 to disable floating point to calculate buffer size for
encode_method(). 1 is recommended. 2 restores NI-S original */
-#define ENCODE_XS_USEFP 1
+#define ENCODE_XS_USEFP 1
#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
Perl_croak(aTHX_ "panic_unimplemented"); \
}
case ENCODE_NOREP:
/* encoding */
- if (dir == enc->f_utf8) {
+ if (dir == enc->f_utf8) {
STRLEN clen;
UV ch =
- utf8n_to_uvuni(s+slen, (SvCUR(src)-slen),
+ utf8n_to_uvuni(s+slen, (SvCUR(src)-slen),
&clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY);
if (check & ENCODE_DIE_ON_ERR) {
Perl_croak(
- aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d",
+ aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d",
ch, enc->name[0], __LINE__);
}else{
if (check & ENCODE_RETURN_ON_ERR){
if (check & ENCODE_WARN_ON_ERR){
Perl_warner(
aTHX_ packWARN(WARN_UTF8),
- "\"\\N{U+%" UVxf "}\" does not map to %s",
+ "\"\\N{U+%" UVxf "}\" does not map to %s",
ch,enc->name[0]);
}
goto ENCODE_SET_SRC;
}else if (check & ENCODE_PERLQQ){
- SV* perlqq =
+ SV* perlqq =
sv_2mortal(newSVpvf("\\x{%04x}", ch));
sdone += slen + clen;
ddone += dlen + SvCUR(perlqq);
sv_catsv(dst, perlqq);
- } else {
+ } else {
/* fallback char */
sdone += slen + clen;
- ddone += dlen + enc->replen;
- sv_catpvn(dst, (char*)enc->rep, enc->replen);
+ ddone += dlen + enc->replen;
+ sv_catpvn(dst, (char*)enc->rep, enc->replen);
}
- }
+ }
}
/* decoding */
- else {
+ else {
if (check & ENCODE_DIE_ON_ERR){
Perl_croak(
aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)",
}
goto ENCODE_SET_SRC;
}else if (check & ENCODE_PERLQQ){
- SV* perlqq =
+ SV* perlqq =
sv_2mortal(newSVpvf("\\x%02X", s[slen]));
sdone += slen + 1;
ddone += dlen + SvCUR(perlqq);
sv_catsv(dst, perlqq);
} else {
sdone += slen + 1;
- ddone += dlen + strlen(FBCHAR_UTF8);
- sv_catpv(dst, FBCHAR_UTF8);
+ ddone += dlen + strlen(FBCHAR_UTF8);
+ sv_catpv(dst, FBCHAR_UTF8);
}
}
}
/* settle variables when fallback */
d = (U8 *)SvEND(dst);
- dlen = SvLEN(dst) - ddone - 1;
- s = (U8*)SvPVX(src) + sdone;
+ dlen = SvLEN(dst) - ddone - 1;
+ s = (U8*)SvPVX(src) + sdone;
slen = tlen - sdone;
break;
if (code && !(check & ENCODE_RETURN_ON_ERR)) {
return &PL_sv_undef;
}
-
+
SvCUR_set(dst, dlen+ddone);
SvPOK_only(dst);
-
+
#if ENCODE_XS_PROFILE
if (SvCUR(dst) > SvCUR(src)){
Perl_warn(aTHX_
(SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
}
#endif
-
+
ENCODE_END:
*SvEND(dst) = '\0';
return dst;
XSRETURN(1);
}
+void
+Method_needs_lines(obj)
+SV * obj
+CODE:
+{
+ encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
+ ST(0) = &PL_sv_no;
+ XSRETURN(1);
+}
+
+void
+Method_perlio_ok(obj)
+SV * obj
+CODE:
+{
+ encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
+ if (hv_exists(get_hv("INC", 0),
+ PERLIO_FILENAME, strlen(PERLIO_FILENAME)))
+ {
+ ST(0) = &PL_sv_yes;
+ }else{
+ ST(0) = &PL_sv_no;
+ }
+ XSRETURN(1);
+}
+
MODULE = Encode PACKAGE = Encode
PROTOTYPES: ENABLE
CODE:
{
SV * encoding = items == 2 ? ST(1) : Nullsv;
-
+
if (encoding)
RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
else {
/* Must do things the slow way */
U8 *dest;
/* We need a copy to pass to check() */
- U8 *src = (U8*)savepv((char *)s);
+ U8 *src = (U8*)savepv((char *)s);
U8 *send = s + len;
New(83, dest, len, U8); /* I think */
/* Note change to utf8.c variable naming, for variety */
while (ulen--) {
- if ((*s & 0xc0) != 0x80){
- goto failure;
+ if ((*s & 0xc0) != 0x80){
+ goto failure;
} else {
uv = (uv << 6) | (*s++ & 0x3f);
}
OUTPUT:
RETVAL
-int
+int
WARN_ON_ERR()
CODE:
RETVAL = ENCODE_WARN_ON_ERR;
}
}
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
=head1 DESCRIPTION
-To find how to use this module in detail, see L<Encode>.
+To find out how to use this module in detail, see L<Encode>.
=head1 Note on ISO-2022-JP(-1)?
code to decode to utf8 but not vice versa.
$utf8 = decode('iso-2022-jp-1', $stream);
+
+and
+
$utf8 = decode('iso-2022-jp', $stream);
-Yields the same result but
+yield the same result but
$with_0212 = encode('iso-2022-jp-1', $utf8);
$without_0212 = encode('iso-2022-jp', $utf8 );
-In the latter case, characters that map to 0212 are at first converted
-to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu') then
-fed to decoding engine. U+FFFD is not used to preserve text layout as
-much as possible.
+In the latter case, characters that map to 0212 are first converted
+to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu' or
+'geta mark') then fed to the decoding engine. U+FFFD is not used,
+in order to preserve text layout as much as possible.
=head1 BUGS
-ASCII part (0x00-0x7f) is preserved for all encodings, even though it
-conflicts with mappings by the Unicode Consortium. See
+The ASCII region (0x00-0x7f) is preserved for all encodings, even
+though this conflicts with mappings by the Unicode Consortium. See
L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
-to find why it is implemented that way.
+to find out why it is implemented that way.
=head1 SEE ALSO
die "Encode::KR not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
+use Encode::KR::2022_KR;
+
1;
__END__
qr/(?:x-)?windows-949$/i => '"cp949"'
qr/ks_c_5601-1987$/i => '"cp949"'
-ASCII part (0x00-0x7f) is preserved for all encodings, even though it
-conflicts with mappings by the Unicode Consortium. See
+The ASCII region (0x00-0x7f) is preserved for all encodings, even
+though this conflicts with mappings by the Unicode Consortium. See
L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
-to find why it is implemented that way.
+to find out why it is implemented that way.
=head1 SEE ALSO
INSTALLATION
-To install this module type the following:
+To install this module, type the following:
perl Makefile.PL
make
perl Makefile.PL MORE_SCRIPTS
make && make test && make install
-by default, only enc2xs and piconv are installed.
+By default, only enc2xs and piconv are installed.
To install *.ucm files also, say
perl Makefile.PL INSTALL_UCM
make && make test && make install
-by default, *.ucm are not installed.
+By default, *.ucm are not installed.
DEPENDENCIES
MAINTAINER
This project was originated by Nick Ing-Simmons and later maintained by
-Dan Kogai <dankogai@dan.co.jp>. See AUTHORS for full list of people
+Dan Kogai <dankogai@dan.co.jp>. See AUTHORS for the full list of people
involved.
QUESTIONS?
-If you have any questions "perldoc Encode" does not answer, please
+If you have any questions which "perldoc Encode" does not answer, please
feel free to ask at perl-unicode@perl.org.
package Encode::Symbol;
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
=head1 DESCRIPTION
-To find how to use this module in detail, see L<Encode>.
+To find out how to use this module in detail, see L<Encode>.
=head1 SEE ALSO
die "Encode::TW not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
=head1 DESCRIPTION
-This module implements Taiwan-based Chinese charset encodings.
+This module implements tradition Chinese charset encodings as used
+in Taiwan and Hong Kong.
Encodings supported are as follows.
Canonical Alias Description
= Big5 + Microsoft vendor mappings
--------------------------------------------------------------------
-To find how to use this module in detail, see L<Encode>.
+To find out how to use this module in detail, see L<Encode>.
=head1 NOTES
Since the original C<big5> encoding (1984) is not supported anywhere
(glibc and DOS-based systems uses C<big5> to mean C<big5-eten>; Microsoft
-uses C<big5> to mean C<cp950>), a concious decision was made to alias
+uses C<big5> to mean C<cp950>), a conscious decision was made to alias
C<big5> to C<big5-eten>, which is the de facto superset of the original
big5.
The C<CNS11643> encoding files are not complete. For common C<CNS11643>
manipulation, please use C<EUC-TW> in L<Encode::HanExtra>, which contains
-plane 1-7.
+planes 1-7.
-ASCII part (0x00-0x7f) is preserved for all encodings, even though it
-conflicts with mappings by the Unicode Consortium. See
+The ASCII region (0x00-0x7f) is preserved for all encodings, even
+though this conflicts with mappings by the Unicode Consortium. See
L<http://www.debian.or.jp/~kubota/unicode-symbols.html.en>
-to find why it is implemented that way.
+to find out why it is implemented that way.
=head1 SEE ALSO
use strict;
use warnings;
-our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.34 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
return bless {%$self},ref($self);
}
+sub needs_lines { 0 };
+
+sub perlio_ok {
+ exists $INC{"PerlIO/encoding.pm"} or return 0;
+ return 1;
+}
+
#
-# three implementation of (en|de)code exist. XS version is the fastest.
-# *_modern use # an array and *_classic stick with substr. *_classic is
-# much slower but more memory conservative. *_xs is default.
+# three implementations of (en|de)code exist. The XS version is the
+# fastest. *_modern uses an array and *_classic sticks with substr.
+# *_classic is much slower but more memory conservative.
+# *_xs is the default.
sub set_transcoder{
no warnings qw(redefine);
=head1 NAME
-Encode::Unicode -- Various Unicode Transform Format
+Encode::Unicode -- Various Unicode Transformation Formats
=cut
UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf
UTF-32 4 Y - is bogus As is BE/LE
- UTF-32BE 4 N - bogus As is 0x0010abcd
- UTF-32LE 4 N - bogus As is 0xcdab1000
+ UTF-32BE 4 N - bogus As is 0x0001abcd
+ UTF-32LE 4 N - bogus As is 0xcdab0100
UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
---------------+-----------------+------------------------------
=head1 Size, Endianness, and BOM
-You can categorize these CES by 3 criteria; Size of each character,
-Endianness, and Byte Order Mark.
+You can categorize these CES by 3 criteria: size of each character,
+endianness, and Byte Order Mark.
-=head2 by Size
+=head2 by size
UCS-2 is a fixed-length encoding with each character taking 16 bits.
-It B<does not> support I<Surrogate Pairs>. When a surrogate pair is
-encountered during decode(), its place is filled with \xFFFD without
-I<CHECK> or croaks if I<CHECK>. When a character whose ord value is
-larger than 0xFFFF is encountered, it uses 0xFFFD without I<CHECK> or
-croaks if <CHECK>.
-
-UTF-16 is almost the same as UCS-2 but it supports I<Surrogate Pairs>.
+It B<does not> support I<surrogate pairs>. When a surrogate pair
+is encountered during decode(), its place is filled with \x{FFFD}
+if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a
+character whose ord value is larger than 0xFFFF is encountered,
+its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
+croaks if I<CHECK> is 1.
+
+UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
-following low surrogate (0xDC00-0xDFFF), C<desurrogate>s them to form a
-character. Bogus surrogates result in death. When \x{10000} or above
-is encountered during encode(), it C<ensurrogate>s them and pushes the
-surrogate pair to the output stream.
+following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
+form a character. Bogus surrogates result in death. When \x{10000}
+or above is encountered during encode(), it C<ensurrogate>s them and
+pushes the surrogate pair to the output stream.
UTF-32 is a fixed-length encoding with each character taking 32 bits.
-Since it is 32-bit there is no need for I<Surrogate Pairs>.
+Since it is 32-bit, there is no need for I<surrogate pairs>.
-=head2 by Endianness
+=head2 by endianness
-First (and now failed) goal of Unicode was to map all character
-repertories into a fixed-length integer so programmers are happy.
-Since each character is either I<short> or I<long> in C, you have to
-put endianness of each platform when you pass data to one another.
+The first (and now failed) goal of Unicode was to map all character
+repertoires into a fixed-length integer so that programmers are happy.
+Since each character is either a I<short> or I<long> in C, you have to
+pay attention to the endianness of each platform when you pass data
+to one another.
Anything marked as BE is Big Endian (or network byte order) and LE is
-Little Endian (aka VAX byte order). For anything without, a character
-called Byte Order Mark (BOM) is prepended to the head of string.
+Little Endian (aka VAX byte order). For anything not marked either
+BE or LE, a character called Byte Order Mark (BOM) indicating the
+endianness is prepended to the string.
=over 4
=back
-This modules handles BOM as follows.
+This modules handles the BOM as follows.
=over 4
=item *
When BE or LE is explicitly stated as the name of encoding, BOM is
-simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE).
+simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
=item *
-When BE or LE is omitted during decode(), it checks if BOM is in the
-beginning of the string and if found endianness is set to what BOM
-says. If not found, dies.
+When BE or LE is omitted during decode(), it checks if BOM is at the
+beginning of the string; if one is found, the endianness is set to
+what the BOM says. If no BOM is found, the routine dies.
=item *
When BE or LE is omitted during encode(), it returns a BE-encoded
string with BOM prepended. So when you want to encode a whole text
-file, make sure you encode() by whole text, not line by line or each
-line, not file, is prepended with BOMs.
+file, make sure you encode() the whole text at once, not line by line
+or each line, not file, will have a BOM prepended.
=item *
-C<UCS-2> is an exception. Unlike others this is an alias of UCS-2BE.
+C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE.
UCS-2 is already registered by IANA and others that way.
=back
Vogons here ;) Or, comparing Encode to Babel Fish is completely
appropriate -- if you can only stick this into your ear :)
-Surrogate pairs were born when Unicode Consortium finally
+Surrogate pairs were born when the Unicode Consortium finally
admitted that 16 bits were not big enough to hold all the world's
-character repertoire. But they have already made UCS-2 16-bit. What
+character repertoires. But they already made UCS-2 16-bit. What
do we do?
-Back then 0xD800-0xDFFF was not allocated. Let's split them half and
-use the first half to represent C<upper half of a character> and the
-latter C<lower half of a character>. That way you can represent 1024
-* 1024 = 1048576 more characters. Now we can store character ranges
-up to \x{10ffff} even with 16-bit encodings. This pair of
-half-character is now called a I<Surrogate Pair> and UTF-16 is the
-name of the encoding that embraces them.
+Back then, the range 0xD800-0xDFFF was not allocated. Let's split
+that range in half and use the first half to represent the C<upper
+half of a character> and the second half to represent the C<lower
+half of a character>. That way, you can represent 1024 * 1024 =
+1048576 more characters. Now we can store character ranges up to
+\x{10ffff} even with 16-bit encodings. This pair of half-character is
+now called a I<surrogate pair> and UTF-16 is the name of the encoding
+that embraces them.
Here is a formula to ensurrogate a Unicode character \x{10000} and
above;
every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
(*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
- integer support! (**)
-
- (**) Is anything beyond \x{11_0000} still Unicode :?
+ integer support!
=head1 SEE ALSO
/*
- $Id: Unicode.xs,v 1.3 2002/04/20 23:43:47 dankogai Exp dankogai $
+ $Id: Unicode.xs,v 1.3 2002/04/20 23:43:47 dankogai Exp $
*/
#define PERL_NO_GET_CONTEXT
use strict;
use Getopt::Std;
my @orig_ARGV = @ARGV;
-our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
=head1 DESCRIPTION
F<enc2xs> builds a Perl extension for use by Encode from either
-Unicode Character Mapping files (.ucm) or Tcl Encoding Files
-(.enc) Besides internally used during the build process of Encode
-module, you can use F<enc2xs> to add your own encoding to perl. No
-knowledge on XS is necessary.
+Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
+Besides being used internally during the build process of the Encode
+module, you can use F<enc2xs> to add your own encoding to perl.
+No knowledge of XS is necessary.
=head1 Quick Guide
-If what you want to know as little about Perl possible but needs to
+If you want to know as little about Perl as possible but need to
add a new encoding, just read this chapter and forget the rest.
=over 4
=item 0.
-Have a .ucm file ready. You can get it from somewhere or you can
-write your own from scratch or you can grab one from Encode
-distribution and customize. For UCM format, see the next Chapter.
-In the example below, I'll call my theoretical encoding myascii,
-defined inI<my.ucm>. C<$> is a shell prompt.
+Have a .ucm file ready. You can get it from somewhere or you can write
+your own from scratch or you can grab one from the Encode distribution
+and customize it. For the UCM format, see the next Chapter. In the
+example below, I'll call my theoretical encoding myascii, defined
+in I<my.ucm>. C<$> is a shell prompt.
$ ls -F
my.ucm
$ ls -F
Makefile.PL My.pm my.ucm t/
-The following files are created.
+The following files were created.
- Makefle.PL - MakeMaker script
- My.pm - Encode Submodule
- t/My.t - test file
+ Makefile.PL - MakeMaker script
+ My.pm - Encode submodule
+ t/My.t - test file
+
+=over 4
=item 1.1.
$ mv *.ucm Encode
$ enc2xs -M My Encode/*ucm
+=back
+
=item 2.
Edit the files generated. You don't have to if you have no time AND no
intention to give it to someone else. But it is a good idea to edit
-pod and add more tests.
+the pod and to add more tests.
=item 3.
-Now issue a command all Perl Mongers love;
+Now issue a command all Perl Mongers love:
$ perl5.7.3 Makefile.PL
Writing Makefile for Encode::My
chmod 644 blib/arch/auto/Encode/My/My.bs
$
-The time it takes varies how fast your machine is and how large your
-encoding is. Unless you are working on something big like euc-tw, it
-won't take too long.
+The time it takes varies depending on how fast your machine is and
+how large your encoding is. Unless you are working on something big
+like euc-tw, it won't take too long.
=item 5.
=item 7.
-If you want to add your encoding to Encode demand-loading list
+If you want to add your encoding to Encode's demand-loading list
(so you don't have to "use Encode::YourEncoding"), run
enc2xs -C
=head1 The Unicode Character Map
-Encode uses The Unicode Character Map (UCM) for source character
-mappings. This format is used by ICU package of IBM and adopted by
-Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map
-and far more user-friendly, This is the recommended formet for
-Encode now.
+Encode uses the Unicode Character Map (UCM) format for source character
+mappings. This format is used by IBM's ICU package and was adopted
+by Nick Ing-Simmons for use with the Encode module. Since UCM is
+more flexible than Tcl's Encoding Map and far more user-friendly,
+this is the recommended formet for Encode now.
-UCM file looks like this.
+A UCM file looks like this.
#
# Comments
=item *
-Anything that follows C<#> is treated as comments.
+Anything that follows C<#> is treated as a comment.
=item *
-The header section continues until CHARMAP. This section Has a form of
-I<E<lt>keywordE<gt> value>, one at a line. For a value, strings must
-be quoted. Barewords are treated as numbers. I<\xXX> represents a
-byte.
+The header section continues until a line containing the word
+CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
+pair per line. Strings used as values must be quoted. Barewords are
+treated as numbers. I<\xXX> represents a byte.
Most of the keywords are self-explanatory. I<subchar> means
substitution character, not subcharacter. When you decode a Unicode
sequence to this encoding but no matching character is found, the byte
sequence defined here will be used. For most cases, the value here is
-\x3F, in ASCII this is a question mark.
+\x3F; in ASCII, this is a question mark.
=item *
CHARMAP starts the character map section. Each line has a form as
-follows;
+follows:
<UXXXX> \xXX.. |0 # comment
^ ^ ^
| +-------- Encoded byte sequence
+-------------- Unicode Character ID in hex
-The format is roughly the same as a header section except for fallback
-flag. It is | followed by 0..3. And their meaning as follows
+The format is roughly the same as a header section except for the
+fallback flag: | followed by 0..3. The meaning of the possible
+values is as follows:
-=over 2
+=over 4
=item |0
-Round trip safe. A character decoded to Unicode encodes back to the
-same byte sequence. most character belong to this.
+Round trip safe. A character decoded to Unicode encodes back to the
+same byte sequence. Most characters have this flag.
=item |1
Fallback for unicode -> encoding. When seen, enc2xs adds this
-character for encode map only
+character for the encode map only.
=item |2
=item |3
Fallback for encoding -> unicode. When seen, enc2xs adds this
-character for decode map only
+character for the decode map only.
=back
=back
When you are manually creating a UCM file, you should copy ascii.ucm
-or existing encoding which is close to yours than write your own from
-scratch.
+or an existing encoding which is close to yours, rather than write
+your own from scratch.
When you do so, make sure you leave at least B<U0000> to B<U0020> as
-is, unless your environment is on EBCDIC.
+is, unless your environment is EBCDIC.
B<CAVEAT>: not all features in UCM are implemented. For example,
icu:state is not used. Because of that, you need to write a perl
-module if you want to support algorithmical encodings, notablly
-ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
+module if you want to support algorithmical encodings, notably
+the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
=head2 Coping with duplicate mappings
When you create a map, you SHOULD make your mappings round-trip safe.
That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
$data> stands for all characters that are marked as C<|0>. Here is
-how to make sure;
+how to make sure:
-=over 2
+=over 4
=item *
=item *
-And make sure '|1' or '|3' FOLLOWS '|0' entry.
+And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
=back
(\xF9\xF9 => U2550 is now overwritten!)
The Encode package comes with F<ucmlint>, a crude but sufficient
-utility to check the integrity of ucm file. Check under Encode/bin
-directory for this.
+utility to check the integrity of a UCM file. Check under the
+Encode/bin directory for this.
=head1 Bookmarks
+=over 4
+
+=item *
+
ICU Home Page
L<http://oss.software.ibm.com/icu/>
+=item *
+
ICU Character Mapping Tables
L<http://oss.software.ibm.com/icu/charset/>
+=item *
+
ICU:Conversion Data
L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
+=back
+
=head1 SEE ALSO
L<Encode>,
#!./perl
-# $Id: piconv,v 1.23 2002/04/19 05:36:43 dankogai Exp $
+# $Id: piconv,v 1.24 2002/04/22 02:45:50 dankogai Exp $
#
use 5.7.3;
use strict;
=head1 DESCRIPTION
-B<piconv> is perl version of F<iconv>, a character encoding converter
-widely available for various Unixen today. This script was primarily
-a technology demonstrator for Perl 5.8.0, you can use piconv in the
-place of iconv for virtually any cases.
+B<piconv> is perl version of B<iconv>, a character encoding converter
+widely available for various Unixen today. This script was primarily
+a technology demonstrator for Perl 5.8.0, but you can use piconv in the
+place of iconv for virtually any case.
-piconv converts character encoding of either STDIN or files specified
-in the argument and prints out to STDOUT.
+piconv converts the character encoding of either STDIN or files
+specified in the argument and prints out to STDOUT.
-Here are list of options.
+Here is the list of options.
=over 4
=item -f from_encoding
-Specifies the encoding you are converting from. Unlike F<iconv>,
-this option can be omitted. In such cases the current locale is used.
+Specifies the encoding you are converting from. Unlike B<iconv>,
+this option can be omitted. In such cases, the current locale is used.
=item -t to_encoding
-Specifies the encoding you are converting to. Unlike F<iconv>,
-this option can be omitted. In such cases the current locale is used.
+Specifies the encoding you are converting to. Unlike B<iconv>,
+this option can be omitted. In such cases, the current locale is used.
-Therefore when both -f and -t are omitted, F<piconv> just acts like F<cat>.
+Therefore, when both -f and -t are omitted, B<piconv> just acts
+like B<cat>.
=item -s I<string>
-uses I<string> instead of file for the source of text. Same as F<iconv>.
+uses I<string> instead of file for the source of text. Same as B<iconv>.
=item -l
Lists all available encodings, one per line, in case-insensitive
-order. Note that only the canonical names are listed, many aliases
+order. Note that only the canonical names are listed; many aliases
exist. For example, the names are case-insensitive, and many standard
-and common aliases work, like "latin1" for "ISO 8859-1", or "ibm850"
+and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850"
instead of "cp850", or "winlatin1" for "cp1252". See L<Encode::Supported>
-for the full discussion.
+for a full discussion.
=item -C I<N>
=item -S scheme
Selects which scheme is to be used for conversion. Available schemes
-are as follows;
+are as follows:
=over 4
=back
-Like I<-D> option, this is also for Encode hackers.
+Like the I<-D> option, this is also for Encode hackers.
=back
#!/usr/local/bin/perl
#
-# $Id: ucmlint,v 0.1 2002/04/09 20:04:30 dankogai Exp $
+# $Id: ucmlint,v 0.2 2002/04/22 02:45:50 dankogai Exp $
#
use strict;
-our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Getopt::Std;
our %Opt;
__END__
-UCM file looks like this.
+A UCM file looks like this.
#
# Comments
package encoding;
-our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.33 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use strict;
}
}
-our $HAS_PERLIO = exists $INC{"PerlIO/encoding.pm"};
-$HAS_PERLIO or binmode(STDIN);
+our $HAS_PERLIO = 0;
+eval { require PerlIO::encoding };
+unless ($@){
+ $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
+}
sub import {
my $class = shift;
require Carp;
Carp::croak "Unknown encoding for $h, '$arg{$h}'";
}
- eval qq{ binmode($h, ":encoding($arg{$h})") };
+ eval { binmode($h, ":encoding($arg{$h})") };
}else{
unless (exists $arg{$h}){
- eval qq{ binmode($h, ":encoding($name)") };
+ eval {
+ no warnings 'uninitialized';
+ binmode($h, ":encoding($name)");
+ };
}
}
if ($@){
=head1 NAME
-encoding - allows you to write your script in non-ascii or non-utf8
+encoding - allows you to write your script in non-ascii or non-utf8
=head1 SYNOPSIS
# or you can even do this if your shell supports your native encoding
perl -Mencoding=latin2 -e '...' # Feeling centrally European?
- perl -Mencoding=euc-ko -e '...'
+ perl -Mencoding=euc-kr -e '...' # Or Korean?
# or from the shebang line
#!/your/path/to/perl -Mencoding="8859-6" # Arabian Nights
- #!/your/path/to/perl -Mencoding=euc-tw
+ #!/your/path/to/perl -Mencoding=big5 # Taiwanese
# more control
Let's start with a bit of history: Perl 5.6.0 introduced Unicode
support. You could apply C<substr()> and regexes even to complex CJK
characters -- so long as the script was written in UTF-8. But back
-then text editors that supported UTF-8 were still rare and many users
-rather chose to write scripts in legacy encodings, given up whole new
-feature of Perl 5.6.
+then, text editors that supported UTF-8 were still rare and many users
+instead chose to write scripts in legacy encodings, giving up a whole
+new feature of Perl 5.6.
-Rewind to the future: starting from perl 5.8.0 with B<encoding>
+Rewind to the future: starting from perl 5.8.0 with the B<encoding>
pragma, you can write your script in any encoding you like (so long
as the C<Encode> module supports it) and still enjoy Unicode support.
-You can write a code in EUC-JP as follows:
+You can write code in EUC-JP as follows:
my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
#<-char-><-char-> # 4 octets
Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
not "\x{99F1}\x{99DD} is the symbol of perl.\n".
-You can override this by giving extra arguments, see below.
+You can override this by giving extra arguments; see below.
=head1 USAGE
=item use encoding [I<ENCNAME>] ;
-Sets the script encoding to I<ENCNAME> and filehandle disciplines of
-STDIN, STDOUT are set to ":encoding(I<ENCNAME>)". Note STDERR will
-not be changed.
+Sets the script encoding to I<ENCNAME>. Filehandle disciplines of
+STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR
+will not be changed.
If no encoding is specified, the environment variable L<PERL_ENCODING>
is consulted. If no encoding can be found, the error C<Unknown encoding
=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
-You can also individually set encodings of STDIN and STDOUT via
+You can also individually set encodings of STDIN and STDOUT via the
C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
completely off.
=item no encoding;
-Unsets the script encoding and the disciplines of STDIN, STDOUT are
+Unsets the script encoding. The disciplines of STDIN, STDOUT are
reset to ":raw" (the default unprocessed raw stream of bytes).
=back
The pragma is a per script, not a per block lexical. Only the last
C<use encoding> or C<no encoding> matters, and it affects B<the whole script>.
-However, <no encoding> pragma is supported and C<use encoding> can
+However, the <no encoding> pragma is supported and C<use encoding> can
appear as many times as you want in a given script. The multiple use
of this pragma is discouraged.
gets UTF-8 encoded.
After all, the best thing about this pragma is that you don't have to
-resort to \x... just to spell your name in native a encoding. So feel
-free to put your strings in your encoding in quotes and regexes.
+resort to \x{....} just to spell your name in a native encoding.
+So feel free to put your strings in your encoding in quotes and
+regexes.
=head1 Non-ASCII Identifiers and Filter option
is a single Han ideograph) work, you still need to write your script
in UTF-8 or use a source filter.
-In other words, the same restriction as Jperl applies.
+In other words, the same restriction as with Jperl applies.
-If you dare to experiment, however, you can try Filter option.
+If you dare to experiment, however, you can try the Filter option.
=over 4
=item use encoding I<ENCNAME> Filter=E<gt>1;
-This turns encoding pragma into source filter. While the default
+This turns the encoding pragma into a source filter. While the default
approach just decodes interpolated literals (in qq() and qr()), this
-will apply source filter to entire source code. In this case, STDIN
-and STDOUT remain untouched.
+will apply a source filter to the entire source code. In this case,
+STDIN and STDOUT remain untouched.
=back
What does this mean? Your source code behaves as if it is written in
-UTF-8. So even if your editor only supports Shift_JIS, for example.
-You can still try examples in Chapter 15 of C<Programming Perl, 3rd
-Ed.> For instance, you can use UTF-8 identifiers.
+UTF-8. So even if your editor only supports Shift_JIS, for example,
+you can still try examples in Chapter 15 of C<Programming Perl, 3rd
+Ed.>. For instance, you can use UTF-8 identifiers.
This option is significantly slower and (as of this writing) non-ASCII
identifiers are not very stable WITHOUT this option and with the
use encoding "iso 8859-7";
- # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode.
+ # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
$a = "\xDF";
$b = "\x{100}";
print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
# ... but pack/unpack C are not affected, in case you still
- # want back to your native encoding
+ # want to go back to your native encoding
print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
=head1 KNOWN PROBLEMS
-For native multibyte encodings (either fixed or variable length)
+For native multibyte encodings (either fixed or variable length),
the current implementation of the regular expressions may introduce
-recoding errors for longer regular expression literals than 127 bytes.
+recoding errors for regular expression literals longer than 127 bytes.
The encoding pragma is not supported on EBCDIC platforms.
-(Porters wanted.)
+(Porters who are willing and able to remove this limitation are
+welcome.)
=head1 SEE ALSO
package Encode::Alias;
use strict;
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.28 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.29 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
require Exporter;
define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
-In this case if I<ENCODING> is not a reference it is C<eval>-ed to
-allow C<$1> etc. to be substituted. The example is one way to alias
-names as used in X11 fonts to the MIME names for the iso-8859-*
-family. Note the double quote inside the single quote.
+In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
+in order to allow C<$1> etc. to be substituted. The example is one
+way to alias names as used in X11 fonts to the MIME names for the
+iso-8859-* family. Note the double quotes inside the single quotes.
If you are using a regex here, you have to use the quotes as shown or
it won't work. Also note that regex handling is tricky even for the
define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , '');
-
-In this case C<$_> will be set to the name that is being looked up and
+In this case, C<$_> will be set to the name that is being looked up and
I<ENCODING> is passed to the sub as its first argument. The example
is another way to alias names as used in X11 fonts to the MIME names
for the iso-8859-* family.
=back
-=head2 Alias overloading
+=head2 Alias overloading
You can override predefined aliases by simply applying define_alias().
-New alias is always evaluated first and when neccessary define_alias()
-flushes internal cache to make new definition available.
+The new alias is always evaluated first, and when neccessary,
+define_alias() flushes the internal cache to make the new definition
+available.
- # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
+ # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
# superset of SHIFT_JIS
define_alias( qr/shift.*jis$/i => '"cp932"' );
define_alias( qr/sjis$/i => '"cp932"' );
-If you want to zap all predefined aliases, you can
+If you want to zap all predefined aliases, you can use
Encode::Alias->undef_aliases;
Encode::Alias->init_aliases;
-gets factory setting back.
-
+gets the factory settings back.
=head1 SEE ALSO
use strict;
use vars qw($VERSION);
-$VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+$VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode ();
use Encode::CN;
use base 'Encode::Encoding';
-# HZ is but escaped GB, so we implement it with the
-# GB2312(raw) encoding here. Cf. RFC 1842 & 1843.
+# HZ is only escaped GB, so we implement it with the
+# GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
my $canon = 'hz';
my $obj = bless {name => $canon}, __PACKAGE__;
$obj->Define($canon);
+sub needs_lines { 1 }
+
+sub perlio_ok {
+ # exists $INC{"PerlIO/encoding.pm"} or return 0;
+ # PerlIO::encoding->VERSION >= 0.03 and return 1;
+ return 0; # for the time being
+}
+
sub decode
{
my ($obj,$str,$chk) = @_;
# Demand-load module list
#
package Encode::Config;
-our $VERSION = do { my @r = (q$Revision: 1.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use strict;
our %ExtModule =
(
# Encode::Byte
- #iso-8859-1 is on Encode.pm itself
+ #iso-8859-1 is in Encode.pm itself
'iso-8859-2' => 'Encode::Byte',
'iso-8859-3' => 'Encode::Byte',
'iso-8859-4' => 'Encode::Byte',
#
-# $Id: Encoder.pm,v 0.4 2002/04/12 20:23:05 dankogai Exp $
+# $Id: Encoder.pm,v 0.5 2002/04/22 02:45:50 dankogai Exp $
#
package Encode::Encoder;
use strict;
use warnings;
-our $VERSION = do { my @r = (q$Revision: 0.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 0.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
require Exporter;
our @ISA = qw(Exporter);
=head1 ABSTRACT
-B<Encode::Encoder> allows you to use Encode via OOP style. This is
-not only more intuitive than functional approach, but also handier
-when you want to stack encodings. Suppose you want your UTF-8 string
-converted to Latin1 then Base64, you can simply say
+B<Encode::Encoder> allows you to use Encode in an object-oriented
+style. This is not only more intuitive than a functional approach,
+but also handier when you want to stack encodings. Suppose you want
+your UTF-8 string converted to Latin1 then Base64: you can simply say
my $base64 = encoder($utf8)->latin1->base64;
my $latin1 = encode("latin1", $utf8);
my $base64 = encode_base64($utf8);
-or lazier and convolted
+or the lazier and more convoluted
my $base64 = encode_base64(encode("latin1", $utf8));
=item *
-There are at least two instance variable stored in hash reference,
+There are at least two instance variables stored in a hash reference,
{data} and {encoding}.
=item *
-When there is no method, it takes the method name as the name of
-encoding and encode instance I<data> with I<encoding>. If successful,
-instance I<encoding> is set accordingly.
+When there is no method, it takes the method name as the name of the
+encoding and encodes the instance I<data> with I<encoding>. If successful,
+the instance I<encoding> is set accordingly.
=item *
=head2 Predefined Methods
-This module predefines the methods below;
+This module predefines the methods below:
=over 4
=item $e = Encode::Encoder-E<gt>new([$data, $encoding]);
returns an encoder object. Its data is initialized with $data if
-there, and its encoding is set to $encoding if there.
+present, and its encoding is set to $encoding if present.
When $encoding is omitted, it defaults to utf8 if $data is already in
utf8 or "" (empty string) otherwise.
=item $e-E<gt>data([$data])
-when $data is present, sets instance data to $data and returns the
-object itself. otherwise the current instance data is returned.
+When $data is present, sets the instance data to $data and returns the
+object itself. Otherwise, the current instance data is returned.
=item $e-E<gt>encoding([$encoding])
-when $encoding is present, sets instance encoding to $encoding and
-returns the object itself. otherwise the current instance encoding is
+When $encoding is present, sets the instance encoding to $encoding and
+returns the object itself. Otherwise, the current instance encoding is
returned.
=item $e-E<gt>bytes([$encoding])
-decodes instance data from $encoding, or instance encoding if omitted.
-when the conversion is successful, the enstance encoding will be set
-to "" .
+decodes instance data from $encoding, or the instance encoding if
+omitted. If the conversion is successful, the instance encoding
+will be set to "".
The name I<bytes> was deliberately picked to avoid namespace tainting
-- this module may be used as a base class so method names that appear
=head2 Example: base64 transcoder
-This module is desined to work with L<Encode::Encoding>.
-To make the Base64 transcorder example above really work, you should
-write a module like this.
+This module is designed to work with L<Encode::Encoding>.
+To make the Base64 transcoder example above really work, you could
+write a module like this:
package Encode::Base64;
use base 'Encode::Encoding';
1;
__END__
-And your caller module should be like this;
+And your caller module would be something like this:
use Encode::Encoder;
use Encode::Base64;
encoder($data)->iso_8859_1->base64;
encoder($base64)->bytes('base64')->latin1;
-=head2 operator overloading
+=head2 Operator Overloading
This module overloads two operators, stringify ("") and numify (0+).
-Stringify dumps the data therein.
+Stringify dumps the data inside the object.
-Numify returns the number of bytes therein.
+Numify returns the number of bytes in the instance data.
They come in handy when you want to print or find the size of data.
=head1 SEE ALSO
-L<Encode>
+L<Encode>,
L<Encode::Encoding>
=cut
package Encode::Encoding;
# Base class for classes which implement encodings
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.27 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
sub Define
{
sub new_sequence { return $_[0] }
+sub perlio_ok { 0 }
+
sub needs_lines { 0 }
sub DESTROY {}
not actually loaded the encoding in question. This is because the
current "loading" process is all Perl and a bit slow.
-Once an encoding is loaded then value of the hash is object which
+Once an encoding is loaded, the value of the hash is the object which
implements the encoding. The object should provide the following
interface:
=item -E<gt>name
-Should return the string representing the canonical name of the encoding.
+MUST return the string representing the canonical name of the encoding.
=item -E<gt>new_sequence
This is a placeholder for encodings with state. It should return an
-object which implements this interface, all current implementations
+object which implements this interface. All current implementations
return the original object.
=item -E<gt>encode($string,$check)
-Should return the octet sequence representing I<$string>. If I<$check>
-is true it should modify I<$string> in place to remove the converted
-part (i.e. the whole string unless there is an error). If an error
-occurs it should return the octet sequence for the fragment of string
-that has been converted, and modify $string in-place to remove the
-converted part leaving it starting with the problem fragment.
+MUST return the octet sequence representing I<$string>.
+
+=over 2
+
+=item *
+
+If I<$check> is true, it SHOULD modify I<$string> in place to remove
+the converted part (i.e. the whole string unless there is an error).
+If perlio_ok() is true, SHOULD becomes MUST.
+
+=item *
+
+If an error occurs, it SHOULD return the octet sequence for the
+fragment of string that has been converted and modify $string in-place
+to remove the converted part leaving it starting with the problem
+fragment. If perlio_ok() is true, SHOULD becomes MUST.
+
+=item *
-If check is is false then C<encode> should make a "best effort" to
-convert the string - for example by using a replacement character.
+If I<$check> is is false then C<encode> MUST make a "best effort" to
+convert the string - for example, by using a replacement character.
+
+=back
=item -E<gt>decode($octets,$check)
-Should return the string that I<$octets> represents. If I<$check> is
-true it should modify I<$octets> in place to remove the converted part
-(i.e. the whole sequence unless there is an error). If an error
-occurs it should return the fragment of string that has been
-converted, and modify $octets in-place to remove the converted part
-leaving it starting with the problem fragment.
+MUST return the string that I<$octets> represents.
+
+=over 2
+
+=item *
+
+If I<$check> is true, it SHOULD modify I<$octets> in place to remove
+the converted part (i.e. the whole sequence unless there is an
+error). If perlio_ok() is true, SHOULD becomes MUST.
+
+=item *
-If check is is false then C<decode> should make a "best effort" to
+If an error occurs, it SHOULD return the fragment of string that has
+been converted and modify $octets in-place to remove the converted
+part leaving it starting with the problem fragment. If perlio_ok() is
+true, SHOULD becomes MUST.
+
+=item *
+
+If I<$check> is false then C<decode> should make a "best effort" to
convert the string - for example by using Unicode's "\x{FFFD}" as a
replacement character.
=back
-It should be noted that the check behaviour is different from the
+=item -E<gt>perlio_ok()
+
+If you want your encoding to work with PerlIO, you MUST define this
+method so that it returns 1 when PerlIO is enabled. Here is an
+example;
+
+ sub perlio_ok { exists $INC{"PerlIO/encoding.pm"} }
+
+By default, this method is defined as follows;
+
+ sub perlio_ok { 0 }
+
+=item -E<gt>needs_lines()
+
+If your encoding can work with PerlIO but needs line buffering, you
+MUST define this method so it returns true. 7bit ISO-2022 encodings
+are one example that needs this. When this method is missing, false
+is assumed.
+
+=back
+
+It should be noted that the I<$check> behaviour is different from the
outer public API. The logic is that the "unchecked" case is useful
-when encoding is part of a stream which may be reporting errors
-(e.g. STDERR). In such cases it is desirable to get everything
+when the encoding is part of a stream which may be reporting errors
+(e.g. STDERR). In such cases, it is desirable to get everything
through somehow without causing additional errors which obscure the
-original one. Also the encoding is best placed to know what the
+original one. Also, the encoding is best placed to know what the
correct replacement character is, so if that is the desired behaviour
then letting low level code do it is the most efficient.
-In contrast if check is true, the scheme above allows the encoding to
-do as much as it can and tell layer above how much that was. What is
-lacking at present is a mechanism to report what went wrong. The most
-likely interface will be an additional method call to the object, or
-perhaps (to avoid forcing per-stream objects on otherwise stateless
-encodings) and additional parameter.
+By contrast, if I<$check> is true, the scheme above allows the
+encoding to do as much as it can and tell the layer above how much
+that was. What is lacking at present is a mechanism to report what
+went wrong. The most likely interface will be an additional method
+call to the object, or perhaps (to avoid forcing per-stream objects
+on otherwise stateless encodings) an additional parameter.
It is also highly desirable that encoding classes inherit from
C<Encode::Encoding> as a base class. This allows that class to define
-additional behaviour for all encoding objects. For example built in
-Unicode, UCS-2 and UTF-8 classes use :
+additional behaviour for all encoding objects. For example, built-in
+Unicode, UCS-2, and UTF-8 classes use
package Encode::MyEncoding;
use base qw(Encode::Encoding);
__PACKAGE__->Define(qw(myCanonical myAlias));
-To create an object with bless {Name => ...},$class, and call
+to create an object with C<< bless {Name => ...}, $class >>, and call
define_encoding. They inherit their C<name> method from
C<Encode::Encoding>.
=head2 Compiled Encodings
-For the sake of speed and efficiency, Most of the encodings are now
-supported via I<Compiled Form> that are XS modules generated from UCM
-files. Encode provides enc2xs tool to achieve that. Please see
+For the sake of speed and efficiency, most of the encodings are now
+supported via a I<compiled form>: XS modules generated from UCM
+files. Encode provides the enc2xs tool to achieve that. Please see
L<enc2xs> for more details.
=head1 SEE ALSO
L<perlmod>, L<enc2xs>
-=for future
-
+=begin future
=over 4
=item Scheme 1
-Passed remaining fragment of string being processed.
-Modifies it in place to remove bytes/characters it can understand
-and returns a string used to represent them.
-e.g.
+The fixup routine gets passed the remaining fragment of string being
+processed. It modifies it in place to remove bytes/characters it can
+understand and returns a string used to represent them. For example:
sub fixup {
my $ch = substr($_[0],0,1,'');
return sprintf("\x{%02X}",ord($ch);
}
-This scheme is close to how underlying C code for Encode works, but gives
-the fixup routine very little context.
+This scheme is close to how the underlying C code for Encode works,
+but gives the fixup routine very little context.
=item Scheme 2
-Passed original string, and an index into it of the problem area, and
-output string so far. Appends what it will to output string and
-returns new index into original string. For example:
+The fixup routine gets passed the original string, an index into
+it of the problem area, and the output string so far. It appends
+what it wants to the output string and returns a new index into the
+original string. For example:
sub fixup {
# my ($s,$i,$d) = @_;
}
This scheme gives maximal control to the fixup routine but is more
-complicated to code, and may need internals of Encode to be tweaked to
-keep original string intact.
+complicated to code, and may require that the internals of Encode be tweaked to
+keep the original string intact.
=item Other Schemes
-Hybrids of above.
+Hybrids of the above.
Multiple return values rather than in-place modifications.
=back
+=end future
+
=cut
#
-# $Id: H2Z.pm,v 1.0 2002/03/28 23:26:28 dankogai Exp $
+# $Id: H2Z.pm,v 1.1 2002/04/22 03:43:05 dankogai Exp $
#
package Encode::JP::H2Z;
use strict;
-our $RCSID = q$Id: H2Z.pm,v 1.0 2002/03/28 23:26:28 dankogai Exp $;
-our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $RCSID = q$Id: H2Z.pm,v 1.1 2002/04/22 03:43:05 dankogai Exp $;
+our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Carp;
#$_PAT_Z2D = join("|", keys %_Z2D);
sub h2z {
+ no warnings qw(uninitialized);
my $r_str = shift;
my ($keep_dakuten) = @_;
my $n = 0;
package Encode::JP::JIS7;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.3 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode qw(:fallbacks);
sub needs_lines { 1 }
+sub perlio_ok {
+ exists $INC{"PerlIO/encoding.pm"} or return 0;
+ PerlIO::encoding->VERSION >= 0.03 and return 1;
+ return 0;
+}
+
use Encode::CJKConstants qw(:all);
our $DEBUG = 0;
my $residue = jis_euc(\$str);
# This is for PerlIO
$_[1] = $residue if $chk;
- # use perlqq fallback for euc-jp -> utf8
return Encode::decode('euc-jp', $str, FB_PERLQQ);
}
}
sub euc_jis{
+ no warnings qw(uninitialized);
my $r_str = shift;
my $jis0212 = shift;
$$r_str =~ s{
package Encode::KR::2022_KR;
-use Encode::KR;
+use Encode qw(:fallbacks);
use base 'Encode::Encoding';
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.3 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
my $canon = 'iso-2022-kr';
sub needs_lines { 1 }
+sub perlio_ok {
+ #exists $INC{"PerlIO/encoding.pm"} or return 0;
+ #PerlIO::encoding->VERSION >= 0.03 and return 1;
+ return 0; # for the time being
+}
+
sub decode
{
- my ($obj,$str,$chk) = @_;
+ my ($obj, $str, $chk) = @_;
my $res = $str;
- iso_euc(\$res);
- return Encode::decode('euc-kr', $res, $chk);
+ my $residue = iso_euc(\$res);
+ # This is for PerlIO
+ $_[1] = $residue if $chk;
+ return Encode::decode('euc-kr', $res, FB_PERLQQ);
}
sub encode
{
- my ($obj,$str,$chk) = @_;
- my $res = Encode::encode('euc-kr', $str, $chk);
- euc_iso(\$res);
- return $res;
+ my ($obj, $utf8, $chk) = @_;
+ # empty the input string in the stack so perlio is ok
+ $_[1] = '' if $chk;
+ my $octet = Encode::encode('euc-jp', $utf8, FB_PERLQQ) ;
+ euc_iso(\$octet);
+ return $octet;
}
use Encode::CJKConstants qw(:all);
sub iso_euc{
my $r_str = shift;
$$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator
- $$r_str =~ s{ # replace chars. in GL
- \x0e # between SO(\x0e) and SI(\x0f)
- ([^\x0f]*) # with chars. in GR
+ $$r_str =~ s{ # replace characters in GL
+ \x0e # between SO(\x0e) and SI(\x0f)
+ ([^\x0f]*) # with characters in GR
\x0f
- }
+ }
{
- my $out= $1;
+ my $out= $1;
$out =~ tr/\x21-\x7e/\xa1-\xfe/;
$out;
}geox;
- $$r_str;
+ my ($residue) = ($$r_str =~ s/(\e.*)$//so);
+ return $residue;
}
sub euc_iso{
+ no warnings qw(uninitialized);
my $r_str = shift;
substr($$r_str,0,0)=$ESC{'2022_KR'}; # put the designator at the beg.
- $$r_str =~ s{ # move KS X 1001 chars. in GR to GL
- ($RE{EUC_C}+) # and enclose them with SO and SI
- }{
- my $str = $1;
- $str =~ tr/\xA1-\xFE/\x21-\x7E/;
- "\x0e" . $str . "\x0f";
- }geox;
+ $$r_str =~ s{ # move KS X 1001 characters in GR to GL
+ ($RE{EUC_C}+) # and enclose them with SO and SI
+ }{
+ my $str = $1;
+ $str =~ tr/\xA1-\xFE/\x21-\x7E/;
+ "\x0e" . $str . "\x0f";
+ }geox;
$$r_str;
}
It is very common to want to do encoding transformations when
reading or writing files, network connections, pipes etc.
If Perl is configured to use the new 'perlio' IO system then
-C<Encode> provides a "layer" (See L<PerlIO>) which can transform
+C<Encode> provides a "layer" (see L<PerlIO>) which can transform
data as it is read or written.
Here is how the blind poet would modernise the encoding:
close($utf8);
close($illiad);
-In addition the new IO system can also be configured to read/write
-UTF-8 encoded characters (as noted above this is efficient):
+In addition, the new IO system can also be configured to read/write
+UTF-8 encoded characters (as noted above, this is efficient):
open(my $fh,'>:utf8','anything');
print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
Either of the above forms of "layer" specifications can be made the default
for a lexical scope with the C<use open ...> pragma. See L<open>.
-Once a handle is open is layers can be altered using C<binmode>.
+Once a handle is open, its layers can be altered using C<binmode>.
-Without any such configuration, or if Perl itself is built using
-system's own IO, then write operations assume that file handle accepts
-only I<bytes> and will C<die> if a character larger than 255 is
-written to the handle. When reading, each octet from the handle
-becomes a byte-in-a-character. Note that this default is the same
-behaviour as bytes-only languages (including Perl before v5.6) would
-have, and is sufficient to handle native 8-bit encodings
-e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
-other encodings and binary data.
+Without any such configuration, or if Perl itself is built using the
+system's own IO, then write operations assume that the file handle
+accepts only I<bytes> and will C<die> if a character larger than 255 is
+written to the handle. When reading, each octet from the handle becomes
+a byte-in-a-character. Note that this default is the same behaviour
+as bytes-only languages (including Perl before v5.6) would have,
+and is sufficient to handle native 8-bit encodings e.g. iso-8859-1,
+EBCDIC etc. and any legacy mechanisms for handling other encodings
+and binary data.
-In other cases it is the programs responsibility to transform
+In other cases, it is the program's responsibility to transform
characters into bytes using the API above before doing writes, and to
transform the bytes read from a handle into characters before doing
"character operations" (e.g. C<lc>, C</\W+/>, ...).
You can also use PerlIO to convert larger amounts of data you don't
-want to bring into memory. For example to convert between ISO-8859-1
+want to bring into memory. For example, to convert between ISO-8859-1
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
Here is a crude diagram of how filehandle, PerlIO, and Encode
interact.
- filehandle <-> PerlIO PerlIO <-> scalar (read/printed)
- \ /
+ filehandle <-> PerlIO PerlIO <-> scalar (read/printed)
+ \ /
Encode
-When PerlIO receives data from either direction, it fills in the buffer
-(currently with 1024 bytes) and pass the buffer to Encode. Encode tries
-to convert the valid part and pass it back to PerlIO, leaving invalid
-parts (usually partial character) in buffer. PerlIO then appends more
-data in buffer, call Encode, and so on until the data stream ends.
+When PerlIO receives data from either direction, it fills a buffer
+(currently with 1024 bytes) and passes the buffer to Encode.
+Encode tries to convert the valid part and passes it back to PerlIO,
+leaving invalid parts (usually a partial character) in the buffer.
+PerlIO then appends more data to the buffer, calls Encode again,
+and so on until the data stream ends.
To do so, PerlIO always calls (de|en)code methods with CHECK set to 1.
-this ensures that the method stops at the right place when it
+This ensures that the method stops at the right place when it
encounters partial character. The following is what happens when
PerlIO and Encode tries to encode (from utf8) more than 1024 bytes
-long and the buffer boundary happens to be between a character.
+and the buffer boundary happens to be in the middle of a character.
A B C .... ~ \x{3000} ....
41 42 43 .... 7E e3 80 80 ....
because it is invalid (partial character).
Unfortunately, this scheme does not work well with escape-based
-encoding such as ISO-2022-JP. Let's see what happens in that case
+encodings such as ISO-2022-JP. Let's see what happens in that case
in the next chapter.
=head1 BUGS
-Now let's see what happens when you try to decode form ISO-2022-JP and
-the buffer cuts in the middle of a character
+Now let's see what happens when you try to decode from ISO-2022-JP and
+the buffer ends in the middle of a character.
JIS208-ESC \x{5f3e}
A B C .... ~ \e $ B |DAN | ....
As you see, the next buffer begins with \x43. But \x43 is 'C' in
ASCII, which is wrong in this case because we are now in JISX 0208
area so it has to convert \x43\x46, not \x43. Unlike utf8 and EUC,
-in escape-based encoding you can't tell if it a given octed is a whole
+in escape-based encodings you can't tell if a given octet is a whole
character or just part of it.
There are actually several ways to solve this problem but none of
-which is fast enough to be practical. From Encode's point of view
-the easiest solution is for PerlIO to implement line buffer instead
-of fixed-length buffer but that makes PerlIO really complicated.
+them is fast enough to be practical. From Encode's point of view,
+the easiest solution is for PerlIO to implement a line buffer instead
+of a fixed-length buffer, but that makes PerlIO really complicated.
-So for the time being, using escape-based encodings in ":encoding()"
-layer of PerlIO does not work well.
+So for the time being, using escape-based encodings in the
+":encoding()" layer of PerlIO does not work well.
=head2 Workaround
print $l;
}
-=head2 How can you tell my encoding fully supports PerlIO ?
+=head2 How can I tell whether my encoding fully supports PerlIO ?
-As of this writing, Any encoding which class belongs to Encode::XS and
-Encode::Unicode works. Encode module has C<perlio_ok> method so you
-can use it before appling PerlIO encoding to the filehandle. Here is
-an example;
+As of this writing, any encoding whose class belongs to Encode::XS and
+Encode::Unicode works. The Encode module has a C<perlio_ok> method
+which you can use before appling PerlIO encoding to the filehandle.
+Here is an example:
my $use_perlio = perlio_ok($enc);
my $layer = $use_perlio ? "<:raw" : "<:encoding($enc)";
L<utf8>,
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
-
=cut
=head1 NAME
-Encode::Supported -- Supported encodings by Encode
+Encode::Supported -- Encodings supported by Encode
=head1 DESCRIPTION
=head2 Encoding Names
Encoding names are case insensitive. White space in names
-is ignored. In addition an encoding may have aliases.
+is ignored. In addition, an encoding may have aliases.
Each encoding has one "canonical" name. The "canonical"
name is chosen from the names of the encoding by picking
the first in the following sequence (with a few exceptions).
-=over
+=over 4
=item *
=item *
-The MIME name as defined in IETF RFCs This includes all "iso-"'s.
+The MIME name as defined in IETF RFCs. This includes all "iso-"s.
=item *
=head2 Encode::Unicode -- other Unicode encodings
Unicode coding schemes other than native utf8 are supported by
-Encode::Unicode which will be autoloaded on demand.
+Encode::Unicode, which will be autoloaded on demand.
----------------------------------------------------------------
UCS-2BE UCS-2, iso-10646-1 [IANA, UC]
UTF-32LE [UC]
----------------------------------------------------------------
-To find how those (UCS-2|UTF-(16|32))(LE|BE)? differ to one another,
+To find how (UCS-2|UTF-(16|32))(LE|BE)? differ from one another,
see L<Encode::Unicode>.
=head2 Encode::Byte -- Extended ASCII
-Encode::Byte implements most of single-byte encodings except for
-Symbols and EBCDIC. The following encodings are based single-byte
-encoding implemented as extended ASCII. For most cases it uses
-\x80-\xff (upper half) to map non-ASCII characters.
+Encode::Byte implements most single-byte encodings except for
+Symbols and EBCDIC. The following encodings are based on single-byte
+encodings implemented as extended ASCII. Most of them map
+\x80-\xff (upper half) to non-ASCII characters.
-=over 2
+=over 4
=item ISO-8859 and corresponding vendor mappings
Since there are so many, they are presented in table format with
-languages and corresponding encoding names by vendors. Note the table
-is sorted in order of ISO-8859 and the corresponding vendor mappings
-are slightly different from that of ISO. See
+languages and corresponding encoding names by vendors. Note that
+the table is sorted in order of ISO-8859 and the corresponding vendor
+mappings are slightly different from that of ISO. See
L<http://czyborra.com/charsets/iso8859.html> for details.
Lang/Regions ISO/Other Std. DOS Windows Macintosh Others
----------------------------------------------------------------
N. America (ASCII) cp437 AdobeStandardEncoding
cp863 (DOSCanadaF)
- W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep
+ W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep
hp-roman8
cp860 (DOSPortuguese)
Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman
Latin3 [1] iso-8859-3
Latin4 [2] iso-8859-4
Cyrillics iso-8859-5 cp855 cp1251 MacCyrillic
- (Also see next section) cp866 MacUkrainian
+ (See also next section) cp866 MacUkrainian
Arabic iso-8859-6 cp864 cp1256 MacArabic
cp1006 MacFarsi
Greek iso-8859-7 cp737 cp1253 MacGreek
Vietnamese viscii cp1258 MacVietnamese
----------------------------------------------------------------
- [1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-5.
- [2] Baltics. Now on 8859-10.
+ [1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.
+ [2] Baltics. Now on 8859-10, except for Latvian.
[3] Also know as TIS 620.
- [4] Nicknamed Latin0; Euro sign as well as French and Finnish
- letters that are missing from 8859-1 are added.
+ [4] Nicknamed Latin0; the Euro sign as well as French and Finnish
+ letters that are missing from 8859-1 were added.
All cp* are also available as ibm-*, ms-*, and windows-* . See also
L<http://czyborra.com/charsets/codepages.html>.
Macintosh encodings don't seem to be registered in such entities as
IANA. "Canonical" names in Encode are based upon Apple's Tech Note
1150. See L<http://developer.apple.com/technotes/tn/tn1150.html>
-for details
+for details.
-=item KOI8 - De Facto Standard for Cyrillic world
+=item KOI8 - De Facto Standard for the Cyrillic world
-Though ISO-8859 does have ISO-8859, KOI8 series is far more popular
-in the Net. L<Encode> comes with the following KOI charsets.
+Though ISO-8859 does have ISO-8859-5, the KOI8 series is far more
+popular in the Net. L<Encode> comes with the following KOI charsets.
For gory details, see L<http://czyborra.com/charsets/cyrillic.html>
----------------------------------------------------------------
GSM0338 is for GSM handsets. Though it shares alphanumerals with
ASCII, control character ranges and other parts are mapped very
differently, presumably to store Greek and Cyrillic alphabets.
-This is also covered in Encode::Byte even though it does not
-comply to extended ASCII.
+This is also covered in Encode::Byte even though it is not an
+"extended ASCII" encoding.
=back
-=head2 The CJK: Chinese, Japanese, Korean (Multibyte)
+=head2 CJK: Chinese, Japanese, Korean (Multibyte)
Note that Vietnamese is listed above. Also read "Encoding vs Charset"
-below. Also note these are implemented in distinct module by
-languages, due the the size concerns. Please refer to their
-respective document pages.
+below. Also note that these are implemented in distinct modules by
+countries, due the the size concerns (simplified Chinese is mapped
+to 'CN', continental China, while traditional Chinese is mapped to
+'TW', Taiwan). Please refer to their respective documentataion pages.
=over 4
iso-ir-165
----------------------------------------------------------------
- [1] GB2312 is aliased to this. see L<Microsoft-related naming mess>
- [2] gbk is aliased to this. see L<Microsoft-related naming mess>
+ [1] GB2312 is aliased to this. See L<Microsoft-related naming mess>
+ [2] gbk is aliased to this. See L<Microsoft-related naming mess>
=item Encode::JP -- Japan
=head1 Unsupported encodings
-The following are not supported as yet. Some because they are rarely
-used, some because of technical difficulties. They may be supported by
-external modules via CPAN in future, however.
+The following encodings are not supported as yet; some because they
+are rarely used, some because of technical difficulties. They may
+be supported by external modules via CPAN in the future, however.
=over 4
=item ISO-2022-JP-2 [RFC1554]
Not very popular yet. Needs Unicode Database or equivalent to
-implement encode() (Because it includes JIS X 0208/0212, KSC5601, and
-GB2312 simultaneously, which code points in Unicode overlap. So you
-need to lookup the database to determine what character set a given
+implement encode() (because it includes JIS X 0208/0212, KSC5601, and
+GB2312 simultaneously, whose code points in Unicode overlap. So you
+need to lookup the database to determine to what character set a given
Unicode character should belong).
=item ISO-2022-CN [RFC1922]
-Not very popular. Needs CNS 11643-1 and 2 which are not available in
+Not very popular. Needs CNS 11643-1 and -2 which are not available in
this module. CNS 11643 is supported (via euc-tw) in Encode::HanExtra.
-Autrijus may add support for this encoding in his module in future.
+Autrijus Tang may add support for this encoding in his module in future.
-=item various UP-UX encodings
+=item Various HP-UX encodings
The following are unsupported due to the lack of mapping data.
=item Cyrillic encoding ISO-IR-111
-Anton doubts its usefulness.
+Anton Tagunov doubts its usefulness.
=item ISO-8859-8-1 [Hebrew]
=item Vietnamese encodings VPS
-Though Jungshik has reported that Mozilla supports this encoding it
-was too late before 5.8.0 for us to add one. In future via a separate
-module. See
+Though Jungshik Shin has reported that Mozilla supports this encoding,
+it was too late before 5.8.0 for us to add it. In the future, it
+may be available via a separate module. See
L<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf>
and
L<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut>
MacSinhalese, MacTamil, MacTelugu, MacTibetan
MacVietnamese
-The rest of which already available are based upon the vendor mappings
+The rest which are already available are based upon the vendor mappings
at L<http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/> .
=item (Mac) Indic encodings
-The maps for the following is available at L<http://www.unicode.org/>
-but remains unsupport because those encodings need algorithmical
-approach, currently unsupported by F<enc2xs>
+The maps for the following are available at L<http://www.unicode.org/>
+but remain unsupport because those encodings need algorithmical
+approach, currently unsupported by F<enc2xs>:
MacDevanagari
MacGurmukhi
=head1 Encoding vs. Charset -- terminology
-We are used to using the term (character) I<encoding> and I<character set>
-interchangeably. But just as using the term byte and character is
-dangerous and should be differentiated when needed, we need to
-differentiate I<encoding> and I<character set>.
+We are used to using the term (character) I<encoding> and I<character
+set> interchangeably. But just as confusing the terms byte and
+character is dangerous and the terms should be differentiated when
+needed, we need to differentiate I<encoding> and I<character set>.
-To understand that, it's follow how we make computers grok our characters.
+To understand that, here is a description of how we make computers
+grok our characters.
=over 4
=item *
Then we have to give each character a unique ID so your computer can
-tell the difference from 'a' to 'A'. This itemized character
+tell the difference between 'a' and 'A'. This itemized character
repertoire is now a I<character set>.
=item *
If your computer can grow the character set without further
-processing, you can go ahead use it. This is called a I<coded
+processing, you can go ahead and use it. This is called a I<coded
character set> (CCS) or I<raw character encoding>. ASCII is used this
way for most cases.
=item *
-But in many cases especially multi-byte CJK encodings, you have to
+But in many cases, especially multi-byte CJK encodings, you have to
tweak a little more. Your network connection may not accept any data
-with the Most Significant Bit set, Your computer may not be able to
+with the Most Significant Bit set, and your computer may not be able to
tell if a given byte is a whole character or just half of it. So you
have to I<encode> the character set to use it.
A I<character encoding scheme> (CES) determines how to encode a given
character set, or a set of multiple character sets. 7bit ISO-2022 is
-an example of CES. You switch between character sets via I<escape
-sequence>.
+an example of a CES. You switch between character sets via I<escape
+sequences>.
=back
-Technically, or Mathematically speaking, a character set encoded in
+Technically, or mathematically, speaking, a character set encoded in
such a CES that maps character by character may form a CCS. EUC is such
-an example. CES of EUC is as follows;
+an example. The CES of EUC is as follows:
=over 4
=item *
-You can also use 0x8e and 0x8f to tell the following sequence of
-characters belong to yet another character set. each following byte
-is added by 0x80
+You can also use 0x8e and 0x8f to indicate that the following sequence of
+characters belongs to yet another character set. To each following byte
+is added the value 0x80.
=back
-By carefully looking at at the encoded byte sequence, you may find the
-byte sequence conforms a unique number. In that sense EUC is a CCS
+By carefully looking at the encoded byte sequence, you can find that the
+byte sequence conforms a unique number. In that sense, EUC is a CCS
generated by a CES above from up to four CCS (complicated?). UTF-8
-falls into this category. See L<perlUnicode/"UTF-8"> to find how
+falls into this category. See L<perlUnicode/"UTF-8"> to find out how
UTF-8 maps Unicode to a byte sequence.
-You may also find by now why 7bit ISO-2022 cannot conform a CCS. If
-you look at a byte sequence \x21\x21, you can't tell if it is two !'s
-or IDEOGRAPHIC SPACE. EUC maps the latter to \xA1\xA1 so you have no
-trouble between "!!". and " "
+You may also have found out by now why 7bit ISO-2022 cannot comprise
+a CCS. If you look at a byte sequence \x21\x21, you can't tell if
+it is two !'s or IDEOGRAPHIC SPACE. EUC maps the latter to \xA1\xA1
+so you have no trouble differentiating between "!!". and S<" ">.
=head1 Encoding Classification (by Anton Tagunov and Dan Kogai)
choose the most suitable aliases to name them in the context of
such communication.
-=over 2
+=over 4
=item *
-To (en|de) code Encodings marked as C<(**)>, You need
+To (en|de)code encodings marked by C<(**)>, you need
C<Encode::HanExtra>, available from CPAN.
=back
Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1
EUC-KR Big5 GB2312
-are registered to IANA as preferred MIME names and may probably
+are registered with IANA as preferred MIME names and may
be used over the Internet.
C<Shift_JIS> has been officialized by JIS X 0208:1997.
have not been registered with IANA (as of March 2002) but
seem to be supported by major web browsers.
-IANA name for C<EUC-CN> is C<GB2312>.
+The IANA name for C<EUC-CN> is C<GB2312>.
KS_C_5601-1987
Jungshik Shin reports that UTF-16 with a BOM is well accepted
by MS IE 5/6 and NS 4/6. Beware however that
-=over 2
+=over 4
=item *
C<UTF-8> coded data seamlessly passes traditional
command piping (C<cat>, C<more>, etc.) while C<UTF-16> coded
-data is likely to cause confusion (with it's zero bytes,
+data is likely to cause confusion (with its zero bytes,
for example)
=item *
it is beyond the power of words to describe the way HTML browsers
-encode non-C<ASCII> form data. To get a general impression visit
+encode non-C<ASCII> form data. To get a general impression, visit
L<http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html>.
-While encoding of form data has stabilized for C<UTF-8> coded pages
-(at least IE 5/6, NS 6, Opera 6 behave consistently), be sure to
-expect fun (and cross-browser discrepancies) with C<UTF-16> coded
+While encoding of form data has stabilized for C<UTF-8> encoded pages
+(at least IE 5/6, NS 6, and Opera 6 behave consistently), be sure to
+expect fun (and cross-browser discrepancies) with C<UTF-16> encoded
pages!
=back
BIG5PLUS (**)
-is a bit proprietary name.
+is a proprietary name.
=head2 Microsoft-related naming mess
Microsoft products misuse the following names:
-=over 2
+=over 4
=item KS_C_5601-1987
JIS has not endorsed the full Microsoft standard however.
The official C<Shift_JIS> includes only JIS X 0201 and JIS X 0208
-character sets, while Microsoft has always been meaning C<Shift_JIS>
+character sets, while Microsoft has always used C<Shift_JIS>
to encode a wider character repertoire. See C<IANA> registration for
C<Windows-31J>.
-As a historical predecessor Microsoft's variant
-probably has more rights for the name, albeit it may be objected
+As a historical predecessor, Microsoft's variant
+probably has more rights for the name, though it may be objected
that Microsoft shouldn't have used JIS as part of the name
in the first place.
=head1 Glossary
-=over 2
+=over 4
=item character repertoire
-A collection of unique characters. A I<character> set in the most
-strict sense. At this stage characters are not numbered.
+A collection of unique characters. A I<character> set in the strictest
+sense. At this stage, characters are not numbered.
=item coded character set (CCS)
A character set that is mapped in a way computers can use directly.
-Many character encodings including EUC falls in this category.
+Many character encodings, including EUC, fall in this category.
=item character encoding scheme (CES)
has long been used in the meaning of C<encoding>, CES.
-While C<character set> word combination has lost this meaning
-in MIME context since [RFC 2130], C<charset> abbreviation has
-retained it. This is how [RFC 2277], [RFC 2278] bless C<charset>:
-
+While the word combination C<character set> has lost this meaning
+in MIME context since [RFC 2130], the C<charset> abbreviation has
+retained it. This is how [RFC 2277] and [RFC 2278] bless C<charset>:
This document uses the term "charset" to mean a set of rules for
mapping from a sequence of octets to a sequence of characters, such
=item EUC
-Extended Unix Character. See ISO-2022
+Extended Unix Character. See ISO-2022.
=item ISO-2022
-A CES that was carefully designed to coexist with ASCII. There are 7
-bit version and 8 bit version.
+A CES that was carefully designed to coexist with ASCII. There are a 7
+bit version and an 8 bit version.
-7 bit version switches character set via escape sequence so this
+The 7 bit version switches character set via escape sequence so it
cannot form a CCS. Since this is more difficult to handle in programs
-than the 8 bit version, 7 bit version is not very popular except for
-iso-2022-jp, the de facto standard CES for e-mails.
+than the 8 bit version, the 7 bit version is not very popular except for
+iso-2022-jp, the I<de facto> standard CES for e-mails.
-8 bit version can conform a CCS. EUC and ISO-8859 are two examples
+The 8 bit version can form a CCS. EUC and ISO-8859 are two examples
thereof. Pre-5.6 perl could use them as string literals.
=item UCS
Short for I<Universal Character Set>. When you say just UCS, it means
-I<Unicode>
+I<Unicode>.
=item UCS-2
=item Unicode
-A Character Set that aims to include all character repertoire of the
+A character set that aims to include all character repertoires of the
world. Many character sets in various national as well as industrial
standards have become, in a way, just subsets of Unicode.
=item UTF
Short for I<Unicode Transformation Format>. Determines how to map a
-Unicode character into byte sequence.
+Unicode character into a byte sequence.
=item UTF-16
A UTF in 16-bit encoding. Can either be in big endian or little
-endian. Big endian version is called UTF-16BE (equals to UCS-2 +
-Surrogate Support) and little endian version is UTF-16LE.
+endian. The big endian version is called UTF-16BE (equal to UCS-2 +
+surrogate support) and the little endian version is called UTF-16LE.
=back
=head1 References
-=over 2
+=over 4
=item ECMA
European Computer Manufacturers Association
L<http://www.ecma.ch>
-=over 2
+=over 4
-=item EMCA-035 (eq C<ISO-2022>)
+=item ECMA-035 (eq C<ISO-2022>)
L<http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM>
-The very specification of ISO-2022 is available from the link above.
+The specification of ISO-2022 is available from the link above.
=back
Internet Assigned Numbers Authority
L<http://www.iana.org/>
-=over 2
+=over 4
=item Assigned Charset Names by IANA
=item RFC
Request For Comments -- need I say more?
-L<http://www.rfc.net/>, L<http://www.faqs.org/rfcs/>
+L<http://www.rfc-editor.org/>, L<http://www.rfc.net/>,
+L<http://www.faqs.org/rfcs/>
=item UC
Unicode Consortium
L<http://www.unicode.org/>
-=over 2
+=over 4
=item Unicode Glossary
=head2 Other Notable Sites
-=over 2
+=over 4
=item czyborra.com
L<ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf>
-You will find brief info on C<EUC-CN>, C<GBK> and mostly on C<GB 18030>
+You will find brief info on C<EUC-CN>, C<GBK> and mostly on C<GB 18030>.
=item Jungshik Shin's Hangul FAQ
L<http://jshin.net/faq>
-And especially it's subject 8.
+And especially its subject 8.
L<http://jshin.net/faq/qa8.html>
A comprehensive overview of the Korean (C<KS *>) standards.
+=item debian.org: "Introduction to i18n"
+
+A brief description for most of the mentioned CJK encodings is
+contained in
+L<http://www.debian.org/doc/manuals/intro-i18n/ch-codes.en.html>
+
=back
=head2 Offline sources
-=over 2
+=over 4
=item C<CJKV Information Processing> by Ken Lunde
CJKV Information Processing
1999 O'Reilly & Associates, ISBN : 1-56592-224-7
-The modern successor of the C<CJK.inf>.
+The modern successor of C<CJK.inf>.
-Features a comprehensive coverage on CJKV character sets and
+Features a comprehensive coverage of CJKV character sets and
encodings along with many other issues faced by anyone trying
to better support CJKV languages/scripts in all the areas of
information processing.
-To purchase this book visit
+To purchase this book, visit
L<http://www.oreilly.com/catalog/cjkvinfo/>
+or your favourite bookstore.
=back
=cut
-
-I could not find this page because the hostname doesn't resolve!
-
-Brief description for most of the mentioned CJK encodings
-L<http://www.debian.org.ru/doc/manuals/intro-i18n/ch-codes.html>
#
-# $Id: jperl.t,v 1.21 2002/04/14 22:05:20 dankogai Exp $
+# $Id: jperl.t,v 1.23 2002/04/22 09:48:07 dankogai Exp dankogai $
#
# This script is written in euc-jp
print "1..0 # Skip: EBCDIC\n";
exit 0;
}
- require Encode;
- eval { require PerlIO::encoding };
- unless ($INC{"PerlIO/encoding.pm"}
- and PerlIO::encoding->VERSION >= 0.02
- ){
- print "1..0 # Skip:: PerlIO::encoding 0.02 or better required\n";
- exit 0;
- }
- # warn "PerlIO::encoding->VERSION == ", PerlIO::encoding->VERSION, "\n";
$| = 1;
}
use strict;
use File::Basename;
use File::Spec;
-use File::Compare;
+use File::Compare qw(compare_text);
use File::Copy;
use FileHandle;
#use Test::More qw(no_plan);
-use Test::More tests => 20;
+use Test::More tests => 28;
our $DEBUG = 0;
+use Encode (":all");
+eval { require PerlIO::encoding };
+
{
no warnings;
@ARGV and $DEBUG = shift;
- require Encode::JP::JIS7;
- $Encode::JP::JIS7::DEBUG = $DEBUG;
+ #require Encode::JP::JIS7;
+ #require Encode::KR::2022_KR;
+ #$Encode::JP::JIS7::DEBUG = $DEBUG;
}
-Encode->import(":all");
-my $dir = dirname(__FILE__);
-my $ufile = File::Spec->catfile($dir,"jisx0208.ref");
-open my $fh, "<:utf8", $ufile or die "$ufile : $!";
-my @uline = <$fh>;
-my $utext = join('' => @uline);
-close $fh;
+
my $seq = 0;
+my $dir = dirname(__FILE__);
-for my $e (qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/){
- my $sfile = File::Spec->catfile($dir,"$$.sio");
- my $pfile = File::Spec->catfile($dir,"$$.pio");
+my %e =
+ (
+ jisx0208 => [ qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/],
+ #ksc5601 => [ qw/euc-kr iso-2022-kr/],
+ ksc5601 => [ qw/euc-kr/],
+ #gb2312 => [ qw/euc-cn hz/],
+ gb2312 => [ qw/euc-cn/],
+ );
- # first create a file without perlio
- dump2file($sfile, &encode($e, $utext, 0));
- # then create a file via perlio without autoflush
-
- SKIP:{
- skip "$e: !perlio_ok", 1 unless perlio_ok($e) or $DEBUG;
- open $fh, ">:encoding($e)", $pfile or die "$sfile : $!";
- binmode $fh;
- $fh->autoflush(0);
- print $fh $utext;
- close $fh;
- $seq++;
- unless (is(compare($sfile, $pfile), 0 => ">:encoding($e)")){
- copy $sfile, "$sfile.$seq";
- copy $pfile, "$pfile.$seq";
- }
- }
-
- # this time print line by line.
- # works even for ISO-2022!
- open $fh, ">:encoding($e)", $pfile or die "$sfile : $!";
- binmode $fh;
- $fh->autoflush(1);
- for my $l (@uline) {
- print $fh $l;
- }
+for my $src(sort keys %e) {
+ my $ufile = File::Spec->catfile($dir,"$src.ref");
+ open my $fh, "<:utf8", $ufile or die "$ufile : $!";
+ my @uline = <$fh>;
+ my $utext = join('' => @uline);
close $fh;
- $seq++;
- unless(is(compare($sfile, $pfile), 0
- => ">:encoding($e); by lines")){
- copy $sfile, "$sfile.$seq";
- copy $pfile, "$pfile.$seq";
- }
- SKIP:{
- skip "$e: !perlio_ok", 2 unless perlio_ok($e) or $DEBUG;
- open $fh, "<:encoding($e)", $pfile or die "$pfile : $!";
- $fh->autoflush(0);
- my $dtext = join('' => <$fh>);
- close $fh;
- $seq++;
- unless(ok($utext eq $dtext, "<:encoding($e)")){
- dump2file("$sfile.$seq", $utext);
- dump2file("$pfile.$seq", $dtext);
- }
- $dtext = '';
- open $fh, "<:encoding($e)", $pfile or die "$pfile : $!";
- while(defined(my $l = <$fh>)) {
- $dtext .= $l;
- }
- close $fh;
- $seq++;
- unless (ok($utext eq $dtext, "<:encoding($e); by lines")) {
- dump2file("$sfile.$seq", $utext);
- dump2file("$pfile.$seq", $dtext);
+ for my $e (@{$e{$src}}){
+ my $sfile = File::Spec->catfile($dir,"$$.sio");
+ my $pfile = File::Spec->catfile($dir,"$$.pio");
+
+ # first create a file without perlio
+ dump2file($sfile, &encode($e, $utext, 0));
+
+ # then create a file via perlio without autoflush
+
+ TODO:{
+ #local $TODO = "$e: !perlio_ok" unless (perlio_ok($e) or $DEBUG);
+ todo_skip "$e: !perlio_ok", 4 unless (perlio_ok($e) or $DEBUG);
+ no warnings 'uninitialized';
+ open $fh, ">:encoding($e)", $pfile or die "$sfile : $!";
+ $fh->autoflush(0);
+ print $fh $utext;
+ close $fh;
+ $seq++;
+ is(compare_text($sfile, $pfile), 0 => ">:encoding($e)");
+ if ($DEBUG){
+ copy $sfile, "$sfile.$seq";
+ copy $pfile, "$pfile.$seq";
+ }
+
+ # this time print line by line.
+ # works even for ISO-2022 but not ISO-2022-KR
+ open $fh, ">:encoding($e)", $pfile or die "$sfile : $!";
+ $fh->autoflush(1);
+ for my $l (@uline) {
+ print $fh $l;
+ }
+ close $fh;
+ $seq++;
+ is(compare_text($sfile, $pfile), 0 => ">:encoding($e) by lines");
+ if ($DEBUG){
+ copy $sfile, "$sfile.$seq";
+ copy $pfile, "$pfile.$seq";
+ }
+ my $dtext;
+ open $fh, "<:encoding($e)", $pfile or die "$pfile : $!";
+ $fh->autoflush(0);
+ $dtext = join('' => <$fh>);
+ close $fh;
+ $seq++;
+ ok($utext eq $dtext, "<:encoding($e)");
+ if ($DEBUG){
+ dump2file("$sfile.$seq", $utext);
+ dump2file("$pfile.$seq", $dtext);
+ }
+ if (perlio_ok($e) or $DEBUG){
+ $dtext = '';
+ open $fh, "<:encoding($e)", $pfile or die "$pfile : $!";
+ while(defined(my $l = <$fh>)) {
+ $dtext .= $l;
+ }
+ close $fh;
+ }
+ $seq++;
+ ok($utext eq $dtext, "<:encoding($e) by lines");
+ if ($DEBUG){
+ dump2file("$sfile.$seq", $utext);
+ dump2file("$pfile.$seq", $dtext);
+ }
}
+ $DEBUG or unlink ($sfile, $pfile);
}
- $DEBUG or unlink ($sfile, $pfile);
}
+
sub dump2file{
no warnings;
#
-# $Id: big5.ucm,v 1.0 2002/03/28 23:26:25 dankogai Exp dankogai $
+# $Id: big5-eten.ucm,v 1.2 2002/04/22 03:41:13 dankogai Exp $
#
# ./compile -n big5-eten -o Encode/big5-eten.ucm Encode/big5-eten.enc
<code_set_name> "big5-eten"
#
-# $Id: big5-hkscs.ucm,v 1.0 2002/03/28 23:26:25 dankogai Exp $
+# $Id: big5-hkscs.ucm,v 1.2 2002/04/22 03:41:13 dankogai Exp $
#
<code_set_name> "big5-hkscs"
<mb_cur_min> 1