From: Jarkko Hietaniemi Date: Mon, 22 Apr 2002 12:44:09 +0000 (+0000) Subject: Upgrade to Encode 1.56, from Dan Kogai. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=0ab8f81ed97bef3f6feac6e615e45b8291ca05fa;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 1.56, from Dan Kogai. p4raw-id: //depot/perl@16070 --- diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS index 6c8cc9b..2ba72f8 100644 --- a/ext/Encode/AUTHORS +++ b/ext/Encode/AUTHORS @@ -1,11 +1,11 @@ -# To give due honor to those who have made Encode module what is is today, -# here are easily-from-changelogs-extractable people and their -# (hopefully) current and preferred email addresses (as of early 2001, +# To give due honour to those who have made the Encode module what it +# is today, here are easily-from-changelogs-extractable people and their +# (hopefully) current and preferred email addresses (as of early 2002, # if known). # # The use of this database for anything else than Encode and/or Perl # development is strictly forbidden. (Passive distribution with the Perl -# source code kit or CPAN is naturally allowed.) +# source code kit or CPAN is, of course, allowed.) # # This list is in alphabetical order. -- @@ -16,10 +16,12 @@ Benjamin Goldberg Craig A. Berry Dan Kogai Gerrit P. 
Haase +Gurusamy Sarathy Jarkko Hietaniemi Jungshik Shin Laszlo Molnar Mark-Jason Dominus +Mattia Barbon Michael G Schwern Nicholas Clark Nick Ing-Simmons diff --git a/ext/Encode/Byte/Byte.pm b/ext/Encode/Byte/Byte.pm index e570505..745ca3c 100644 --- a/ext/Encode/Byte/Byte.pm +++ b/ext/Encode/Byte/Byte.pm @@ -1,6 +1,6 @@ package Encode::Byte; use Encode; -our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use XSLoader; XSLoader::load(__PACKAGE__,$VERSION); @@ -15,7 +15,7 @@ Encode::Byte - Single Byte Encodings =head1 SYNOPSIS use Encode qw/encode decode/; - $greek = encode("iso-885-7", $utf8); # loads Encode::Byte implicitly + $greek = encode("iso-8859-7", $utf8); # loads Encode::Byte implicitly $utf8 = decode("iso-8859-7", $greek); # ditto =head1 ABSTRACT @@ -78,7 +78,7 @@ supported are as follows. cp1251 WinCyrillic cp1252 WinLatin1 cp1253 WinGreek - cp1254 WinTurkiskh + cp1254 WinTurkish cp1255 WinHebrew cp1256 WinArabic cp1257 WinBaltic diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm index c031f5c..e163c06 100644 --- a/ext/Encode/CN/CN.pm +++ b/ext/Encode/CN/CN.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::CN not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; @@ -62,12 +62,12 @@ When you see C on mails and web pages, they really mean C encodings. To fix that, C is aliased to C. Use C when you really mean it. -ASCII part (0x00-0x7f) is preserved for all encodings, even though it -conflicts with mappings by the Unicode Consortium. See +The ASCII region (0x00-0x7f) is preserved for all encodings, even though +this conflicts with mappings by the Unicode Consortium. See L -to find why it is implemented that way. 
+to find out why it is implemented that way. =head1 SEE ALSO diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 657976d..d34ebce 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,42 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.52 2002/04/20 23:43:47 dankogai Exp dankogai $ +# $Id: Changes,v 1.56 2002/04/22 09:48:07 dankogai Exp dankogai $ # -1.52 $Date: 2002/04/20 23:43:47 $ +$Revision: 1.56 $ $Date: 2002/04/22 09:48:07 $ +! Encode.pm encoding.pm t/perlio.t t/jperl.t + New PerlIO::encoding 0.04 compliance met + +1.55 2002/04/22 03:43:05 +! Encode.pm Encode.xs Unicode/Unicode.pm + needs_lines() defined so Encode::Encoding is no longer needed + for perlio + +1.54 2002/04/22 02:50:01 +! Encode.pm! Encode.xs! Unicode/Unicode.pm t/perlio.t +! lib/Encode/Encoding.pm lib/Encode/CN/HZ.pm + now perlio_ok is true by default if PerlIO::encoding->VERSION is + 0.03 or larger. POD in Encode::Encoding revised to reflect this. + Encode::XS and Encode::Unicode now has perlio_ok() method. +! lib/Encode/Supported.pod + s/UP-UX/HP-UX/ by jhi +! AUTHORS Byte/Byte.pm CN/CN.pm Encode.pm JP/JP.pm KR/KR.pm README +! Symbol/Symbol.pm TW/TW.pm Unicode/Unicode.pm bin/enc2xs bin/piconv +! bin/ucmlint encoding.pm lib/Encode/Alias.pm lib/Encode/CN/HZ.pm +! lib/Encode/Config.pm lib/Encode/Encoder.pm lib/Encode/Encoding.pm +! lib/Encode/KR/2022_KR.pm lib/Encode/PerlIO.pod +! lib/Encode/Supported.pod + Huge document fixes by Philip. +! AUTHORS +! t/JP.t + s/compare\(/compare_text\(/o by Sarathy. Adds him to AUTHORS + http://public.activestate.com/cgi-bin/perlbrowse?patch=16049 +! t/perlio.t + binmode() after "<:encoding" to make Win32 happy, by Mattia. + Mattia added to AUTHORS file + Message-Id: <3CC3150F.5798.22A05AE@localhost> + +1.52 2002/04/20 23:43:47 ! t/perlio.t TODO: is now SKIP:, as NI-XS requested. Also adds more eraborate failure analysis added. 
@@ -410,7 +443,7 @@ Typo fixes and improvements by jhi Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al. -1.11 $Date: 2002/04/20 23:43:47 $ +1.11 $Date: 2002/04/22 09:48:07 $ + t/encoding.t + t/jperl.t ! MANIFEST diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index fb80200..dbd7934 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,6 +1,6 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.52 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.56 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; use XSLoader (); XSLoader::load 'Encode'; @@ -66,9 +66,9 @@ sub encodings sub perlio_ok{ exists $INC{"PerlIO/encoding.pm"} or return 0; - my $stash = ref($_[0]); - $stash ||= ref(find_encoding($_[0])); - return ($stash eq "Encode::XS" || $stash eq "Encode::Unicode"); + my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]); + $obj->can("perlio_ok") and return $obj->perlio_ok() unless $@; + return 0; # safety net } sub define_encoding @@ -253,26 +253,11 @@ sub predefine_encodings{ $_[1] = '' if $chk; return $octets; }; - $Encode::Encoding{utf8} = + $Encode::Encoding{utf8} = bless {Name => "utf8"} => "Encode::utf8"; } } -require Encode::Encoding; -@Encode::XS::ISA = qw(Encode::Encoding); - -# This is very dodgy - PerlIO::encoding does "use Encode" and _BEFORE_ it gets a -# chance to set its VERSION we potentially delete it from %INC so it will be re-loaded -# NI-S -eval { - require PerlIO::encoding; - unless (PerlIO::encoding->VERSION >= 0.02){ - delete $INC{"PerlIO/encoding.pm"}; - } -}; -# warn $@ if $@; -@Encode::XS::ISA = qw(Encode::Encoding); - 1; __END__ @@ -285,13 +270,12 @@ Encode - character encodings use Encode; - =head2 Table of Contents -Encode consists of a collection of modules which details are too big +Encode consists of a collection of modules whose details are too big to fit in one document. 
This POD itself explains the top-level APIs and general topics at a glance. For other topics and more details, -see the PODs below; +see the PODs below: Name Description -------------------------------------------------------- @@ -317,16 +301,16 @@ codepoint" for the character (the exceptions are those platforms where the legacy encoding is some variant of EBCDIC rather than a super-set of ASCII - see L). -Traditionally computer data has been moved around in 8-bit chunks +Traditionally, computer data has been moved around in 8-bit chunks often called "bytes". These chunks are also known as "octets" in networking standards. Perl is widely used to manipulate data of many types - not only strings of characters representing human or computer -languages but also "binary" data being the machines representation of +languages but also "binary" data being the machine's representation of numbers, pixels in an image - or just about anything. -When Perl is processing "binary data" the programmer wants Perl to +When Perl is processing "binary data", the programmer wants Perl to process "sequences of bytes". This is not a problem for Perl - as a -byte has 256 possible values it easily fits in Perl's much larger +byte has 256 possible values, it easily fits in Perl's much larger "logical character". =head2 TERMINOLOGY @@ -346,7 +330,7 @@ I: a character in the range 0..255 =item * I: 8 bits of data, with ordinal values 0..255 -(Term for bytes passed to or from a non-Perl context, e.g. disk file.) +(Term for bytes passed to or from a non-Perl context, e.g. a disk file.) =back @@ -360,32 +344,32 @@ and such details may change in future releases. =item $octets = encode(ENCODING, $string[, CHECK]) -Encodes string from Perl's internal form into I and returns +Encodes a string from Perl's internal form into I and returns a sequence of octets. ENCODING can be either a canonical name or -alias. For encoding names and aliases, see L. -For CHECK see L. +an alias. 
For encoding names and aliases, see L. +For CHECK, see L. -For example to convert (internally UTF-8 encoded) Unicode string to +For example, to convert (internally UTF-8 encoded) Unicode string to iso-8859-1 (also known as Latin1), $octets = encode("iso-8859-1", $unicode); =item $string = decode(ENCODING, $octets[, CHECK]) -Decode sequence of octets assumed to be in I into Perl's -internal form and returns the resulting string. as in encode(), -ENCODING can be either a canonical name or alias. For encoding names -and aliases, see L. For CHECK see +Decodes a sequence of octets assumed to be in I into Perl's +internal form and returns the resulting string. As in encode(), +ENCODING can be either a canonical name or an alias. For encoding names +and aliases, see L. For CHECK, see L. -For example to convert ISO-8859-1 data to UTF-8: +For example, to convert ISO-8859-1 data to UTF-8: $utf8 = decode("iso-8859-1", $latin1); =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK]) -Convert B the data between two encodings. -For example to convert ISO-8859-1 data to UTF-8: +Converts B data between two encodings. +For example, to convert ISO-8859-1 data to UTF-8: from_to($data, "iso-8859-1", "utf-8"); @@ -394,36 +378,37 @@ and to convert it back: from_to($data, "utf-8", "iso-8859-1"); Note that because the conversion happens in place, the data to be -converted cannot be a string constant, it must be a scalar variable. +converted cannot be a string constant; it must be a scalar variable. -from_to() return the length of the converted string on success, undef +from_to() returns the length of the converted string on success, undef otherwise. =back =head2 UTF-8 / utf8 -The Unicode consortium defines the UTF-8 standard as a way of encoding -the entire Unicode repertoire as sequences of octets. This encoding is -expected to become very widespread. 
Perl can use this form internally -to represent strings, so conversions to and from this form are -particularly efficient (as octets in memory do not have to change, -just the meta-data that tells Perl how to treat them). +The Unicode Consortium defines the UTF-8 transformation format as a +way of encoding the entire Unicode repertoire as sequences of octets. +This encoding is expected to become very widespread. Perl can use this +form internally to represent strings, so conversions to and from this +form are particularly efficient (as octets in memory do not have to +change, just the meta-data that tells Perl how to treat them). =over 4 =item $octets = encode_utf8($string); -The characters that comprise string are encoded in Perl's superset of UTF-8 -and the resulting octets returned as a sequence of bytes. All possible -characters have a UTF-8 representation so this function cannot fail. +The characters that comprise $string are encoded in Perl's superset of +UTF-8 and the resulting octets are returned as a sequence of bytes. All +possible characters have a UTF-8 representation so this function cannot +fail. =item $string = decode_utf8($octets [, CHECK]); The sequence of octets represented by $octets is decoded from UTF-8 into a sequence of logical characters. Not all sequences of octets form valid UTF-8 encodings, so it is possible for this call to fail. -For CHECK see L. +For CHECK, see L. =back @@ -438,7 +423,7 @@ ones that are not loaded yet, say @all_encodings = Encode->encodings(":all"); -Or you can give the name of specific module. +Or you can give the name of a specific module. @with_jp = Encode->encodings("Encode::JP"); @@ -446,12 +431,12 @@ When "::" is not in the name, "Encode::" is assumed. @ebcdic = Encode->encodings("EBCDIC"); -To find which encodings are supported by this package in details, +To find out in detail which encodings are supported by this package, see L. 
=head2 Defining Aliases -To add new alias to a given encoding, Use; +To add a new alias to a given encoding, use: use Encode; use Encode::Alias; @@ -469,16 +454,16 @@ i.e. Encode::resolve_alias("iso-8859-12") # false; nonexistent Encode::resolve_alias($name) eq $name # true if $name is canonical -This resolve_alias() does not need C and is -exported via C. +resolve_alias() does not need C; it can be +exported via C. -See L on details. +See L for details. =head1 Encoding via PerlIO -If your perl supports I, you can use PerlIO layer to directly -decode and encode via filehandle. The following two examples are -totally identical by functionality. +If your perl supports I, you can use a PerlIO layer to decode +and encode directly via a filehandle. The following two examples +are totally identical in their functionality. # via PerlIO open my $in, "<:encoding(shiftjis)", $infile or die; @@ -486,82 +471,90 @@ totally identical by functionality. while(<>){ print; } # via from_to - open my $in, $infile or die; - open my $out, $outfile or die; + open my $in, "<", $infile or die; + open my $out, ">", $outfile or die; while(<>){ - from_to($_, "shiftjis", "euc", 1); + from_to($_, "shiftjis", "euc-jp", 1); } -Unfortunately, not all encodings are PerlIO-savvy. You can check if -your encoding is supported by PerlIO by C method. +Unfortunately, there may be encodings that are not PerlIO-savvy. You can check +if your encoding is supported by PerlIO by calling the C +method. + + Encode::perlio_ok("hz"); # False + find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available + + use Encode qw(perlio_ok); # exported upon request + perlio_ok("euc-jp") - Encode::perlio_ok("iso-20220jp"); # false - find_encoding("iso-2022-jp")->perlio_ok; # false - use Encode qw(perlio_ok); # exported upon request - perlio_ok("euc-jp") # true if PerlIO is enabled +Fortunately, all encodings that come with Encode core are PerlIO-savvy +except for hz and ISO-2022-kr. See L for details.
-For gory details, see L; +For gory details, see L. =head1 Handling Malformed Data =over 4 -THE I argument is used as follows. When you omit it, it is -identical to I = 0. +The I argument is used as follows. When you omit it, +the behaviour is the same as if you had passed a value of 0 for +I. =item I = Encode::FB_DEFAULT ( == 0) -If I is 0, (en|de)code will put I in -place of the malformed character. for UCM-based encodings, -EsubcharE will be used. For Unicode, \xFFFD is used. If the -data is supposed to be UTF-8, an optional lexical warning (category -utf8) is given. +If I is 0, (en|de)code will put a I +in place of a malformed character. For UCM-based encodings, +EsubcharE will be used. For Unicode, "\x{FFFD}" is used. +If the data is supposed to be UTF-8, an optional lexical warning +(category utf8) is given. =item I = Encode::DIE_ON_ERROR (== 1) -If I is 1, methods will die immediately with an error -message. so when I is set, you should trap the fatal error -with eval{} unless you really want to let it die on error. +If I is 1, methods will die immediately with an error +message. Therefore, when I is set to 1, you should trap the +fatal error with eval{} unless you really want to let it die on error. =item I = Encode::FB_QUIET If I is set to Encode::FB_QUIET, (en|de)code will immediately -return processed part on error, with data passed via argument -overwritten with unprocessed part. This is handy when have to -repeatedly call because the source data is chopped in the middle for -some reasons, such as fixed-width buffer. Here is a sample code that -just does this. +return the portion of the data that has been processed so far when +an error occurs. The data argument will be overwritten with +everything after that point (that is, the unprocessed part of data). 
+This is handy when you have to call decode repeatedly in the case +where your source data may contain partial multi-byte character +sequences, for example because you are reading with a fixed-width +buffer. Here is some sample code that does exactly this: my $data = ''; while(defined(read $fh, $buffer, 256)){ - # buffer may end in partial character so we append + # buffer may end in a partial character so we append $data .= $buffer; $utf8 .= decode($encoding, $data, ENCODE::FB_QUIET); - # $data now contains unprocessed partial character + # $data now contains the unprocessed partial character } =item I = Encode::FB_WARN -This is the same as above, except it warns on error. Handy when you -are debugging the mode above. +This is the same as above, except that it warns on error. Handy when +you are debugging the mode above. =item perlqq mode (I = Encode::FB_PERLQQ) For encodings that are implemented by Encode::XS, CHECK == Encode::FB_PERLQQ turns (en|de)code into C fallback mode. -When you decode, '\xI' will be placed where I is the hex -representation of the octet that could not be decoded to utf8. And -when you encode, '\x{I}' will be placed where I is the -Unicode ID of the character that cannot be found in the character -repertoire of the encoding. +When you decode, '\xI' will be inserted for a malformed character, +where I is the hex representation of the octet that could not be +decoded to utf8. And when you encode, '\x{I}' will be inserted, +where I is the Unicode ID of the character that cannot be found +in the character repertoire of the encoding. =item The bitmask -These modes are actually set via bitmask. here is how FB_XX are laid -out. for FB_XX you can import via C for -generic bitmask constants, you can import via - C. +These modes are actually set via a bitmask. Here is how the FB_XX +constants are laid out. You can import the FB_XX constants via +C; you can import the generic bitmask +constants via C. 
FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ DIE_ON_ERR 0x0001 X @@ -570,9 +563,9 @@ generic bitmask constants, you can import via LEAVE_SRC 0x0008 PERLQQ 0x0100 X -=head2 Unemplemented fallback schemes +=head2 Unimplemented fallback schemes -In future you will be able to use a code reference to a callback +In the future, you will be able to use a code reference to a callback function for the value of I but its API is still undecided. =head1 Defining Encodings @@ -583,38 +576,38 @@ To define a new encoding, use: define_encoding($object, 'canonicalName' [, alias...]); I will be associated with I<$object>. The object -should provide the interface described in L +should provide the interface described in L. If more than two arguments are provided then additional -arguments are taken as aliases for I<$object> as for C. +arguments are taken as aliases for I<$object>, as for C. See L for more details. =head1 Messing with Perl's Internals The following API uses parts of Perl's internals in the current -implementation. As such they are efficient, but may change. +implementation. As such, they are efficient but may change. =over 4 =item is_utf8(STRING [, CHECK]) -[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. +[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING. If CHECK is true, also checks the data in STRING for being well-formed UTF-8. Returns true if successful, false otherwise. =item _utf8_on(STRING) -[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is +[INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is B checked for being well-formed UTF-8. Do not use unless you B that the STRING is well-formed UTF-8. Returns the previous -state of the UTF-8 flag (so please don't test the return value as -I success or failure), or C if STRING is not a string. +state of the UTF-8 flag (so please don't treat the return value as +indicating success or failure), or C if STRING is not a string. 
=item _utf8_off(STRING) -[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. -Returns the previous state of the UTF-8 flag (so please don't test the -return value as I success or failure), or C if STRING is +[INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously. +Returns the previous state of the UTF-8 flag (so please don't treat the +return value as indicating success or failure), or C if STRING is not a string. =back @@ -634,7 +627,7 @@ the Perl Unicode Mailing List Eperl-unicode@perl.orgE =head1 MAINTAINER This project was originated by Nick Ing-Simmons and later maintained -by Dan Kogai Edankogai@dan.co.jpE. See AUTHORS for full list +by Dan Kogai Edankogai@dan.co.jpE. See AUTHORS for a full list of people involved. For any questions, use Eperl-unicode@perl.orgE so others can share. diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index b898780..929e66c 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.31 2002/04/20 23:43:47 dankogai Exp dankogai $ + $Id: Encode.xs,v 1.33 2002/04/22 03:43:05 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -8,14 +8,15 @@ #include "XSUB.h" #define U8 U8 #include "encode.h" +# define PERLIO_FILENAME "PerlIO/encoding.pm" /* set 1 or more to profile. t/encoding.t dumps core because of Perl_warner and PerlIO don't work well */ -#define ENCODE_XS_PROFILE 0 +#define ENCODE_XS_PROFILE 0 /* set 0 to disable floating point to calculate buffer size for encode_method(). 1 is recommended. 
2 restores NI-S original */ -#define ENCODE_XS_USEFP 1 +#define ENCODE_XS_USEFP 1 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ Perl_croak(aTHX_ "panic_unimplemented"); \ @@ -119,40 +120,40 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } case ENCODE_NOREP: /* encoding */ - if (dir == enc->f_utf8) { + if (dir == enc->f_utf8) { STRLEN clen; UV ch = - utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), + utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); if (check & ENCODE_DIE_ON_ERR) { Perl_croak( - aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d", + aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d", ch, enc->name[0], __LINE__); }else{ if (check & ENCODE_RETURN_ON_ERR){ if (check & ENCODE_WARN_ON_ERR){ Perl_warner( aTHX_ packWARN(WARN_UTF8), - "\"\\N{U+%" UVxf "}\" does not map to %s", + "\"\\N{U+%" UVxf "}\" does not map to %s", ch,enc->name[0]); } goto ENCODE_SET_SRC; }else if (check & ENCODE_PERLQQ){ - SV* perlqq = + SV* perlqq = sv_2mortal(newSVpvf("\\x{%04x}", ch)); sdone += slen + clen; ddone += dlen + SvCUR(perlqq); sv_catsv(dst, perlqq); - } else { + } else { /* fallback char */ sdone += slen + clen; - ddone += dlen + enc->replen; - sv_catpvn(dst, (char*)enc->rep, enc->replen); + ddone += dlen + enc->replen; + sv_catpvn(dst, (char*)enc->rep, enc->replen); } - } + } } /* decoding */ - else { + else { if (check & ENCODE_DIE_ON_ERR){ Perl_croak( aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)", @@ -167,22 +168,22 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } goto ENCODE_SET_SRC; }else if (check & ENCODE_PERLQQ){ - SV* perlqq = + SV* perlqq = sv_2mortal(newSVpvf("\\x%02X", s[slen])); sdone += slen + 1; ddone += dlen + SvCUR(perlqq); sv_catsv(dst, perlqq); } else { sdone += slen + 1; - ddone += dlen + strlen(FBCHAR_UTF8); - sv_catpv(dst, FBCHAR_UTF8); + ddone += dlen + strlen(FBCHAR_UTF8); + sv_catpv(dst, FBCHAR_UTF8); } } } /* settle variables when fallback */ d = (U8 
*)SvEND(dst); - dlen = SvLEN(dst) - ddone - 1; - s = (U8*)SvPVX(src) + sdone; + dlen = SvLEN(dst) - ddone - 1; + s = (U8*)SvPVX(src) + sdone; slen = tlen - sdone; break; @@ -205,10 +206,10 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, if (code && !(check & ENCODE_RETURN_ON_ERR)) { return &PL_sv_undef; } - + SvCUR_set(dst, dlen+ddone); SvPOK_only(dst); - + #if ENCODE_XS_PROFILE if (SvCUR(dst) > SvCUR(src)){ Perl_warn(aTHX_ @@ -217,7 +218,7 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); } #endif - + ENCODE_END: *SvEND(dst) = '\0'; return dst; @@ -263,6 +264,32 @@ CODE: XSRETURN(1); } +void +Method_needs_lines(obj) +SV * obj +CODE: +{ + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + ST(0) = &PL_sv_no; + XSRETURN(1); +} + +void +Method_perlio_ok(obj) +SV * obj +CODE: +{ + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + if (hv_exists(get_hv("INC", 0), + PERLIO_FILENAME, strlen(PERLIO_FILENAME))) + { + ST(0) = &PL_sv_yes; + }else{ + ST(0) = &PL_sv_no; + } + XSRETURN(1); +} + MODULE = Encode PACKAGE = Encode PROTOTYPES: ENABLE @@ -273,7 +300,7 @@ SV * sv CODE: { SV * encoding = items == 2 ? 
ST(1) : Nullsv; - + if (encoding) RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); else { @@ -310,7 +337,7 @@ CODE: /* Must do things the slow way */ U8 *dest; /* We need a copy to pass to check() */ - U8 *src = (U8*)savepv((char *)s); + U8 *src = (U8*)savepv((char *)s); U8 *send = s + len; New(83, dest, len, U8); /* I think */ @@ -335,8 +362,8 @@ CODE: /* Note change to utf8.c variable naming, for variety */ while (ulen--) { - if ((*s & 0xc0) != 0x80){ - goto failure; + if ((*s & 0xc0) != 0x80){ + goto failure; } else { uv = (uv << 6) | (*s++ & 0x3f); } @@ -422,7 +449,7 @@ CODE: OUTPUT: RETVAL -int +int WARN_ON_ERR() CODE: RETVAL = ENCODE_WARN_ON_ERR; diff --git a/ext/Encode/JP/JP.pm b/ext/Encode/JP/JP.pm index 27fca7d..260d07f 100644 --- a/ext/Encode/JP/JP.pm +++ b/ext/Encode/JP/JP.pm @@ -5,7 +5,7 @@ BEGIN { } } use Encode; -our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use XSLoader; XSLoader::load(__PACKAGE__,$VERSION); @@ -54,7 +54,7 @@ supported are as follows. =head1 DESCRIPTION -To find how to use this module in detail, see L. +To find out how to use this module in detail, see L. =head1 Note on ISO-2022-JP(-1)? @@ -63,9 +63,12 @@ adds support for JIS X 0212-1990. That means you can use the same code to decode to utf8 but not vice versa. $utf8 = decode('iso-2022-jp-1', $stream); + +and + $utf8 = decode('iso-2022-jp', $stream); -Yields the same result but +yield the same result but $with_0212 = encode('iso-2022-jp-1', $utf8); @@ -73,19 +76,19 @@ is now different from $without_0212 = encode('iso-2022-jp', $utf8 ); -In the latter case, characters that map to 0212 are at first converted -to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu') then -fed to decoding engine. U+FFFD is not used to preserve text layout as -much as possible. 
+In the latter case, characters that map to 0212 are first converted +to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu' or +'geta mark') then fed to the decoding engine. U+FFFD is not used, +in order to preserve text layout as much as possible. =head1 BUGS -ASCII part (0x00-0x7f) is preserved for all encodings, even though it -conflicts with mappings by the Unicode Consortium. See +The ASCII region (0x00-0x7f) is preserved for all encodings, even +though this conflicts with mappings by the Unicode Consortium. See L -to find why it is implemented that way. +to find out why it is implemented that way. =head1 SEE ALSO diff --git a/ext/Encode/KR/KR.pm b/ext/Encode/KR/KR.pm index f7c9a82..55bd9c0 100644 --- a/ext/Encode/KR/KR.pm +++ b/ext/Encode/KR/KR.pm @@ -4,12 +4,14 @@ BEGIN { die "Encode::KR not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; XSLoader::load(__PACKAGE__,$VERSION); +use Encode::KR::2022_KR; + 1; __END__ @@ -56,12 +58,12 @@ mean "cp949" encodings. To fix that, the following aliases are set; qr/(?:x-)?windows-949$/i => '"cp949"' qr/ks_c_5601-1987$/i => '"cp949"' -ASCII part (0x00-0x7f) is preserved for all encodings, even though it -conflicts with mappings by the Unicode Consortium. See +The ASCII region (0x00-0x7f) is preserved for all encodings, even +though this conflicts with mappings by the Unicode Consortium. See L -to find why it is implemented that way. +to find out why it is implemented that way. 
=head1 SEE ALSO diff --git a/ext/Encode/README b/ext/Encode/README index b4078d8..0951ffa 100644 --- a/ext/Encode/README +++ b/ext/Encode/README @@ -13,7 +13,7 @@ DESCRIPTION INSTALLATION -To install this module type the following: +To install this module, type the following: perl Makefile.PL make @@ -25,14 +25,14 @@ To install scripts under bin/ directories also, perl Makefile.PL MORE_SCRIPTS make && make test && make install -by default, only enc2xs and piconv are installed. +By default, only enc2xs and piconv are installed. To install *.ucm files also, say perl Makefile.PL INSTALL_UCM make && make test && make install -by default, *.ucm are not installed. +By default, *.ucm are not installed. DEPENDENCIES @@ -41,10 +41,10 @@ This module requires perl5.7.3 or later. MAINTAINER This project was originated by Nick Ing-Simmons and later maintained by -Dan Kogai . See AUTHORS for full list of people +Dan Kogai . See AUTHORS for the full list of people involved. QUESTIONS? -If you have any questions "perldoc Encode" does not answer, please +If you have any questions which "perldoc Encode" does not answer, please feel free to ask at perl-unicode@perl.org. diff --git a/ext/Encode/Symbol/Symbol.pm b/ext/Encode/Symbol/Symbol.pm index 9aed69d..23bd274 100644 --- a/ext/Encode/Symbol/Symbol.pm +++ b/ext/Encode/Symbol/Symbol.pm @@ -1,6 +1,6 @@ package Encode::Symbol; use Encode; -our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use XSLoader; XSLoader::load(__PACKAGE__,$VERSION); @@ -33,7 +33,7 @@ supported are as follows. =head1 DESCRIPTION -To find how to use this module in detail, see L. +To find out how to use this module in detail, see L. 
=head1 SEE ALSO diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm index 0761318..8670071 100644 --- a/ext/Encode/TW/TW.pm +++ b/ext/Encode/TW/TW.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::TW not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; @@ -25,7 +25,8 @@ Encode::TW - Taiwan-based Chinese Encodings =head1 DESCRIPTION -This module implements Taiwan-based Chinese charset encodings. +This module implements traditional Chinese charset encodings as used +in Taiwan and Hong Kong. Encodings supported are as follows. Canonical Alias Description @@ -39,7 +40,7 @@ Encodings supported are as follows. = Big5 + Microsoft vendor mappings -------------------------------------------------------------------- -To find how to use this module in detail, see L. +To find out how to use this module in detail, see L. =head1 NOTES @@ -53,20 +54,20 @@ extra China-based encodings. Since the original C encoding (1984) is not supported anywhere (glibc and DOS-based systems uses C to mean C; Microsoft -uses C to mean C), a concious decision was made to alias +uses C to mean C), a conscious decision was made to alias C to C, which is the de facto superset of the original big5. The C encoding files are not complete. For common C manipulation, please use C in L, which contains -plane 1-7. +planes 1-7. -ASCII part (0x00-0x7f) is preserved for all encodings, even though it -conflicts with mappings by the Unicode Consortium. See +The ASCII region (0x00-0x7f) is preserved for all encodings, even +though this conflicts with mappings by the Unicode Consortium. See L -to find why it is implemented that way. +to find out why it is implemented that way.
=head1 SEE ALSO diff --git a/ext/Encode/Unicode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm index 257989a..fdf826e 100644 --- a/ext/Encode/Unicode/Unicode.pm +++ b/ext/Encode/Unicode/Unicode.pm @@ -3,7 +3,7 @@ package Encode::Unicode; use strict; use warnings; -our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.34 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use XSLoader; XSLoader::load(__PACKAGE__,$VERSION); @@ -47,11 +47,19 @@ sub new_sequence return bless {%$self},ref($self); } +sub needs_lines { 0 }; + +sub perlio_ok { + exists $INC{"PerlIO/encoding.pm"} or return 0; + return 1; +} + # -# three implementation of (en|de)code exist. XS version is the fastest. -# *_modern use # an array and *_classic stick with substr. *_classic is -# much slower but more memory conservative. *_xs is default. +# three implementations of (en|de)code exist. The XS version is the +# fastest. *_modern uses an array and *_classic sticks with substr. +# *_classic is much slower but more memory conservative. +# *_xs is the default. sub set_transcoder{ no warnings qw(redefine); @@ -273,7 +281,7 @@ __END__ =head1 NAME -Encode::Unicode -- Various Unicode Transform Format +Encode::Unicode -- Various Unicode Transformation Formats =cut @@ -308,8 +316,8 @@ UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE. UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf UTF-32 4 Y - is bogus As is BE/LE - UTF-32BE 4 N - bogus As is 0x0010abcd - UTF-32LE 4 N - bogus As is 0xcdab1000 + UTF-32BE 4 N - bogus As is 0x0001abcd + UTF-32LE 4 N - bogus As is 0xcdab0100 UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d ---------------+-----------------+------------------------------ @@ -317,38 +325,41 @@ UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE and UTF-32LE. 
=head1 Size, Endianness, and BOM -You can categorize these CES by 3 criteria; Size of each character, -Endianness, and Byte Order Mark. +You can categorize these CES by 3 criteria: size of each character, +endianness, and Byte Order Mark. -=head2 by Size +=head2 by size UCS-2 is a fixed-length encoding with each character taking 16 bits. -It B support I. When a surrogate pair is -encountered during decode(), its place is filled with \xFFFD without -I or croaks if I. When a character whose ord value is -larger than 0xFFFF is encountered, it uses 0xFFFD without I or -croaks if . - -UTF-16 is almost the same as UCS-2 but it supports I. +It B support I. When a surrogate pair +is encountered during decode(), its place is filled with \x{FFFD} +if I is 0, or the routine croaks if I is 1. When a +character whose ord value is larger than 0xFFFF is encountered, +its place is filled with \x{FFFD} if I is 0, or the routine +croaks if I is 1. + +UTF-16 is almost the same as UCS-2 but it supports I. When it encounters a high surrogate (0xD800-0xDBFF), it fetches the -following low surrogate (0xDC00-0xDFFF), Cs them to form a -character. Bogus surrogates result in death. When \x{10000} or above -is encountered during encode(), it Cs them and pushes the -surrogate pair to the output stream. +following low surrogate (0xDC00-0xDFFF) and Cs them to +form a character. Bogus surrogates result in death. When \x{10000} +or above is encountered during encode(), it Cs them and +pushes the surrogate pair to the output stream. UTF-32 is a fixed-length encoding with each character taking 32 bits. -Since it is 32-bit there is no need for I. +Since it is 32-bit, there is no need for I. -=head2 by Endianness +=head2 by endianness -First (and now failed) goal of Unicode was to map all character -repertories into a fixed-length integer so programmers are happy. -Since each character is either I or I in C, you have to -put endianness of each platform when you pass data to one another. 
+The first (and now failed) goal of Unicode was to map all character +repertoires into a fixed-length integer so that programmers are happy. +Since each character is either a I or I in C, you have to +pay attention to the endianness of each platform when you pass data +to one another. Anything marked as BE is Big Endian (or network byte order) and LE is -Little Endian (aka VAX byte order). For anything without, a character -called Byte Order Mark (BOM) is prepended to the head of string. +Little Endian (aka VAX byte order). For anything not marked either +BE or LE, a character called Byte Order Mark (BOM) indicating the +endianness is prepended to the string. =over 4 @@ -362,31 +373,31 @@ called Byte Order Mark (BOM) is prepended to the head of string. =back -This modules handles BOM as follows. +This module handles the BOM as follows. =over 4 =item * When BE or LE is explicitly stated as the name of encoding, BOM is -simply treated as one of characters (ZERO WIDTH NO-BREAK SPACE). +simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE). =item * -When BE or LE is omitted during decode(), it checks if BOM is in the -beginning of the string and if found endianness is set to what BOM -says. If not found, dies. +When BE or LE is omitted during decode(), it checks if BOM is at the +beginning of the string; if one is found, the endianness is set to +what the BOM says. If no BOM is found, the routine dies. =item * When BE or LE is omitted during encode(), it returns a BE-encoded string with BOM prepended. So when you want to encode a whole text -file, make sure you encode() by whole text, not line by line or each -line, not file, is prepended with BOMs. +file, make sure you encode() the whole text at once, not line by line +or each line, not file, will have a BOM prepended. =item * -C is an exception. Unlike others this is an alias of UCS-2BE. +C is an exception. Unlike others, this is an alias of UCS-2BE.
UCS-2 is already registered by IANA and others that way. =back @@ -404,18 +415,19 @@ magnitude so let's forgive them. Vogons here ;) Or, comparing Encode to Babel Fish is completely appropriate -- if you can only stick this into your ear :) -Surrogate pairs were born when Unicode Consortium finally +Surrogate pairs were born when the Unicode Consortium finally admitted that 16 bits were not big enough to hold all the world's -character repertoire. But they have already made UCS-2 16-bit. What +character repertoires. But they already made UCS-2 16-bit. What do we do? -Back then 0xD800-0xDFFF was not allocated. Let's split them half and -use the first half to represent C and the -latter C. That way you can represent 1024 -* 1024 = 1048576 more characters. Now we can store character ranges -up to \x{10ffff} even with 16-bit encodings. This pair of -half-character is now called a I and UTF-16 is the -name of the encoding that embraces them. +Back then, the range 0xD800-0xDFFF was not allocated. Let's split +that range in half and use the first half to represent the C and the second half to represent the C. That way, you can represent 1024 * 1024 = +1048576 more characters. Now we can store character ranges up to +\x{10ffff} even with 16-bit encodings. This pair of half-character is +now called a I and UTF-16 is the name of the encoding +that embraces them. Here is a formula to ensurrogate a Unicode character \x{10000} and above; @@ -432,9 +444,7 @@ perl does not prohibit the use of characters within this range. To perl, every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I. (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit - integer support! (**) - - (**) Is anything beyond \x{11_0000} still Unicode :? + integer support! 
=head1 SEE ALSO diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs index bdee3c8..4689b49 100644 --- a/ext/Encode/Unicode/Unicode.xs +++ b/ext/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 1.3 2002/04/20 23:43:47 dankogai Exp dankogai $ + $Id: Unicode.xs,v 1.3 2002/04/20 23:43:47 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT diff --git a/ext/Encode/bin/enc2xs b/ext/Encode/bin/enc2xs index f837a47..554167a 100644 --- a/ext/Encode/bin/enc2xs +++ b/ext/Encode/bin/enc2xs @@ -8,7 +8,7 @@ BEGIN { use strict; use Getopt::Std; my @orig_ARGV = @ARGV; -our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # These may get re-ordered. # RAW is a do_now as inserted by &enter @@ -989,25 +989,25 @@ enc2xs -- Perl Encode Module Generator =head1 DESCRIPTION F builds a Perl extension for use by Encode from either -Unicode Character Mapping files (.ucm) or Tcl Encoding Files -(.enc) Besides internally used during the build process of Encode -module, you can use F to add your own encoding to perl. No -knowledge on XS is necessary. +Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc). +Besides being used internally during the build process of the Encode +module, you can use F to add your own encoding to perl. +No knowledge of XS is necessary. =head1 Quick Guide -If what you want to know as little about Perl possible but needs to +If you want to know as little about Perl as possible but need to add a new encoding, just read this chapter and forget the rest. =over 4 =item 0. -Have a .ucm file ready. You can get it from somewhere or you can -write your own from scratch or you can grab one from Encode -distribution and customize. For UCM format, see the next Chapter. -In the example below, I'll call my theoretical encoding myascii, -defined inI. C<$> is a shell prompt. +Have a .ucm file ready. 
You can get it from somewhere or you can write +your own from scratch or you can grab one from the Encode distribution +and customize it. For the UCM format, see the next Chapter. In the +example below, I'll call my theoretical encoding myascii, defined +in I. C<$> is a shell prompt. $ ls -F my.ucm @@ -1027,11 +1027,13 @@ Now take a look at your current directory. It should look like this. $ ls -F Makefile.PL My.pm my.ucm t/ -The following files are created. +The following files were created. - Makefle.PL - MakeMaker script - My.pm - Encode Submodule - t/My.t - test file + Makefile.PL - MakeMaker script + My.pm - Encode submodule + t/My.t - test file + +=over 4 =item 1.1. @@ -1041,15 +1043,17 @@ If you want *.ucm installed together with the modules, do as follows; $ mv *.ucm Encode $ enc2xs -M My Encode/*ucm +=back + =item 2. Edit the files generated. You don't have to if you have no time AND no intention to give it to someone else. But it is a good idea to edit -pod and add more tests. +the pod and to add more tests. =item 3. -Now issue a command all Perl Mongers love; +Now issue a command all Perl Mongers love: $ perl5.7.3 Makefile.PL Writing Makefile for Encode::My @@ -1071,9 +1075,9 @@ Now all you have to do is make. chmod 644 blib/arch/auto/Encode/My/My.bs $ -The time it takes varies how fast your machine is and how large your -encoding is. Unless you are working on something big like euc-tw, it -won't take too long. +The time it takes varies depending on how fast your machine is and +how large your encoding is. Unless you are working on something big +like euc-tw, it won't take too long. =item 5. @@ -1094,7 +1098,7 @@ If you are content with the test result, just "make install" =item 7. 
-If you want to add your encoding to Encode demand-loading list +If you want to add your encoding to Encode's demand-loading list (so you don't have to "use Encode::YourEncoding"), run enc2xs -C @@ -1106,13 +1110,13 @@ After that, "use Encode;" is enough to load your encodings on demand. =head1 The Unicode Character Map -Encode uses The Unicode Character Map (UCM) for source character -mappings. This format is used by ICU package of IBM and adopted by -Nick Ing-Simmons. Since UCM is more flexible than Tcl's Encoding Map -and far more user-friendly, This is the recommended formet for -Encode now. +Encode uses the Unicode Character Map (UCM) format for source character +mappings. This format is used by IBM's ICU package and was adopted +by Nick Ing-Simmons for use with the Encode module. Since UCM is +more flexible than Tcl's Encoding Map and far more user-friendly, +this is the recommended format for Encode now. -UCM file looks like this. +A UCM file looks like this. # # Comments @@ -1138,25 +1142,25 @@ UCM file looks like this. =item * -Anything that follows C<#> is treated as comments. +Anything that follows C<#> is treated as a comment. =item * -The header section continues until CHARMAP. This section Has a form of -IkeywordE value>, one at a line. For a value, strings must -be quoted. Barewords are treated as numbers. I<\xXX> represents a -byte. +The header section continues until a line containing the word +CHARMAP. This section has a form of IkeywordE value>, one +pair per line. Strings used as values must be quoted. Barewords are +treated as numbers. I<\xXX> represents a byte. Most of the keywords are self-explanatory. I means substitution character, not subcharacter. When you decode a Unicode sequence to this encoding but no matching character is found, the byte sequence defined here will be used. For most cases, the value here is -\x3F, in ASCII this is a question mark. +\x3F; in ASCII, this is a question mark.
=item * CHARMAP starts the character map section. Each line has a form as -follows; +follows: \xXX.. |0 # comment ^ ^ ^ @@ -1164,20 +1168,21 @@ follows; | +-------- Encoded byte sequence +-------------- Unicode Character ID in hex -The format is roughly the same as a header section except for fallback -flag. It is | followed by 0..3. And their meaning as follows +The format is roughly the same as a header section except for the +fallback flag: | followed by 0..3. The meaning of the possible +values is as follows: -=over 2 +=over 4 =item |0 -Round trip safe. A character decoded to Unicode encodes back to the -same byte sequence. most character belong to this. +Round trip safe. A character decoded to Unicode encodes back to the +same byte sequence. Most characters have this flag. =item |1 Fallback for unicode -> encoding. When seen, enc2xs adds this -character for encode map only +character for the encode map only. =item |2 @@ -1186,7 +1191,7 @@ Skip sub-char mapping should there be no code point. =item |3 Fallback for encoding -> unicode. When seen, enc2xs adds this -character for decode map only +character for the decode map only. =back @@ -1197,16 +1202,16 @@ And finally, END OF CHARMAP ends the section. =back When you are manually creating a UCM file, you should copy ascii.ucm -or existing encoding which is close to yours than write your own from -scratch. +or an existing encoding which is close to yours, rather than write +your own from scratch. When you do so, make sure you leave at least B to B as -is, unless your environment is on EBCDIC. +is, unless your environment is EBCDIC. B: not all features in UCM are implemented. For example, icu:state is not used. Because of that, you need to write a perl -module if you want to support algorithmical encodings, notablly -ISO-2022 series. Such modules include L, +module if you want to support algorithmical encodings, notably +the ISO-2022 series. Such modules include L, L, and L. 
=head2 Coping with duplicate mappings @@ -1214,9 +1219,9 @@ L, and L. When you create a map, you SHOULD make your mappings round-trip safe. That is, C stands for all characters that are marked as C<|0>. Here is -how to make sure; +how to make sure: -=over 2 +=over 4 =item * @@ -1228,7 +1233,7 @@ When you have a duplicate entry, mark either one with '|1' or '|3'. =item * -And make sure '|1' or '|3' FOLLOWS '|0' entry. +And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry. =back @@ -1254,21 +1259,31 @@ down, here is what happens. (\xF9\xF9 => U2550 is now overwritten!) The Encode package comes with F, a crude but sufficient -utility to check the integrity of ucm file. Check under Encode/bin -directory for this. +utility to check the integrity of a UCM file. Check under the +Encode/bin directory for this. =head1 Bookmarks +=over 4 + +=item * + ICU Home Page L +=item * + ICU Character Mapping Tables L +=item * + ICU:Conversion Data L +=back + =head1 SEE ALSO L, diff --git a/ext/Encode/bin/piconv b/ext/Encode/bin/piconv index 050006e..81f3403 100644 --- a/ext/Encode/bin/piconv +++ b/ext/Encode/bin/piconv @@ -1,5 +1,5 @@ #!./perl -# $Id: piconv,v 1.23 2002/04/19 05:36:43 dankogai Exp $ +# $Id: piconv,v 1.24 2002/04/22 02:45:50 dankogai Exp $ # use 5.7.3; use strict; @@ -86,42 +86,43 @@ piconv -- iconv(1), reinvented in perl =head1 DESCRIPTION -B is perl version of F, a character encoding converter -widely available for various Unixen today. This script was primarily -a technology demonstrator for Perl 5.8.0, you can use piconv in the -place of iconv for virtually any cases. +B is perl version of B, a character encoding converter +widely available for various Unixen today. This script was primarily +a technology demonstrator for Perl 5.8.0, but you can use piconv in the +place of iconv for virtually any case. -piconv converts character encoding of either STDIN or files specified -in the argument and prints out to STDOUT. 
+piconv converts the character encoding of either STDIN or files +specified in the argument and prints out to STDOUT. -Here are list of options. +Here is the list of options. =over 4 =item -f from_encoding -Specifies the encoding you are converting from. Unlike F, -this option can be omitted. In such cases the current locale is used. +Specifies the encoding you are converting from. Unlike B, +this option can be omitted. In such cases, the current locale is used. =item -t to_encoding -Specifies the encoding you are converting to. Unlike F, -this option can be omitted. In such cases the current locale is used. +Specifies the encoding you are converting to. Unlike B, +this option can be omitted. In such cases, the current locale is used. -Therefore when both -f and -t are omitted, F just acts like F. +Therefore, when both -f and -t are omitted, B just acts +like B. =item -s I -uses I instead of file for the source of text. Same as F. +uses I instead of file for the source of text. Same as B. =item -l Lists all available encodings, one per line, in case-insensitive -order. Note that only the canonical names are listed, many aliases +order. Note that only the canonical names are listed; many aliases exist. For example, the names are case-insensitive, and many standard -and common aliases work, like "latin1" for "ISO 8859-1", or "ibm850" +and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850" instead of "cp850", or "winlatin1" for "cp1252". See L -for the full discussion. +for a full discussion. =item -C I @@ -147,7 +148,7 @@ Invokes debugging mode. Primarily for Encode hackers. =item -S scheme Selects which scheme is to be used for conversion. Available schemes -are as follows; +are as follows: =over 4 @@ -166,7 +167,7 @@ The new perlIO layer is used. NI-S' favorite. =back -Like I<-D> option, this is also for Encode hackers. +Like the I<-D> option, this is also for Encode hackers. 
=back diff --git a/ext/Encode/bin/ucmlint b/ext/Encode/bin/ucmlint index a3fe6c8..99a74d0 100644 --- a/ext/Encode/bin/ucmlint +++ b/ext/Encode/bin/ucmlint @@ -1,10 +1,10 @@ #!/usr/local/bin/perl # -# $Id: ucmlint,v 0.1 2002/04/09 20:04:30 dankogai Exp $ +# $Id: ucmlint,v 0.2 2002/04/22 02:45:50 dankogai Exp $ # use strict; -our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Getopt::Std; our %Opt; @@ -177,7 +177,7 @@ sub encparse{ __END__ -UCM file looks like this. +A UCM file looks like this. # # Comments diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index 6a66dfd..420defe 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,5 +1,5 @@ package encoding; -our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.33 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use strict; @@ -11,8 +11,11 @@ BEGIN { } } -our $HAS_PERLIO = exists $INC{"PerlIO/encoding.pm"}; -$HAS_PERLIO or binmode(STDIN); +our $HAS_PERLIO = 0; +eval { require PerlIO::encoding }; +unless ($@){ + $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02); +} sub import { my $class = shift; @@ -34,10 +37,13 @@ sub import { require Carp; Carp::croak "Unknown encoding for $h, '$arg{$h}'"; } - eval qq{ binmode($h, ":encoding($arg{$h})") }; + eval { binmode($h, ":encoding($arg{$h})") }; }else{ unless (exists $arg{$h}){ - eval qq{ binmode($h, ":encoding($name)") }; + eval { + no warnings 'uninitialized'; + binmode($h, ":encoding($name)"); + }; } } if ($@){ @@ -83,7 +89,7 @@ __END__ =head1 NAME -encoding - allows you to write your script in non-ascii or non-utf8 +encoding - allows you to write your script in non-ascii or non-utf8 =head1 SYNOPSIS @@ -93,12 +99,12 @@ encoding - allows you to write your script in non-ascii or non-utf8 # or you can even do this 
if your shell supports your native encoding perl -Mencoding=latin2 -e '...' # Feeling centrally European? - perl -Mencoding=euc-ko -e '...' + perl -Mencoding=euc-kr -e '...' # Or Korean? # or from the shebang line #!/your/path/to/perl -Mencoding="8859-6" # Arabian Nights - #!/your/path/to/perl -Mencoding=euc-tw + #!/your/path/to/perl -Mencoding=big5 # Taiwanese # more control @@ -118,14 +124,14 @@ encoding - allows you to write your script in non-ascii or non-utf8 Let's start with a bit of history: Perl 5.6.0 introduced Unicode support. You could apply C and regexes even to complex CJK characters -- so long as the script was written in UTF-8. But back -then text editors that supported UTF-8 were still rare and many users -rather chose to write scripts in legacy encodings, given up whole new -feature of Perl 5.6. +then, text editors that supported UTF-8 were still rare and many users +instead chose to write scripts in legacy encodings, giving up a whole +new feature of Perl 5.6. -Rewind to the future: starting from perl 5.8.0 with B +Rewind to the future: starting from perl 5.8.0 with the B pragma, you can write your script in any encoding you like (so long as the C module supports it) and still enjoy Unicode support. -You can write a code in EUC-JP as follows: +You can write code in EUC-JP as follows: my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji #<-char-><-char-> # 4 octets @@ -149,7 +155,7 @@ STDIN, STDOUT, and STDERR to the specified encoding. Therefore, Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", not "\x{99F1}\x{99DD} is the symbol of perl.\n". -You can override this by giving extra arguments, see below. +You can override this by giving extra arguments; see below. =head1 USAGE @@ -157,9 +163,9 @@ You can override this by giving extra arguments, see below. =item use encoding [I] ; -Sets the script encoding to I and filehandle disciplines of -STDIN, STDOUT are set to ":encoding(I)". Note STDERR will -not be changed. +Sets the script encoding to I. 
Filehandle disciplines of +STDIN and STDOUT are set to ":encoding(I)". Note that STDERR +will not be changed. If no encoding is specified, the environment variable L is consulted. If no encoding can be found, the error C to change disciplines of those. =item use encoding I [ STDIN =E I ...] ; -You can also individually set encodings of STDIN and STDOUT via +You can also individually set encodings of STDIN and STDOUT via the C<< STDIN => I >> form. In this case, you cannot omit the first I. C<< STDIN => undef >> turns the IO transcoding completely off. =item no encoding; -Unsets the script encoding and the disciplines of STDIN, STDOUT are +Unsets the script encoding. The disciplines of STDIN, STDOUT are reset to ":raw" (the default unprocessed raw stream of bytes). =back @@ -188,7 +194,7 @@ reset to ":raw" (the default unprocessed raw stream of bytes). The pragma is a per script, not a per block lexical. Only the last C or C matters, and it affects B. -However, pragma is supported and C can +However, the pragma is supported and C can appear as many times as you want in a given script. The multiple use of this pragma is discouraged. @@ -221,8 +227,9 @@ the C pragma is present, even the 0x80..0xFF range always gets UTF-8 encoded. After all, the best thing about this pragma is that you don't have to -resort to \x... just to spell your name in native a encoding. So feel -free to put your strings in your encoding in quotes and regexes. +resort to \x{....} just to spell your name in a native encoding. +So feel free to put your strings in your encoding in quotes and +regexes. =head1 Non-ASCII Identifiers and Filter option @@ -231,25 +238,25 @@ identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human is a single Han ideograph) work, you still need to write your script in UTF-8 or use a source filter. -In other words, the same restriction as Jperl applies. +In other words, the same restriction as with Jperl applies. 
-If you dare to experiment, however, you can try Filter option. +If you dare to experiment, however, you can try the Filter option. =over 4 =item use encoding I Filter=E1; -This turns encoding pragma into source filter. While the default +This turns the encoding pragma into a source filter. While the default approach just decodes interpolated literals (in qq() and qr()), this -will apply source filter to entire source code. In this case, STDIN -and STDOUT remain untouched. +will apply a source filter to the entire source code. In this case, +STDIN and STDOUT remain untouched. =back What does this mean? Your source code behaves as if it is written in -UTF-8. So even if your editor only supports Shift_JIS, for example. -You can still try examples in Chapter 15 of C For instance, you can use UTF-8 identifiers. +UTF-8. So even if your editor only supports Shift_JIS, for example, +you can still try examples in Chapter 15 of C. For instance, you can use UTF-8 identifiers. This option is significantly slower and (as of this writing) non-ASCII identifiers are not very stable WITHOUT this option and with the @@ -262,7 +269,7 @@ do not use Filter=E1. use encoding "iso 8859-7"; - # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. + # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. $a = "\xDF"; $b = "\x{100}"; @@ -287,18 +294,19 @@ do not use Filter=E1. print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; # ... but pack/unpack C are not affected, in case you still - # want back to your native encoding + # want to go back to your native encoding print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; =head1 KNOWN PROBLEMS -For native multibyte encodings (either fixed or variable length) +For native multibyte encodings (either fixed or variable length), the current implementation of the regular expressions may introduce -recoding errors for longer regular expression literals than 127 bytes. +recoding errors for regular expression literals longer than 127 bytes. 
The encoding pragma is not supported on EBCDIC platforms. -(Porters wanted.) +(Porters who are willing and able to remove this limitation are +welcome.) =head1 SEE ALSO diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index 2915c42..2439822 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -1,7 +1,7 @@ package Encode::Alias; use strict; use Encode; -our $VERSION = do { my @r = (q$Revision: 1.28 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.29 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require Exporter; @@ -269,10 +269,10 @@ Currently I can be specified in the following ways: define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); -In this case if I is not a reference it is C-ed to -allow C<$1> etc. to be substituted. The example is one way to alias -names as used in X11 fonts to the MIME names for the iso-8859-* -family. Note the double quote inside the single quote. +In this case, if I is not a reference, it is C-ed +in order to allow C<$1> etc. to be substituted. The example is one +way to alias names as used in X11 fonts to the MIME names for the +iso-8859-* family. Note the double quotes inside the single quotes. If you are using a regex here, you have to use the quotes as shown or it won't work. Also note that regex handling is tricky even for the @@ -282,27 +282,27 @@ experienced. Use it with caution. define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } , ''); - -In this case C<$_> will be set to the name that is being looked up and +In this case, C<$_> will be set to the name that is being looked up and I is passed to the sub as its first argument. The example is another way to alias names as used in X11 fonts to the MIME names for the iso-8859-* family. =back -=head2 Alias overloading +=head2 Alias overloading You can override predefined aliases by simply applying define_alias(). 
-New alias is always evaluated first and when neccessary define_alias() -flushes internal cache to make new definition available. +The new alias is always evaluated first, and when necessary, +define_alias() flushes the internal cache to make the new definition +available. - # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a + # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a # superset of SHIFT_JIS define_alias( qr/shift.*jis$/i => '"cp932"' ); define_alias( qr/sjis$/i => '"cp932"' ); -If you want to zap all predefined aliases, you can +If you want to zap all predefined aliases, you can use Encode::Alias->undef_aliases; @@ -310,8 +310,7 @@ to do so. And Encode::Alias->init_aliases; -gets factory setting back. - +gets the factory settings back. =head1 SEE ALSO diff --git a/ext/Encode/lib/Encode/CN/HZ.pm b/ext/Encode/lib/Encode/CN/HZ.pm index c599928..56a8acd 100644 --- a/ext/Encode/lib/Encode/CN/HZ.pm +++ b/ext/Encode/lib/Encode/CN/HZ.pm @@ -3,19 +3,27 @@ package Encode::CN::HZ; use strict; use vars qw($VERSION); -$VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +$VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode (); use Encode::CN; use base 'Encode::Encoding'; -# HZ is but escaped GB, so we implement it with the -# GB2312(raw) encoding here. Cf. RFC 1842 & 1843. +# HZ is only escaped GB, so we implement it with the +# GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
my $canon = 'hz'; my $obj = bless {name => $canon}, __PACKAGE__; $obj->Define($canon); +sub needs_lines { 1 } + +sub perlio_ok { + # exists $INC{"PerlIO/encoding.pm"} or return 0; + # PerlIO::encoding->VERSION >= 0.03 and return 1; + return 0; # for the time being +} + sub decode { my ($obj,$str,$chk) = @_; diff --git a/ext/Encode/lib/Encode/Config.pm b/ext/Encode/lib/Encode/Config.pm index bb58291..dcbc524 100644 --- a/ext/Encode/lib/Encode/Config.pm +++ b/ext/Encode/lib/Encode/Config.pm @@ -2,14 +2,14 @@ # Demand-load module list # package Encode::Config; -our $VERSION = do { my @r = (q$Revision: 1.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use strict; our %ExtModule = ( # Encode::Byte - #iso-8859-1 is on Encode.pm itself + #iso-8859-1 is in Encode.pm itself 'iso-8859-2' => 'Encode::Byte', 'iso-8859-3' => 'Encode::Byte', 'iso-8859-4' => 'Encode::Byte', diff --git a/ext/Encode/lib/Encode/Encoder.pm b/ext/Encode/lib/Encode/Encoder.pm index 793dacf..6536ed2 100644 --- a/ext/Encode/lib/Encode/Encoder.pm +++ b/ext/Encode/lib/Encode/Encoder.pm @@ -1,10 +1,10 @@ # -# $Id: Encoder.pm,v 0.4 2002/04/12 20:23:05 dankogai Exp $ +# $Id: Encoder.pm,v 0.5 2002/04/22 02:45:50 dankogai Exp $ # package Encode::Encoder; use strict; use warnings; -our $VERSION = do { my @r = (q$Revision: 0.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 0.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; require Exporter; our @ISA = qw(Exporter); @@ -119,10 +119,10 @@ Encode::Encoder -- Object Oriented Encoder =head1 ABSTRACT -B allows you to use Encode via OOP style. This is -not only more intuitive than functional approach, but also handier -when you want to stack encodings. Suppose you want your UTF-8 string -converted to Latin1 then Base64, you can simply say +B allows you to use Encode in an object-oriented +style. 
This is not only more intuitive than a functional approach, +but also handier when you want to stack encodings. Suppose you want +your UTF-8 string converted to Latin1 then Base64: you can simply say my $base64 = encoder($utf8)->latin1->base64; @@ -131,7 +131,7 @@ instead of my $latin1 = encode("latin1", $utf8); my $base64 = encode_base64($utf8); -or lazier and convolted +or the lazier and more convoluted my $base64 = encode_base64(encode("latin1", $utf8)); @@ -143,14 +143,14 @@ Here is how to use this module. =item * -There are at least two instance variable stored in hash reference, +There are at least two instance variables stored in a hash reference, {data} and {encoding}. =item * -When there is no method, it takes the method name as the name of -encoding and encode instance I with I. If successful, -instance I is set accordingly. +When there is no method, it takes the method name as the name of the +encoding and encodes the instance I with I. If successful, +the instance I is set accordingly. =item * @@ -161,14 +161,14 @@ because the stringify operator ("") is overridden to do exactly that. =head2 Predefined Methods -This module predefines the methods below; +This module predefines the methods below: =over 4 =item $e = Encode::Encoder-Enew([$data, $encoding]); returns an encoder object. Its data is initialized with $data if -there, and its encoding is set to $encoding if there. +present, and its encoding is set to $encoding if present. When $encoding is omitted, it defaults to utf8 if $data is already in utf8 or "" (empty string) otherwise. @@ -179,20 +179,20 @@ is an alias of Encode::Encoder-Enew(). This one is exported on demand. =item $e-Edata([$data]) -when $data is present, sets instance data to $data and returns the -object itself. otherwise the current instance data is returned. +When $data is present, sets the instance data to $data and returns the +object itself. Otherwise, the current instance data is returned. 
=item $e-Eencoding([$encoding]) -when $encoding is present, sets instance encoding to $encoding and -returns the object itself. otherwise the current instance encoding is +When $encoding is present, sets the instance encoding to $encoding and +returns the object itself. Otherwise, the current instance encoding is returned. =item $e-Ebytes([$encoding]) -decodes instance data from $encoding, or instance encoding if omitted. -when the conversion is successful, the enstance encoding will be set -to "" . +decodes instance data from $encoding, or the instance encoding if +omitted. If the conversion is successful, the instance encoding +will be set to "". The name I was deliberately picked to avoid namespace tainting -- this module may be used as a base class so method names that appear @@ -202,9 +202,9 @@ in Encode::Encoding are avoided. =head2 Example: base64 transcoder -This module is desined to work with L. -To make the Base64 transcorder example above really work, you should -write a module like this. +This module is designed to work with L. +To make the Base64 transcoder example above really work, you could +write a module like this: package Encode::Base64; use base 'Encode::Encoding'; @@ -221,7 +221,7 @@ write a module like this. 1; __END__ -And your caller module should be like this; +And your caller module would be something like this: use Encode::Encoder; use Encode::Base64; @@ -231,19 +231,19 @@ And your caller module should be like this; encoder($data)->iso_8859_1->base64; encoder($base64)->bytes('base64')->latin1; -=head2 operator overloading +=head2 Operator Overloading This module overloads two operators, stringify ("") and numify (0+). -Stringify dumps the data therein. +Stringify dumps the data inside the object. -Numify returns the number of bytes therein. +Numify returns the number of bytes in the instance data. They come in handy when you want to print or find the size of data. 
=head1 SEE ALSO -L +L, L =cut diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm index caabc01..16a950a 100644 --- a/ext/Encode/lib/Encode/Encoding.pm +++ b/ext/Encode/lib/Encode/Encoding.pm @@ -1,7 +1,7 @@ package Encode::Encoding; # Base class for classes which implement encodings use strict; -our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.27 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; sub Define { @@ -20,6 +20,8 @@ sub fromUnicode { shift->encode(@_) } sub new_sequence { return $_[0] } +sub perlio_ok { 0 } + sub needs_lines { 0 } sub DESTROY {} @@ -50,7 +52,7 @@ when C has scanned C<@INC> for loadable encodings but has not actually loaded the encoding in question. This is because the current "loading" process is all Perl and a bit slow. -Once an encoding is loaded then value of the hash is object which +Once an encoding is loaded, the value of the hash is the object which implements the encoding. The object should provide the following interface: @@ -58,107 +60,153 @@ interface: =item -Ename -Should return the string representing the canonical name of the encoding. +MUST return the string representing the canonical name of the encoding. =item -Enew_sequence This is a placeholder for encodings with state. It should return an -object which implements this interface, all current implementations +object which implements this interface. All current implementations return the original object. =item -Eencode($string,$check) -Should return the octet sequence representing I<$string>. If I<$check> -is true it should modify I<$string> in place to remove the converted -part (i.e. the whole string unless there is an error). If an error -occurs it should return the octet sequence for the fragment of string -that has been converted, and modify $string in-place to remove the -converted part leaving it starting with the problem fragment. 
+MUST return the octet sequence representing I<$string>. + +=over 2 + +=item * + +If I<$check> is true, it SHOULD modify I<$string> in place to remove +the converted part (i.e. the whole string unless there is an error). +If perlio_ok() is true, SHOULD becomes MUST. + +=item * + +If an error occurs, it SHOULD return the octet sequence for the +fragment of string that has been converted and modify $string in-place +to remove the converted part leaving it starting with the problem +fragment. If perlio_ok() is true, SHOULD becomes MUST. + +=item * -If check is is false then C should make a "best effort" to -convert the string - for example by using a replacement character. +If I<$check> is false then C MUST make a "best effort" to +convert the string - for example, by using a replacement character. + +=back =item -Edecode($octets,$check) -Should return the string that I<$octets> represents. If I<$check> is -true it should modify I<$octets> in place to remove the converted part -(i.e. the whole sequence unless there is an error). If an error -occurs it should return the fragment of string that has been -converted, and modify $octets in-place to remove the converted part -leaving it starting with the problem fragment. +MUST return the string that I<$octets> represents. + +=over 2 + +=item * + +If I<$check> is true, it SHOULD modify I<$octets> in place to remove +the converted part (i.e. the whole sequence unless there is an +error). If perlio_ok() is true, SHOULD becomes MUST. + +=item * -If check is is false then C should make a "best effort" to +If an error occurs, it SHOULD return the fragment of string that has +been converted and modify $octets in-place to remove the converted +part leaving it starting with the problem fragment. If perlio_ok() is +true, SHOULD becomes MUST. + +=item * + +If I<$check> is false then C should make a "best effort" to convert the string - for example by using Unicode's "\x{FFFD}" as a replacement character.
=back -It should be noted that the check behaviour is different from the +=item -Eperlio_ok() + +If you want your encoding to work with PerlIO, you MUST define this +method so that it returns 1 when PerlIO is enabled. Here is an +example; + + sub perlio_ok { exists $INC{"PerlIO/encoding.pm"} } + +By default, this method is defined as follows; + + sub perlio_ok { 0 } + +=item -Eneeds_lines() + +If your encoding can work with PerlIO but needs line buffering, you +MUST define this method so it returns true. 7bit ISO-2022 encodings +are one example that needs this. When this method is missing, false +is assumed. + +=back + +It should be noted that the I<$check> behaviour is different from the outer public API. The logic is that the "unchecked" case is useful -when encoding is part of a stream which may be reporting errors -(e.g. STDERR). In such cases it is desirable to get everything +when the encoding is part of a stream which may be reporting errors +(e.g. STDERR). In such cases, it is desirable to get everything through somehow without causing additional errors which obscure the -original one. Also the encoding is best placed to know what the +original one. Also, the encoding is best placed to know what the correct replacement character is, so if that is the desired behaviour then letting low level code do it is the most efficient. -In contrast if check is true, the scheme above allows the encoding to -do as much as it can and tell layer above how much that was. What is -lacking at present is a mechanism to report what went wrong. The most -likely interface will be an additional method call to the object, or -perhaps (to avoid forcing per-stream objects on otherwise stateless -encodings) and additional parameter. +By contrast, if I<$check> is true, the scheme above allows the +encoding to do as much as it can and tell the layer above how much +that was. What is lacking at present is a mechanism to report what +went wrong. 
The most likely interface will be an additional method +call to the object, or perhaps (to avoid forcing per-stream objects +on otherwise stateless encodings) an additional parameter. It is also highly desirable that encoding classes inherit from C as a base class. This allows that class to define -additional behaviour for all encoding objects. For example built in -Unicode, UCS-2 and UTF-8 classes use : +additional behaviour for all encoding objects. For example, built-in +Unicode, UCS-2, and UTF-8 classes use package Encode::MyEncoding; use base qw(Encode::Encoding); __PACKAGE__->Define(qw(myCanonical myAlias)); -To create an object with bless {Name => ...},$class, and call +to create an object with C<< bless {Name => ...}, $class >>, and call define_encoding. They inherit their C method from C. =head2 Compiled Encodings -For the sake of speed and efficiency, Most of the encodings are now -supported via I that are XS modules generated from UCM -files. Encode provides enc2xs tool to achieve that. Please see +For the sake of speed and efficiency, most of the encodings are now +supported via a I: XS modules generated from UCM +files. Encode provides the enc2xs tool to achieve that. Please see L for more details. =head1 SEE ALSO L, L -=for future - +=begin future =over 4 =item Scheme 1 -Passed remaining fragment of string being processed. -Modifies it in place to remove bytes/characters it can understand -and returns a string used to represent them. -e.g. +The fixup routine gets passed the remaining fragment of string being +processed. It modifies it in place to remove bytes/characters it can +understand and returns a string used to represent them. For example: sub fixup { my $ch = substr($_[0],0,1,''); return sprintf("\x{%02X}",ord($ch); } -This scheme is close to how underlying C code for Encode works, but gives -the fixup routine very little context. 
+This scheme is close to how the underlying C code for Encode works, +but gives the fixup routine very little context. =item Scheme 2 -Passed original string, and an index into it of the problem area, and -output string so far. Appends what it will to output string and -returns new index into original string. For example: +The fixup routine gets passed the original string, an index into +it of the problem area, and the output string so far. It appends +what it wants to the output string and returns a new index into the +original string. For example: sub fixup { # my ($s,$i,$d) = @_; @@ -168,12 +216,12 @@ returns new index into original string. For example: } This scheme gives maximal control to the fixup routine but is more -complicated to code, and may need internals of Encode to be tweaked to -keep original string intact. +complicated to code, and may require that the internals of Encode be tweaked to +keep the original string intact. =item Other Schemes -Hybrids of above. +Hybrids of the above. Multiple return values rather than in-place modifications. @@ -181,4 +229,6 @@ Index into the string could be C allowing C. 
=back +=end future + =cut diff --git a/ext/Encode/lib/Encode/JP/H2Z.pm b/ext/Encode/lib/Encode/JP/H2Z.pm index 3a8ce4d..9947434 100644 --- a/ext/Encode/lib/Encode/JP/H2Z.pm +++ b/ext/Encode/lib/Encode/JP/H2Z.pm @@ -1,13 +1,13 @@ # -# $Id: H2Z.pm,v 1.0 2002/03/28 23:26:28 dankogai Exp $ +# $Id: H2Z.pm,v 1.1 2002/04/22 03:43:05 dankogai Exp $ # package Encode::JP::H2Z; use strict; -our $RCSID = q$Id: H2Z.pm,v 1.0 2002/03/28 23:26:28 dankogai Exp $; -our $VERSION = do { my @r = (q$Revision: 1.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $RCSID = q$Id: H2Z.pm,v 1.1 2002/04/22 03:43:05 dankogai Exp $; +our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Carp; @@ -125,6 +125,7 @@ use vars qw(%_D2Z $_PAT_D2Z #$_PAT_Z2D = join("|", keys %_Z2D); sub h2z { + no warnings qw(uninitialized); my $r_str = shift; my ($keep_dakuten) = @_; my $n = 0; diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm index d058cdb..e38747e 100644 --- a/ext/Encode/lib/Encode/JP/JIS7.pm +++ b/ext/Encode/lib/Encode/JP/JIS7.pm @@ -1,7 +1,7 @@ package Encode::JP::JIS7; use strict; -our $VERSION = do { my @r = (q$Revision: 1.3 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -23,6 +23,12 @@ sub new_sequence { $_[0] } sub needs_lines { 1 } +sub perlio_ok { + exists $INC{"PerlIO/encoding.pm"} or return 0; + PerlIO::encoding->VERSION >= 0.03 and return 1; + return 0; +} + use Encode::CJKConstants qw(:all); our $DEBUG = 0; @@ -37,7 +43,6 @@ sub decode($$;$) my $residue = jis_euc(\$str); # This is for PerlIO $_[1] = $residue if $chk; - # use perlqq fallback for euc-jp -> utf8 return Encode::decode('euc-jp', $str, FB_PERLQQ); } @@ -85,6 +90,7 @@ sub jis_euc { } sub euc_jis{ + no warnings qw(uninitialized); my $r_str = shift; my $jis0212 = shift; $$r_str =~ s{ diff --git a/ext/Encode/lib/Encode/KR/2022_KR.pm 
b/ext/Encode/lib/Encode/KR/2022_KR.pm index 6e34f8b..b6a65b7 100644 --- a/ext/Encode/lib/Encode/KR/2022_KR.pm +++ b/ext/Encode/lib/Encode/KR/2022_KR.pm @@ -1,10 +1,10 @@ package Encode::KR::2022_KR; -use Encode::KR; +use Encode qw(:fallbacks); use base 'Encode::Encoding'; use strict; -our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.3 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; my $canon = 'iso-2022-kr'; @@ -15,20 +15,30 @@ sub name { return $_[0]->{name}; } sub needs_lines { 1 } +sub perlio_ok { + #exists $INC{"PerlIO/encoding.pm"} or return 0; + #PerlIO::encoding->VERSION >= 0.03 and return 1; + return 0; # for the time being +} + sub decode { - my ($obj,$str,$chk) = @_; + my ($obj, $str, $chk) = @_; my $res = $str; - iso_euc(\$res); - return Encode::decode('euc-kr', $res, $chk); + my $residue = iso_euc(\$res); + # This is for PerlIO + $_[1] = $residue if $chk; + return Encode::decode('euc-kr', $res, FB_PERLQQ); } sub encode { - my ($obj,$str,$chk) = @_; - my $res = Encode::encode('euc-kr', $str, $chk); - euc_iso(\$res); - return $res; + my ($obj, $utf8, $chk) = @_; + # empty the input string in the stack so perlio is ok + $_[1] = '' if $chk; + my $octet = Encode::encode('euc-kr', $utf8, FB_PERLQQ) ; + euc_iso(\$octet); + return $octet; } use Encode::CJKConstants qw(:all); @@ -38,29 +48,31 @@ use Encode::CJKConstants qw(:all); sub iso_euc{ my $r_str = shift; $$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator - $$r_str =~ s{ # replace chars. in GL - \x0e # between SO(\x0e) and SI(\x0f) - ([^\x0f]*) # with chars.
in GR + $$r_str =~ s{ # replace characters in GL + \x0e # between SO(\x0e) and SI(\x0f) + ([^\x0f]*) # with characters in GR \x0f - } + } { - my $out= $1; + my $out= $1; $out =~ tr/\x21-\x7e/\xa1-\xfe/; $out; }geox; - $$r_str; + my ($residue) = ($$r_str =~ s/(\e.*)$//so); + return $residue; } sub euc_iso{ + no warnings qw(uninitialized); my $r_str = shift; substr($$r_str,0,0)=$ESC{'2022_KR'}; # put the designator at the beg. - $$r_str =~ s{ # move KS X 1001 chars. in GR to GL - ($RE{EUC_C}+) # and enclose them with SO and SI - }{ - my $str = $1; - $str =~ tr/\xA1-\xFE/\x21-\x7E/; - "\x0e" . $str . "\x0f"; - }geox; + $$r_str =~ s{ # move KS X 1001 characters in GR to GL + ($RE{EUC_C}+) # and enclose them with SO and SI + }{ + my $str = $1; + $str =~ tr/\xA1-\xFE/\x21-\x7E/; + "\x0e" . $str . "\x0f"; + }geox; $$r_str; } diff --git a/ext/Encode/lib/Encode/PerlIO.pod b/ext/Encode/lib/Encode/PerlIO.pod index c076b27..e433ea5 100644 --- a/ext/Encode/lib/Encode/PerlIO.pod +++ b/ext/Encode/lib/Encode/PerlIO.pod @@ -7,7 +7,7 @@ Encode::PerlIO -- a detailed document on Encode and PerlIO It is very common to want to do encoding transformations when reading or writing files, network connections, pipes etc. If Perl is configured to use the new 'perlio' IO system then -C provides a "layer" (See L) which can transform +C provides a "layer" (see L) which can transform data as it is read or written. 
Here is how the blind poet would modernise the encoding: @@ -20,8 +20,8 @@ Here is how the blind poet would modernise the encoding: close($utf8); close($illiad); -In addition the new IO system can also be configured to read/write -UTF-8 encoded characters (as noted above this is efficient): +In addition, the new IO system can also be configured to read/write +UTF-8 encoded characters (as noted above, this is efficient): open(my $fh,'>:utf8','anything'); print $fh "Any \x{0021} string \N{SMILEY FACE}\n"; @@ -29,25 +29,25 @@ UTF-8 encoded characters (as noted above this is efficient): Either of the above forms of "layer" specifications can be made the default for a lexical scope with the C pragma. See L. -Once a handle is open is layers can be altered using C. +Once a handle is open, its layers can be altered using C. -Without any such configuration, or if Perl itself is built using -system's own IO, then write operations assume that file handle accepts -only I and will C if a character larger than 255 is -written to the handle. When reading, each octet from the handle -becomes a byte-in-a-character. Note that this default is the same -behaviour as bytes-only languages (including Perl before v5.6) would -have, and is sufficient to handle native 8-bit encodings -e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling -other encodings and binary data. +Without any such configuration, or if Perl itself is built using the +system's own IO, then write operations assume that the file handle +accepts only I and will C if a character larger than 255 is +written to the handle. When reading, each octet from the handle becomes +a byte-in-a-character. Note that this default is the same behaviour +as bytes-only languages (including Perl before v5.6) would have, +and is sufficient to handle native 8-bit encodings e.g. iso-8859-1, +EBCDIC etc. and any legacy mechanisms for handling other encodings +and binary data. 
-In other cases it is the programs responsibility to transform +In other cases, it is the program's responsibility to transform characters into bytes using the API above before doing writes, and to transform the bytes read from a handle into characters before doing "character operations" (e.g. C, C, ...). You can also use PerlIO to convert larger amounts of data you don't -want to bring into memory. For example to convert between ISO-8859-1 +want to bring into memory. For example, to convert between ISO-8859-1 (Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines): open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!; @@ -71,21 +71,22 @@ data in your script. Here is a crude diagram of how filehandle, PerlIO, and Encode interact. - filehandle <-> PerlIO PerlIO <-> scalar (read/printed) - \ / + filehandle <-> PerlIO PerlIO <-> scalar (read/printed) + \ / Encode -When PerlIO receives data from either direction, it fills in the buffer -(currently with 1024 bytes) and pass the buffer to Encode. Encode tries -to convert the valid part and pass it back to PerlIO, leaving invalid -parts (usually partial character) in buffer. PerlIO then appends more -data in buffer, call Encode, and so on until the data stream ends. +When PerlIO receives data from either direction, it fills a buffer +(currently with 1024 bytes) and passes the buffer to Encode. +Encode tries to convert the valid part and passes it back to PerlIO, +leaving invalid parts (usually a partial character) in the buffer. +PerlIO then appends more data to the buffer, calls Encode again, +and so on until the data stream ends. To do so, PerlIO always calls (de|en)code methods with CHECK set to 1. -this ensures that the method stops at the right place when it +This ensures that the method stops at the right place when it encounters partial character. The following is what happens when PerlIO and Encode tries to encode (from utf8) more than 1024 bytes -long and the buffer boundary happens to be between a character. 
+and the buffer boundary happens to be in the middle of a character. A B C .... ~ \x{3000} .... 41 42 43 .... 7E e3 80 80 .... @@ -97,13 +98,13 @@ Encode converts from the beginning to \x7E, leaving \xe3 in the buffer because it is invalid (partial character). Unfortunately, this scheme does not work well with escape-based -encoding such as ISO-2022-JP. Let's see what happens in that case +encodings such as ISO-2022-JP. Let's see what happens in that case in the next chapter. =head1 BUGS -Now let's see what happens when you try to decode form ISO-2022-JP and -the buffer cuts in the middle of a character +Now let's see what happens when you try to decode from ISO-2022-JP and +the buffer ends in the middle of a character. JIS208-ESC \x{5f3e} A B C .... ~ \e $ B |DAN | .... @@ -114,16 +115,16 @@ the buffer cuts in the middle of a character As you see, the next buffer begins with \x43. But \x43 is 'C' in ASCII, which is wrong in this case because we are now in JISX 0208 area so it has to convert \x43\x46, not \x43. Unlike utf8 and EUC, -in escape-based encoding you can't tell if it a given octed is a whole +in escape-based encodings you can't tell if a given octet is a whole character or just part of it. There are actually several ways to solve this problem but none of -which is fast enough to be practical. From Encode's point of view -the easiest solution is for PerlIO to implement line buffer instead -of fixed-length buffer but that makes PerlIO really complicated. +them is fast enough to be practical. From Encode's point of view, +the easiest solution is for PerlIO to implement a line buffer instead +of a fixed-length buffer, but that makes PerlIO really complicated. -So for the time being, using escape-based encodings in ":encoding()" -layer of PerlIO does not work well. +So for the time being, using escape-based encodings in the +":encoding()" layer of PerlIO does not work well. =head2 Workaround @@ -137,12 +138,12 @@ the buffer never gets full. 
Here is an example. print $l; } -=head2 How can you tell my encoding fully supports PerlIO ? +=head2 How can I tell whether my encoding fully supports PerlIO ? -As of this writing, Any encoding which class belongs to Encode::XS and -Encode::Unicode works. Encode module has C method so you -can use it before appling PerlIO encoding to the filehandle. Here is -an example; +As of this writing, any encoding whose class belongs to Encode::XS and +Encode::Unicode works. The Encode module has a C method +which you can use before applying PerlIO encoding to the filehandle. +Here is an example: my $use_perlio = perlio_ok($enc); my $layer = $use_perlio ? "<:raw" : "<:encoding($enc)"; @@ -164,6 +165,5 @@ L, L, the Perl Unicode Mailing List Eperl-unicode@perl.orgE - =cut diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index 5a24f0f..806f85b 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -1,18 +1,18 @@ =head1 NAME -Encode::Supported -- Supported encodings by Encode +Encode::Supported -- Encodings supported by Encode =head1 DESCRIPTION =head2 Encoding Names Encoding names are case insensitive. White space in names -is ignored. In addition an encoding may have aliases. +is ignored. In addition, an encoding may have aliases. Each encoding has one "canonical" name. The "canonical" name is chosen from the names of the encoding by picking the first in the following sequence (with a few exceptions). -=over +=over 4 =item * @@ -22,7 +22,7 @@ frequently used words like 'utf8' don't need to do alias lookups. =item * -The MIME name as defined in IETF RFCs This includes all "iso-"'s. +The MIME name as defined in IETF RFCs. This includes all "iso-"s. =item * @@ -68,7 +68,7 @@ The following encodings are always available. =head2 Encode::Unicode -- other Unicode encodings Unicode coding schemes other than native utf8 are supported by -Encode::Unicode which will be autoloaded on demand.
+Encode::Unicode, which will be autoloaded on demand. ---------------------------------------------------------------- UCS-2BE UCS-2, iso-10646-1 [IANA, UC] @@ -81,31 +81,31 @@ Encode::Unicode which will be autoloaded on demand. UTF-32LE [UC] ---------------------------------------------------------------- -To find how those (UCS-2|UTF-(16|32))(LE|BE)? differ to one another, +To find how (UCS-2|UTF-(16|32))(LE|BE)? differ from one another, see L. =head2 Encode::Byte -- Extended ASCII -Encode::Byte implements most of single-byte encodings except for -Symbols and EBCDIC. The following encodings are based single-byte -encoding implemented as extended ASCII. For most cases it uses -\x80-\xff (upper half) to map non-ASCII characters. +Encode::Byte implements most single-byte encodings except for +Symbols and EBCDIC. The following encodings are based on single-byte +encodings implemented as extended ASCII. Most of them map +\x80-\xff (upper half) to non-ASCII characters. -=over 2 +=over 4 =item ISO-8859 and corresponding vendor mappings Since there are so many, they are presented in table format with -languages and corresponding encoding names by vendors. Note the table -is sorted in order of ISO-8859 and the corresponding vendor mappings -are slightly different from that of ISO. See +languages and corresponding encoding names by vendors. Note that +the table is sorted in order of ISO-8859 and the corresponding vendor +mappings are slightly different from that of ISO. See L for details. Lang/Regions ISO/Other Std. DOS Windows Macintosh Others ---------------------------------------------------------------- N. America (ASCII) cp437 AdobeStandardEncoding cp863 (DOSCanadaF) - W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep + W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep hp-roman8 cp860 (DOSPortuguese) Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman @@ -115,7 +115,7 @@ L for details. 
Latin3 [1] iso-8859-3 Latin4 [2] iso-8859-4 Cyrillics iso-8859-5 cp855 cp1251 MacCyrillic - (Also see next section) cp866 MacUkrainian + (See also next section) cp866 MacUkrainian Arabic iso-8859-6 cp864 cp1256 MacArabic cp1006 MacFarsi Greek iso-8859-7 cp737 cp1253 MacGreek @@ -134,11 +134,11 @@ L for details. Vietnamese viscii cp1258 MacVietnamese ---------------------------------------------------------------- - [1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-5. - [2] Baltics. Now on 8859-10. + [1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-9. + [2] Baltics. Now on 8859-10, except for Latvian. [3] Also know as TIS 620. - [4] Nicknamed Latin0; Euro sign as well as French and Finnish - letters that are missing from 8859-1 are added. + [4] Nicknamed Latin0; the Euro sign as well as French and Finnish + letters that are missing from 8859-1 were added. All cp* are also available as ibm-*, ms-*, and windows-* . See also L. @@ -146,12 +146,12 @@ L. Macintosh encodings don't seem to be registered in such entities as IANA. "Canonical" names in Encode are based upon Apple's Tech Note 1150. See L -for details +for details. -=item KOI8 - De Facto Standard for Cyrillic world +=item KOI8 - De Facto Standard for the Cyrillic world -Though ISO-8859 does have ISO-8859, KOI8 series is far more popular -in the Net. L comes with the following KOI charsets. +Though ISO-8859 does have ISO-8859-5, the KOI8 series is far more +popular in the Net. L comes with the following KOI charsets. For gory details, see L ---------------------------------------------------------------- @@ -165,17 +165,18 @@ For gory details, see L GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII, control character ranges and other parts are mapped very differently, presumably to store Greek and Cyrillic alphabets. -This is also covered in Encode::Byte even though it does not -comply to extended ASCII. 
+This is also covered in Encode::Byte even though it is not an +"extended ASCII" encoding. =back -=head2 The CJK: Chinese, Japanese, Korean (Multibyte) +=head2 CJK: Chinese, Japanese, Korean (Multibyte) Note that Vietnamese is listed above. Also read "Encoding vs Charset" -below. Also note these are implemented in distinct module by -languages, due the the size concerns. Please refer to their -respective document pages. +below. Also note that these are implemented in distinct modules by +countries, due to size concerns (simplified Chinese is mapped +to 'CN', continental China, while traditional Chinese is mapped to +'TW', Taiwan). Please refer to their respective documentation pages. =over 4 @@ -191,8 +192,8 @@ respective document pages. iso-ir-165 ---------------------------------------------------------------- - [1] GB2312 is aliased to this. see L - [2] gbk is aliased to this. see L + [1] GB2312 is aliased to this. See L + [2] gbk is aliased to this. See L =item Encode::JP -- Japan @@ -278,27 +279,27 @@ For symbols and dingbats. =head1 Unsupported encodings -The following are not supported as yet. Some because they are rarely -used, some because of technical difficulties. They may be supported by -external modules via CPAN in future, however. +The following encodings are not supported as yet; some because they +are rarely used, some because of technical difficulties. They may +be supported by external modules via CPAN in the future, however. =over 4 =item ISO-2022-JP-2 [RFC1554] Not very popular yet. Needs Unicode Database or equivalent to -implement encode() (Because it includes JIS X 0208/0212, KSC5601, and -GB2312 simultaneously, which code points in Unicode overlap. So you -need to lookup the database to determine what character set a given +implement encode() (because it includes JIS X 0208/0212, KSC5601, and +GB2312 simultaneously, whose code points in Unicode overlap.
So you +need to look up the database to determine to what character set a given Unicode character should belong). =item ISO-2022-CN [RFC1922] -Not very popular. Needs CNS 11643-1 and 2 which are not available in +Not very popular. Needs CNS 11643-1 and -2 which are not available in this module. CNS 11643 is supported (via euc-tw) in Encode::HanExtra. -Autrijus may add support for this encoding in his module in future. +Autrijus Tang may add support for this encoding in his module in future. -=item various UP-UX encodings +=item Various HP-UX encodings The following are unsupported due to the lack of mapping data. @@ -307,7 +308,7 @@ The following are unsupported due to the lack of mapping data. =item Cyrillic encoding ISO-IR-111 -Anton doubts its usefulness. +Anton Tagunov doubts its usefulness. =item ISO-8859-8-1 [Hebrew] @@ -325,9 +326,9 @@ Ditto. =item Vietnamese encodings VPS -Though Jungshik has reported that Mozilla supports this encoding it -was too late before 5.8.0 for us to add one. In future via a separate -module. See +Though Jungshik Shin has reported that Mozilla supports this encoding, +it was too late before 5.8.0 for us to add it. In the future, it +may be available via a separate module. See L and L @@ -343,14 +344,14 @@ The following are unsupported due to the lack of mapping data. MacSinhalese, MacTamil, MacTelugu, MacTibetan MacVietnamese -The rest of which already available are based upon the vendor mappings +The rest which are already available are based upon the vendor mappings at L . =item (Mac) Indic encodings -The maps for the following is available at L -but remains unsupport because those encodings need algorithmical -approach, currently unsupported by F +The maps for the following are available at L +but remain unsupported because those encodings need an algorithmic +approach, currently unsupported by F: MacDevanagari MacGurmukhi @@ -367,12 +368,13 @@ maps that I could find at L . =head1 Encoding vs.
Charset -- terminology -We are used to using the term (character) I and I -interchangeably. But just as using the term byte and character is -dangerous and should be differentiated when needed, we need to -differentiate I and I. +We are used to using the term (character) I and I interchangeably. But just as confusing the terms byte and +character is dangerous and the terms should be differentiated when +needed, we need to differentiate I and I. -To understand that, it's follow how we make computers grok our characters. +To understand that, here is a description of how we make computers +grok our characters. =over 4 @@ -384,34 +386,34 @@ collection of characters I. =item * Then we have to give each character a unique ID so your computer can -tell the difference from 'a' to 'A'. This itemized character +tell the difference between 'a' and 'A'. This itemized character repertoire is now a I. =item * If your computer can grow the character set without further -processing, you can go ahead use it. This is called a I (CCS) or I. ASCII is used this way for most cases. =item * -But in many cases especially multi-byte CJK encodings, you have to +But in many cases, especially multi-byte CJK encodings, you have to tweak a little more. Your network connection may not accept any data -with the Most Significant Bit set, Your computer may not be able to +with the Most Significant Bit set, and your computer may not be able to tell if a given byte is a whole character or just half of it. So you have to I the character set to use it. A I (CES) determines how to encode a given character set, or a set of multiple character sets. 7bit ISO-2022 is -an example of CES. You switch between character sets via I. +an example of a CES. You switch between character sets via I. =back -Technically, or Mathematically speaking, a character set encoded in +Technically, or mathematically, speaking, a character set encoded in such a CES that maps character by character may form a CCS. 
EUC is such -an example. CES of EUC is as follows; +an example. The CES of EUC is as follows: =over 4 @@ -426,22 +428,22 @@ members by adding 0x80 to each byte. =item * -You can also use 0x8e and 0x8f to tell the following sequence of -characters belong to yet another character set. each following byte -is added by 0x80 +You can also use 0x8e and 0x8f to indicate that the following sequence of +characters belongs to yet another character set. To each following byte +is added the value 0x80. =back -By carefully looking at at the encoded byte sequence, you may find the -byte sequence conforms a unique number. In that sense EUC is a CCS +By carefully looking at the encoded byte sequence, you can find that the +byte sequence forms a unique number. In that sense, EUC is a CCS generated by a CES above from up to four CCS (complicated?). UTF-8 -falls into this category. See L to find how +falls into this category. See L to find out how UTF-8 maps Unicode to a byte sequence. -You may also find by now why 7bit ISO-2022 cannot conform a CCS. If -you look at a byte sequence \x21\x21, you can't tell if it is two !'s -or IDEOGRAPHIC SPACE. EUC maps the latter to \xA1\xA1 so you have no -trouble between "!!". and " " +You may also have found out by now why 7bit ISO-2022 cannot comprise +a CCS. If you look at a byte sequence \x21\x21, you can't tell if +it is two !'s or IDEOGRAPHIC SPACE. EUC maps the latter to \xA1\xA1 +so you have no trouble differentiating between "!!" and S<" ">. =head1 Encoding Classification (by Anton Tagunov and Dan Kogai) @@ -450,11 +452,11 @@ applicability for information exchange over the Internet and to choose the most suitable aliases to name them in the context of such communication. -=over 2 +=over 4 =item * -To (en|de) code Encodings marked as C<(**)>, You need +To (en|de)code encodings marked by C<(**)>, you need C, available from CPAN.
=back @@ -465,7 +467,7 @@ Encoding names Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1 EUC-KR Big5 GB2312 -are registered to IANA as preferred MIME names and may probably +are registered with IANA as preferred MIME names and may be used over the Internet. C has been officialized by JIS X 0208:1997. @@ -482,7 +484,7 @@ with Encode. See L for details. have not been registered with IANA (as of March 2002) but seem to be supported by major web browsers. -IANA name for C is C. +The IANA name for C is C. KS_C_5601-1987 @@ -498,7 +500,7 @@ are IANA-registered Cs. See [RFC 2781] for details. Jungshik Shin reports that UTF-16 with a BOM is well accepted by MS IE 5/6 and NS 4/6. Beware however that -=over 2 +=over 4 =item * @@ -510,17 +512,17 @@ then C support C coded data seamlessly passes traditional command piping (C, C, etc.) while C coded -data is likely to cause confusion (with it's zero bytes, +data is likely to cause confusion (with its zero bytes, for example) =item * it is beyond the power of words to describe the way HTML browsers -encode non-C form data. To get a general impression visit +encode non-C form data. To get a general impression, visit L. -While encoding of form data has stabilized for C coded pages -(at least IE 5/6, NS 6, Opera 6 behave consistently), be sure to -expect fun (and cross-browser discrepancies) with C coded +While encoding of form data has stabilized for C encoded pages +(at least IE 5/6, NS 6, and Opera 6 behave consistently), be sure to +expect fun (and cross-browser discrepancies) with C encoded pages! =back @@ -542,13 +544,13 @@ names. BIG5PLUS (**) -is a bit proprietary name. +is a proprietary name. =head2 Microsoft-related naming mess Microsoft products misuse the following names: -=over 2 +=over 4 =item KS_C_5601-1987 @@ -595,12 +597,12 @@ Microsoft's understanding of C. JIS has not endorsed the full Microsoft standard however. 
The official C includes only JIS X 0201 and JIS X 0208 -character sets, while Microsoft has always been meaning C +character sets, while Microsoft has always used C to encode a wider character repertoire. See C registration for C. -As a historical predecessor Microsoft's variant -probably has more rights for the name, albeit it may be objected +As a historical predecessor, Microsoft's variant +probably has more rights for the name, though it may be objected that Microsoft shouldn't have used JIS as part of the name in the first place. @@ -612,17 +614,17 @@ Encode separately supports C and C. =head1 Glossary -=over 2 +=over 4 =item character repertoire -A collection of unique characters. A I set in the most -strict sense. At this stage characters are not numbered. +A collection of unique characters. A I set in the strictest +sense. At this stage, characters are not numbered. =item coded character set (CCS) A character set that is mapped in a way computers can use directly. -Many character encodings including EUC falls in this category. +Many character encodings, including EUC, fall in this category. =item character encoding scheme (CES) @@ -635,10 +637,9 @@ example of being both a CCS and CES. has long been used in the meaning of C, CES. -While C word combination has lost this meaning -in MIME context since [RFC 2130], C abbreviation has -retained it. This is how [RFC 2277], [RFC 2278] bless C: - +While the word combination C has lost this meaning +in MIME context since [RFC 2130], the C abbreviation has +retained it. This is how [RFC 2277] and [RFC 2278] bless C: This document uses the term "charset" to mean a set of rules for mapping from a sequence of octets to a sequence of characters, such @@ -650,25 +651,25 @@ retained it. This is how [RFC 2277], [RFC 2278] bless C: =item EUC -Extended Unix Character. See ISO-2022 +Extended Unix Character. See ISO-2022. =item ISO-2022 -A CES that was carefully designed to coexist with ASCII. 
There are 7 -bit version and 8 bit version. +A CES that was carefully designed to coexist with ASCII. There are a 7 +bit version and an 8 bit version. -7 bit version switches character set via escape sequence so this +The 7 bit version switches character set via escape sequence so it cannot form a CCS. Since this is more difficult to handle in programs -than the 8 bit version, 7 bit version is not very popular except for -iso-2022-jp, the de facto standard CES for e-mails. +than the 8 bit version, the 7 bit version is not very popular except for +iso-2022-jp, the I standard CES for e-mails. -8 bit version can conform a CCS. EUC and ISO-8859 are two examples +The 8 bit version can form a CCS. EUC and ISO-8859 are two examples thereof. Pre-5.6 perl could use them as string literals. =item UCS Short for I. When you say just UCS, it means -I +I. =item UCS-2 @@ -677,20 +678,20 @@ octets. =item Unicode -A Character Set that aims to include all character repertoire of the +A character set that aims to include all character repertoires of the world. Many character sets in various national as well as industrial standards have become, in a way, just subsets of Unicode. =item UTF Short for I. Determines how to map a -Unicode character into byte sequence. +Unicode character into a byte sequence. =item UTF-16 A UTF in 16-bit encoding. Can either be in big endian or little -endian. Big endian version is called UTF-16BE (equals to UCS-2 + -Surrogate Support) and little endian version is UTF-16LE. +endian. The big endian version is called UTF-16BE (equal to UCS-2 + +surrogate support) and the little endian version is called UTF-16LE. =back @@ -703,20 +704,20 @@ L, L =head1 References -=over 2 +=over 4 =item ECMA European Computer Manufacturers Association L -=over 2 +=over 4 -=item EMCA-035 (eq C) +=item ECMA-035 (eq C) L -The very specification of ISO-2022 is available from the link above. +The specification of ISO-2022 is available from the link above. 
=back @@ -725,7 +726,7 @@ The very specification of ISO-2022 is available from the link above. Internet Assigned Numbers Authority L -=over 2 +=over 4 =item Assigned Charset Names by IANA @@ -745,14 +746,15 @@ L =item RFC Request For Comments -- need I say more? -L, L +L, L, +L =item UC Unicode Consortium L -=over 2 +=over 4 =item Unicode Glossary @@ -766,7 +768,7 @@ The glossary of this document is based upon this site. =head2 Other Notable Sites -=over 2 +=over 4 =item czyborra.com @@ -783,44 +785,46 @@ Somewhat obsolete (last update in 1996), but still useful. Also try L -You will find brief info on C, C and mostly on C +You will find brief info on C, C and mostly on C. =item Jungshik Shin's Hangul FAQ L -And especially it's subject 8. +And especially its subject 8. L A comprehensive overview of the Korean (C) standards. +=item debian.org: "Introduction to i18n" + +A brief description for most of the mentioned CJK encodings is +contained in +L + =back =head2 Offline sources -=over 2 +=over 4 =item C by Ken Lunde CJKV Information Processing 1999 O'Reilly & Associates, ISBN : 1-56592-224-7 -The modern successor of the C. +The modern successor of C. -Features a comprehensive coverage on CJKV character sets and +Features a comprehensive coverage of CJKV character sets and encodings along with many other issues faced by anyone trying to better support CJKV languages/scripts in all the areas of information processing. -To purchase this book visit +To purchase this book, visit L +or your favourite bookstore. =back =cut - -I could not find this page because the hostname doesn't resolve! 
- -Brief description for most of the mentioned CJK encodings -L diff --git a/ext/Encode/t/jperl.t b/ext/Encode/t/jperl.t index 83fc12f..dd95324 100644 --- a/ext/Encode/t/jperl.t +++ b/ext/Encode/t/jperl.t @@ -1,5 +1,5 @@ # -# $Id: jperl.t,v 1.21 2002/04/14 22:05:20 dankogai Exp $ +# $Id: jperl.t,v 1.23 2002/04/22 09:48:07 dankogai Exp dankogai $ # # This script is written in euc-jp diff --git a/ext/Encode/t/perlio.t b/ext/Encode/t/perlio.t index 3381a12..3b82e9e 100644 --- a/ext/Encode/t/perlio.t +++ b/ext/Encode/t/perlio.t @@ -12,111 +12,122 @@ BEGIN { print "1..0 # Skip: EBCDIC\n"; exit 0; } - require Encode; - eval { require PerlIO::encoding }; - unless ($INC{"PerlIO/encoding.pm"} - and PerlIO::encoding->VERSION >= 0.02 - ){ - print "1..0 # Skip:: PerlIO::encoding 0.02 or better required\n"; - exit 0; - } - # warn "PerlIO::encoding->VERSION == ", PerlIO::encoding->VERSION, "\n"; $| = 1; } use strict; use File::Basename; use File::Spec; -use File::Compare; +use File::Compare qw(compare_text); use File::Copy; use FileHandle; #use Test::More qw(no_plan); -use Test::More tests => 20; +use Test::More tests => 28; our $DEBUG = 0; +use Encode (":all"); +eval { require PerlIO::encoding }; + { no warnings; @ARGV and $DEBUG = shift; - require Encode::JP::JIS7; - $Encode::JP::JIS7::DEBUG = $DEBUG; + #require Encode::JP::JIS7; + #require Encode::KR::2022_KR; + #$Encode::JP::JIS7::DEBUG = $DEBUG; } -Encode->import(":all"); -my $dir = dirname(__FILE__); -my $ufile = File::Spec->catfile($dir,"jisx0208.ref"); -open my $fh, "<:utf8", $ufile or die "$ufile : $!"; -my @uline = <$fh>; -my $utext = join('' => @uline); -close $fh; + my $seq = 0; +my $dir = dirname(__FILE__); -for my $e (qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/){ - my $sfile = File::Spec->catfile($dir,"$$.sio"); - my $pfile = File::Spec->catfile($dir,"$$.pio"); +my %e = + ( + jisx0208 => [ qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/], + #ksc5601 => [ qw/euc-kr iso-2022-kr/], + ksc5601 => [ 
qw/euc-kr/], + #gb2312 => [ qw/euc-cn hz/], + gb2312 => [ qw/euc-cn/], + ); - # first create a file without perlio - dump2file($sfile, &encode($e, $utext, 0)); - # then create a file via perlio without autoflush - - SKIP:{ - skip "$e: !perlio_ok", 1 unless perlio_ok($e) or $DEBUG; - open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; - binmode $fh; - $fh->autoflush(0); - print $fh $utext; - close $fh; - $seq++; - unless (is(compare($sfile, $pfile), 0 => ">:encoding($e)")){ - copy $sfile, "$sfile.$seq"; - copy $pfile, "$pfile.$seq"; - } - } - - # this time print line by line. - # works even for ISO-2022! - open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; - binmode $fh; - $fh->autoflush(1); - for my $l (@uline) { - print $fh $l; - } +for my $src(sort keys %e) { + my $ufile = File::Spec->catfile($dir,"$src.ref"); + open my $fh, "<:utf8", $ufile or die "$ufile : $!"; + my @uline = <$fh>; + my $utext = join('' => @uline); close $fh; - $seq++; - unless(is(compare($sfile, $pfile), 0 - => ">:encoding($e); by lines")){ - copy $sfile, "$sfile.$seq"; - copy $pfile, "$pfile.$seq"; - } - SKIP:{ - skip "$e: !perlio_ok", 2 unless perlio_ok($e) or $DEBUG; - open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; - $fh->autoflush(0); - my $dtext = join('' => <$fh>); - close $fh; - $seq++; - unless(ok($utext eq $dtext, "<:encoding($e)")){ - dump2file("$sfile.$seq", $utext); - dump2file("$pfile.$seq", $dtext); - } - $dtext = ''; - open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; - while(defined(my $l = <$fh>)) { - $dtext .= $l; - } - close $fh; - $seq++; - unless (ok($utext eq $dtext, "<:encoding($e); by lines")) { - dump2file("$sfile.$seq", $utext); - dump2file("$pfile.$seq", $dtext); + for my $e (@{$e{$src}}){ + my $sfile = File::Spec->catfile($dir,"$$.sio"); + my $pfile = File::Spec->catfile($dir,"$$.pio"); + + # first create a file without perlio + dump2file($sfile, &encode($e, $utext, 0)); + + # then create a file via perlio without autoflush + + TODO:{ + 
#local $TODO = "$e: !perlio_ok" unless (perlio_ok($e) or $DEBUG); + todo_skip "$e: !perlio_ok", 4 unless (perlio_ok($e) or $DEBUG); + no warnings 'uninitialized'; + open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; + $fh->autoflush(0); + print $fh $utext; + close $fh; + $seq++; + is(compare_text($sfile, $pfile), 0 => ">:encoding($e)"); + if ($DEBUG){ + copy $sfile, "$sfile.$seq"; + copy $pfile, "$pfile.$seq"; + } + + # this time print line by line. + # works even for ISO-2022 but not ISO-2022-KR + open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; + $fh->autoflush(1); + for my $l (@uline) { + print $fh $l; + } + close $fh; + $seq++; + is(compare_text($sfile, $pfile), 0 => ">:encoding($e) by lines"); + if ($DEBUG){ + copy $sfile, "$sfile.$seq"; + copy $pfile, "$pfile.$seq"; + } + my $dtext; + open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; + $fh->autoflush(0); + $dtext = join('' => <$fh>); + close $fh; + $seq++; + ok($utext eq $dtext, "<:encoding($e)"); + if ($DEBUG){ + dump2file("$sfile.$seq", $utext); + dump2file("$pfile.$seq", $dtext); + } + if (perlio_ok($e) or $DEBUG){ + $dtext = ''; + open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; + while(defined(my $l = <$fh>)) { + $dtext .= $l; + } + close $fh; + } + $seq++; + ok($utext eq $dtext, "<:encoding($e) by lines"); + if ($DEBUG){ + dump2file("$sfile.$seq", $utext); + dump2file("$pfile.$seq", $dtext); + } } + $DEBUG or unlink ($sfile, $pfile); } - $DEBUG or unlink ($sfile, $pfile); } + sub dump2file{ no warnings; diff --git a/ext/Encode/ucm/big5-eten.ucm b/ext/Encode/ucm/big5-eten.ucm index 456b8be..452b85b 100644 --- a/ext/Encode/ucm/big5-eten.ucm +++ b/ext/Encode/ucm/big5-eten.ucm @@ -1,5 +1,5 @@ # -# $Id: big5.ucm,v 1.0 2002/03/28 23:26:25 dankogai Exp dankogai $ +# $Id: big5-eten.ucm,v 1.2 2002/04/22 03:41:13 dankogai Exp $ # # ./compile -n big5-eten -o Encode/big5-eten.ucm Encode/big5-eten.enc "big5-eten" diff --git a/ext/Encode/ucm/big5-hkscs.ucm b/ext/Encode/ucm/big5-hkscs.ucm 
index 8c1aac1..cb0b850 100644 --- a/ext/Encode/ucm/big5-hkscs.ucm +++ b/ext/Encode/ucm/big5-hkscs.ucm @@ -1,5 +1,5 @@ # -# $Id: big5-hkscs.ucm,v 1.0 2002/03/28 23:26:25 dankogai Exp $ +# $Id: big5-hkscs.ucm,v 1.2 2002/04/22 03:41:13 dankogai Exp $ # "big5-hkscs" 1