From: Jarkko Hietaniemi Date: Fri, 19 Apr 2002 12:58:00 +0000 (+0000) Subject: Upgrade to Encode 1.50, from Dan Kogai. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=85982a32ef23cb53c2fae6d3861dd7dc62e3ab17;p=p5sagit%2Fp5-mst-13.2.git Upgrade to Encode 1.50, from Dan Kogai. p4raw-id: //depot/perl@16001 --- diff --git a/MANIFEST b/MANIFEST index 7c58789..0aa28b1 100644 --- a/MANIFEST +++ b/MANIFEST @@ -224,6 +224,9 @@ ext/Encode/Symbol/Makefile.PL Encode extension ext/Encode/Symbol/Symbol.pm Encode extension ext/Encode/TW/Makefile.PL Encode extension ext/Encode/TW/TW.pm Encode extension +ext/Encode/Unicode/Makefile.PL Encode extension +ext/Encode/Unicode/Unicode.pm Encode extension +ext/Encode/Unicode/Unicode.xs Encode extension ext/Encode/bin/enc2xs Encode module generator ext/Encode/bin/piconv iconv by perl ext/Encode/bin/ucm2table Table Generator for testing @@ -240,8 +243,8 @@ ext/Encode/lib/Encode/Encoding.pm Encode extension ext/Encode/lib/Encode/JP/H2Z.pm Encode extension ext/Encode/lib/Encode/JP/JIS7.pm Encode extension ext/Encode/lib/Encode/KR/2022_KR.pm Encode extension -ext/Encode/lib/Encode/Supported.pod Documents supported encodings -ext/Encode/lib/Encode/Unicode.pm Encode extension +ext/Encode/lib/Encode/PerlIO.pod Documents for Encode & PerlIO +ext/Encode/lib/Encode/Supported.pod Documents for supported encodings ext/Encode/t/Aliases.t Encode extension test ext/Encode/t/CN.t Encode extension test ext/Encode/t/Encode.t Encode extension test @@ -252,9 +255,12 @@ ext/Encode/t/TW.t Encode extension test ext/Encode/t/Unicode.t Encode extension test ext/Encode/t/bogus.ucm Sample data for ucmlint ext/Encode/t/encoding.t encoding extension test +ext/Encode/t/fallback.t Encode extension test ext/Encode/t/gb2312.euc test data ext/Encode/t/gb2312.ref test data ext/Encode/t/grow.t Encode extension test +ext/Encode/t/jisx0201.euc test data +ext/Encode/t/jisx0201.ref test data ext/Encode/t/jisx0208.euc test data ext/Encode/t/jisx0208.ref test data ext/Encode/t/jisx0212.euc test data @@ -262,6 +268,7 @@ ext/Encode/t/jisx0212.ref test data ext/Encode/t/jperl.t encoding extension test ext/Encode/t/ksc5601.euc test data ext/Encode/t/ksc5601.ref test data +ext/Encode/t/perlio.t ext/Encode/t/unibench.pl Unicode benchmark ext/Encode/ucm/8859-1.ucm Unicode Character Map ext/Encode/ucm/8859-10.ucm Unicode Character Map diff --git a/ext/Encode/CN/CN.pm b/ext/Encode/CN/CN.pm index 5952cab..c031f5c 100644 --- a/ext/Encode/CN/CN.pm +++ b/ext/Encode/CN/CN.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::CN not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; @@ -59,8 +59,8 @@ also contains extra Taiwan-based encodings. =head1 BUGS When you see C on mails and web pages, they really -mean "euc-cn" encodings. To fix that, gb2312 is aliased to euc-cn. Use -gb2312-raw when you really mean it. +mean C encodings. To fix that, C is aliased to C. +Use C when you really mean it. ASCII part (0x00-0x7f) is preserved for all encodings, even though it conflicts with mappings by the Unicode Consortium. See @@ -71,6 +71,6 @@ to find why it is implemented that way. =head1 SEE ALSO -L,L +L =cut diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 06cc9b6..aba9ce6 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,48 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.42 2002/04/17 03:01:20 dankogai Exp dankogai $ +# $Id: Changes,v 1.50 2002/04/19 06:13:02 dankogai Exp $ # +1.50 $Date: 2002/04/19 06:13:02 $ +! ! Encode.pm Encode.xs Encode/encoding.h ++ t/fallback.pm + New Fallback API imlemented and documented. See "perldoc Encode" + for details +! lib/Encode/JP/JIS7.pm Encode.pm ++ lib/Encode/PerlIO.pod t/perlio.t + API compliance met. However, it still does not work unless perlio + implements line buffer. See BUGS section in perldoc Encode::PerlIO + As a sensible workaround, perlio_ok() added to Encode. +! encoding.pm +! lib/Encode/Supported.pod + Doc fixes from jhi + Message-Id: <20020418174647.J8466@alpha.hut.fi> +! CN/CN.pm + Doc fixes from Autrijus + Message-Id: <20020418144131.GA10987@not.autrijus.org> +! Encode.pm + perlqq mode documented +! t/JP.t ++ t/jisx0201.euc t/jisx0201.ref +! t/jisx0208.euc t/jisx0208.ref + t/JP.t tests more rigorously and with other encodings + t/jisx0201.* added to test JIS7 encodings. jisx0208 is now PURELY + in jis0208 (used to contain jisx0201 part). +! Encode/Makefile_PL.e2x + The resulting Makefile.PL that "enc2xs -M" creates now auto-discovers + enc2xs and encode.h rather than hard-coded. This allows the resulting + module fully CPANizable. +! encoding.pm t/JP.t t/KR.t + PerlIO detection simplified (checks %INC instead of eval{}) +! Encode.xs Encode/encode.h ++ Unicode/Makefile.PL Unicode/Unicode.pm Unicode/Unicode.xs +- lib/Encode/Unicode.pm + (en|de)code_xs relocated to where it belongs. Source reindented + to my taste +! bin/enc2xs + Additional (U8 *) cast added as suggested by jhi + Message-Id: <20020417165916.A28599@alpha.hut.fi> -1.42 $Date: 2002/04/17 03:01:20 $ +1.42 Date: 2002/04/17 - lib/Encode/XS.pm no-op module; Thought of adding a pod there but enc2xs has one so gone. @@ -335,7 +374,7 @@ Typo fixes and improvements by jhi Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al. -1.11 $Date: 2002/04/17 03:01:20 $ +1.11 $Date: 2002/04/19 06:13:02 $ + t/encoding.t + t/jperl.t ! MANIFEST diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 3dd63a8..d1c5494 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,6 +1,6 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.42 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.50 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require DynaLoader; @@ -9,28 +9,31 @@ require Exporter; our @ISA = qw(Exporter DynaLoader); # Public, encouraged API is exported by default -our @EXPORT = qw ( - decode - decode_utf8 - encode - encode_utf8 - encodings - find_encoding + +our @EXPORT = qw( + decode decode_utf8 encode encode_utf8 + encodings find_encoding ); +our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ); +our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK); + our @EXPORT_OK = - qw( - _utf8_off - _utf8_on - define_encoding - from_to - is_16bit - is_8bit - is_utf8 - resolve_alias - utf8_downgrade - utf8_upgrade - ); + ( + qw( + _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit + is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade + ), + @FB_FLAGS, @FB_CONSTS, + ); + +our %EXPORT_TAGS = + ( + all => [ @EXPORT, @EXPORT_OK ], + fallbacks => [ @FB_CONSTS ], + fallback_all => [ @FB_CONSTS, @FB_FLAGS ], + ); + bootstrap Encode (); @@ -64,6 +67,13 @@ sub encodings grep {!/^(?:Internal|Unicode)$/o} keys %Encoding; } +sub perlio_ok{ + exists $INC{"PerlIO/encoding.pm"} or return 0; + my $stash = ref($_[0]); + $stash ||= ref(find_encoding($_[0])); + return ($stash eq "Encode::XS" || $stash eq "Encode::Unicode"); +} + sub define_encoding { my $obj = shift; @@ -253,7 +263,8 @@ sub predefine_encodings{ require Encode::Encoding; -eval { require PerlIO::encoding }; +eval qq{ use PerlIO::encoding 0.02 }; +# warn $@ if $@; 1; @@ -366,12 +377,7 @@ For example to convert ISO-8859-1 data to UTF-8: =item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK]) -Convert B the data between two encodings. How did the data -in $string originally get to be in FROM_ENCODING? Either using -encode() or through PerlIO: See L. -For encoding names and aliases, see L. -For CHECK see L. - +Convert B the data between two encodings. For example to convert ISO-8859-1 data to UTF-8: from_to($data, "iso-8859-1", "utf-8"); @@ -461,81 +467,103 @@ exported via C. See L on details. -=head1 Encoding and IO +=head1 Encoding via PerlIO -It is very common to want to do encoding transformations when -reading or writing files, network connections, pipes etc. -If Perl is configured to use the new 'perlio' IO system then -C provides a "layer" (See L) which can transform -data as it is read or written. +If your perl supports I, you can use PerlIO layer to directly +decode and encode via filehandle. The following two examples are +totally identical by functionality. -Here is how the blind poet would modernise the encoding: + # via PerlIO + open my $in, "<:encoding(shiftjis)", $infile or die; + open my $out, ">:encoding(euc-jp)", $outfile or die; + while(<>){ print; } - use Encode; - open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek'); - open(my $utf8,'>:utf8','iliad.utf8'); - my @epic = <$iliad>; - print $utf8 @epic; - close($utf8); - close($illiad); + # via from_to + open my $in, $infile or die; + open my $out, $outfile or die; + while(<>){ + from_to($_, "shiftjis", "euc", 1); + } -In addition the new IO system can also be configured to read/write -UTF-8 encoded characters (as noted above this is efficient): +Unfortunately, not all encodings are PerlIO-savvy. You can check if +your encoding is supported by PerlIO by C method. - open(my $fh,'>:utf8','anything'); - print $fh "Any \x{0021} string \N{SMILEY FACE}\n"; + Encode::perlio_ok("iso-20220jp"); # false + find_encoding("iso-2022-jp")->perlio_ok; # false + use Encode qw(perlio_ok); # exported upon request + perlio_ok("euc-jp") # true if PerlIO is enabled -Either of the above forms of "layer" specifications can be made the default -for a lexical scope with the C pragma. See L. +For gory details, see L; -Once a handle is open is layers can be altered using C. +=head1 Handling Malformed Data -Without any such configuration, or if Perl itself is built using -system's own IO, then write operations assume that file handle accepts -only I and will C if a character larger than 255 is -written to the handle. When reading, each octet from the handle -becomes a byte-in-a-character. Note that this default is the same -behaviour as bytes-only languages (including Perl before v5.6) would -have, and is sufficient to handle native 8-bit encodings -e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling -other encodings and binary data. +=over 4 -In other cases it is the programs responsibility to transform -characters into bytes using the API above before doing writes, and to -transform the bytes read from a handle into characters before doing -"character operations" (e.g. C, C, ...). +THE I argument is used as follows. When you omit it, it is +identical to I = 0. -You can also use PerlIO to convert larger amounts of data you don't -want to bring into memory. For example to convert between ISO-8859-1 -(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines): +=item I = Encode::FB_DEFAULT ( == 0) - open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!; - open(G, ">:utf8", "data.utf") or die $!; - while () { print G } +If I is 0, (en|de)code will put I in +place of the malformed character. for UCM-based encodings, +EsubcharE will be used. For Unicode, \xFFFD is used. If the +data is supposed to be UTF-8, an optional lexical warning (category +utf8) is given. - # Could also do "print G " but that would pull - # the whole file into memory just to write it out again. +=item I = Encode::DIE_ON_ERROR (== 1) -More examples: +If I is 1, methods will die immediately with an error +message. so when I is set, you should trap the fatal error +with eval{} unless you really want to let it die on error. - open(my $f, "<:encoding(cp1252)") - open(my $g, ">:encoding(iso-8859-2)") - open(my $h, ">:encoding(latin9)") # iso-8859-15 +=item I = Encode::FB_QUIET -See L for more information. +If I is set to Encode::FB_QUIET, (en|de)code will immediately +return proccessed part on error, with data passed via argument +overwritten with unproccessed part. This is handy when have to +repeatedly call because the source data is chopped in the middle for +some reasons, such as fixed-width buffer. Here is a sample code that +just does this. -See also L for how to change the default encoding of the -data in your script. + my $data = ''; + while(defined(read $fh, $buffer, 256)){ + # buffer may end in partial character so we append + $data .= $buffer; + $utf8 .= decode($encoding, $data, ENCODE::FB_QUIET); + # $data now contains unprocessed partial character + } -=head1 Handling Malformed Data +=item I = Encode::FB_WARN -If I is not set, (en|de)code will put I in -place of the malformed character. for UCM-based encodings, -EsubcharE will be used. For Unicode, \xFFFD is used. If the -data is supposed to be UTF-8, an optional lexical warning (category -utf8) is given. +This is the same as above, except it warns on error. Handy when you +are debugging the mode above. + +=item perlqq mode (I = Encode::FB_PERLQQ) + +For encodings that are implemented by Encode::XS, CHECK == +Encode::FB_PERLQQ turns (en|de)code into C fallback mode. + +When you decode, '\xI' will be placed where I is the hex +representation of the octet that could not be decoded to utf8. And +when you encode, '\x{I}' will be placed where I is the +Unicode ID of the character that cannot be found in the character +repartoire of the encoding. + +=item The bitmask + +These modes are actually set via bitmask. here is how FB_XX are laid +out. for FB_XX you can import via C for +generic bitmask constants, you can import via + C. + + FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ + DIE_ON_ERR 0x0001 X + WARN_ON_ERR 0x0002 X + RETURN_ON_ERR 0x0004 X X + LEAVE_SRC 0x0008 + PERLQQ 0x0100 X -If I is true but not a code reference, dies with an error message. +=head2 Unemplemented fallback schemes In future you will be able to use a code reference to a callback function for the value of I but its API is still undecided. @@ -588,7 +616,7 @@ not a string. L, L, -L, +L, L, L, L, @@ -596,7 +624,7 @@ L, L, the Perl Unicode Mailing List Eperl-unicode@perl.orgE -head2 MAINTAINER +=head1 MAINTAINER This project was originated by Nick Ing-Simmons and later maintained by Dan Kogai Edankogai@dan.co.jpE. See AUTHORS for full list diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index a7a6eba..9806d59 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,99 +1,31 @@ +/* + $Id: Encode.xs,v 1.29 2002/04/19 05:36:43 dankogai Exp $ + */ + #define PERL_NO_GET_CONTEXT #include "EXTERN.h" #include "perl.h" #include "XSUB.h" #define U8 U8 #include "encode.h" -#include "def_t.h" - -#define FBCHAR 0xFFFd -#define FBCHAR_UTF8 "\xEF\xBF\xBD" -#define BOM_BE 0xFeFF -#define BOM16LE 0xFFFe -#define BOM32LE 0xFFFe0000 -#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) -#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) -#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) -#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) - -static UV -enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) -{ - U8 *s = *sp; - UV v = 0; - if (s+size > e) { - croak("Partial character %c",(char) endian); - } - switch(endian) { - case 'N': - v = *s++; - v = (v << 8) | *s++; - case 'n': - v = (v << 8) | *s++; - v = (v << 8) | *s++; - break; - case 'V': - case 'v': - v |= *s++; - v |= (*s++ << 8); - if (endian == 'v') - break; - v |= (*s++ << 16); - v |= (*s++ << 24); - break; - default: - croak("Unknown endian %c",(char) endian); - break; - } - *sp = s; - return v; -} - -void -enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) -{ - U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size); - switch(endian) { - case 'v': - case 'V': - d += SvCUR(result); - SvCUR_set(result,SvCUR(result)+size); - while (size--) { - *d++ = value & 0xFF; - value >>= 8; - } - break; - case 'n': - case 'N': - SvCUR_set(result,SvCUR(result)+size); - d += SvCUR(result); - while (size--) { - *--d = value & 0xFF; - value >>= 8; - } - break; - default: - croak("Unknown endian %c",(char) endian); - break; - } -} -#define ENCODE_XS_PROFILE 0 /* set 1 or more to profile. - t/encoding.t dumps core because of - Perl_warner and PerlIO don't work well */ +/* set 1 or more to profile. t/encoding.t dumps core because of + Perl_warner and PerlIO don't work well */ +#define ENCODE_XS_PROFILE 0 -#define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate - buffer size for encode_method(). - 1 is recommended. 2 restores NI-S original */ +/* set 0 to disable floating point to calculate buffer size for + encode_method(). 1 is recommended. 2 restores NI-S original */ +#define ENCODE_XS_USEFP 1 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ Perl_croak(aTHX_ "panic_unimplemented"); \ return (y)0; /* fool picky compilers */ \ } +/**/ UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) - UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) +UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) -void + void Encode_XSEncoding(pTHX_ encode_t * enc) { dSP; @@ -114,12 +46,13 @@ Encode_XSEncoding(pTHX_ encode_t * enc) void call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) { - /* Exists for breakpointing */ + /* Exists for breakpointing */ } + static SV * encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, - int check) + int check) { STRLEN slen; U8 *s = (U8 *) SvPV(src, slen); @@ -128,157 +61,163 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, STRLEN sdone = 0; /* We allocate slen+1. - PerlIO dumps core if this value is smaller than this. */ + PerlIO dumps core if this value is smaller than this. */ SV *dst = sv_2mortal(newSV(slen+1)); - if (slen) { - U8 *d = (U8 *) SvPVX(dst); - STRLEN dlen = SvLEN(dst)-1; - int code; - while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) { - SvCUR_set(dst, dlen+ddone); - SvPOK_only(dst); - -#if ENCODE_XS_PROFILE >= 3 - Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1); -#endif + U8 *d = (U8 *)SvPVX(dst); + STRLEN dlen = SvLEN(dst)-1; + int code; + + if (!slen){ + SvCUR_set(dst, 0); + SvPOK_only(dst); + goto ENCODE_END; + } + + while (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check)) + { + SvCUR_set(dst, dlen+ddone); + SvPOK_only(dst); - if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL) - break; - - switch (code) { - case ENCODE_NOSPACE: - { - STRLEN more = 0; /* make sure you initialize! */ - STRLEN sleft; - sdone += slen; - ddone += dlen; - sleft = tlen - sdone; + if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL){ + break; + } + switch (code) { + case ENCODE_NOSPACE: + { + STRLEN more = 0; /* make sure you initialize! */ + STRLEN sleft; + sdone += slen; + ddone += dlen; + sleft = tlen - sdone; #if ENCODE_XS_PROFILE >= 2 - Perl_warn(aTHX_ - "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", - more, sdone, sleft, SvLEN(dst)); + Perl_warn(aTHX_ + "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", + more, sdone, sleft, SvLEN(dst)); #endif - if (sdone != 0) { /* has src ever been processed ? */ + if (sdone != 0) { /* has src ever been processed ? */ #if ENCODE_XS_USEFP == 2 - more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone - - SvLEN(dst); + more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone + - SvLEN(dst); #elif ENCODE_XS_USEFP - more = (1.0*SvLEN(dst)+1)/sdone * sleft; + more = (1.0*SvLEN(dst)+1)/sdone * sleft; #else - /* safe until SvLEN(dst) == MAX_INT/16 */ - more = (16*SvLEN(dst)+1)/sdone/16 * sleft; -#endif - } - more += UTF8_MAXLEN; /* insurance policy */ -#if ENCODE_XS_PROFILE >= 2 - Perl_warn(aTHX_ - "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", - more, sdone, sleft, SvLEN(dst)); + /* safe until SvLEN(dst) == MAX_INT/16 */ + more = (16*SvLEN(dst)+1)/sdone/16 * sleft; #endif - d = (U8 *) SvGROW(dst, SvLEN(dst) + more); - /* dst need to grow need MORE bytes! */ - if (ddone >= SvLEN(dst)) { - Perl_croak(aTHX_ "Destination couldn't be grown."); - } - dlen = SvLEN(dst)-ddone-1; - d += ddone; - s += slen; - slen = tlen-sdone; - continue; } - - case ENCODE_NOREP: - if (dir == enc->f_utf8) { - STRLEN clen; - UV ch = - utf8n_to_uvuni(s + slen, (SvCUR(src) - slen), - &clen, 0); - if (!check) { /* fallback char */ + more += UTF8_MAXLEN; /* insurance policy */ + d = (U8 *) SvGROW(dst, SvLEN(dst) + more); + /* dst need to grow need MORE bytes! */ + if (ddone >= SvLEN(dst)) { + Perl_croak(aTHX_ "Destination couldn't be grown."); + } + dlen = SvLEN(dst)-ddone-1; + d += ddone; + s += slen; + slen = tlen-sdone; + continue; + } + case ENCODE_NOREP: + /* encoding */ + if (dir == enc->f_utf8) { + STRLEN clen; + UV ch = + utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), &clen, 0); + if (check & ENCODE_DIE_ON_ERR) { + Perl_croak( + aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d", + ch, enc->name[0], __LINE__); + }else{ + if (check & ENCODE_RETURN_ON_ERR){ + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner( + aTHX_ packWARN(WARN_UTF8), + "\"\\N{U+%" UVxf "}\" does not map to %s", + ch,enc->name[0]); + } + goto ENCODE_SET_SRC; + }else if (check & ENCODE_PERLQQ){ + SV* perlqq = + sv_2mortal(newSVpvf("\\x{%04x}", ch)); sdone += slen + clen; - ddone += dlen + enc->replen; - sv_catpvn(dst, (char*)enc->rep, enc->replen); - } - else if (check == -1){ /* perlqq */ - SV* perlqq = - sv_2mortal(newSVpvf("\\x{%x}", ch)); - sdone += slen + clen; - ddone += dlen + SvLEN(perlqq); - sv_catsv(dst, perlqq); + ddone += dlen + SvCUR(perlqq); + sv_catsv(dst, perlqq); + } else { + /* fallback char */ + sdone += slen + clen; + ddone += dlen + enc->replen; + sv_catpvn(dst, (char*)enc->rep, enc->replen); } - else { - Perl_croak(aTHX_ - "\"\\N{U+%" UVxf - "}\" does not map to %s", ch, - enc->name[0]); - } + } } - else { - if (!check){ /* fallback char */ - sdone += slen + 1; - ddone += dlen + strlen(FBCHAR_UTF8); - sv_catpv(dst, FBCHAR_UTF8); - } - else if (check == -1){ /* perlqq */ - SV* perlqq = + /* decoding */ + else { + if (check & ENCODE_DIE_ON_ERR){ + Perl_croak( + aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)", + enc->name[0], (U8) s[slen], code); + }else{ + if (check & ENCODE_RETURN_ON_ERR){ + if (check & ENCODE_WARN_ON_ERR){ + Perl_warner( + aTHX_ packWARN(WARN_UTF8), + "%s \"\\x%02X\" does not map to Unicode (%d)", + enc->name[0], (U8) s[slen], code); + } + goto ENCODE_SET_SRC; + }else if (check & ENCODE_PERLQQ){ + SV* perlqq = sv_2mortal(newSVpvf("\\x%02X", s[slen])); - sdone += slen + 1; - ddone += dlen + SvLEN(perlqq); - sv_catsv(dst, perlqq); - } - else { - /* UTF-8 is supposed to be "Universal" so should not - happen for real characters, but some encodings - have non-assigned codes which may occur. */ - Perl_croak(aTHX_ "%s \"\\x%02X\" " - "does not map to Unicode (%d)", - enc->name[0], (U8) s[slen], code); + sdone += slen + 1; + ddone += dlen + SvCUR(perlqq); + sv_catsv(dst, perlqq); + } else { + sdone += slen + 1; + ddone += dlen + strlen(FBCHAR_UTF8); + sv_catpv(dst, FBCHAR_UTF8); + } } } + /* settle variables when fallback */ dlen = SvCUR(dst); d = (U8*)SvPVX(dst) + dlen; s = (U8*)SvPVX(src) + sdone; slen = tlen - sdone; break; - default: - Perl_croak(aTHX_ "Unexpected code %d converting %s %s", - code, (dir == enc->f_utf8) ? "to" : "from", - enc->name[0]); - return &PL_sv_undef; - } + default: + Perl_croak(aTHX_ "Unexpected code %d converting %s %s", + code, (dir == enc->f_utf8) ? "to" : "from", + enc->name[0]); + return &PL_sv_undef; } - SvCUR_set(dst, dlen+ddone); - SvPOK_only(dst); - if (check) { - sdone = SvCUR(src) - (slen+sdone); - if (sdone) { -#if 1 - /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly - SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0 - type SVs and sv_clear() calls it ... - */ - sv_setpvn(src, (char*)s+slen, sdone); -#else - Move(s + slen, SvPVX(src), sdone , U8); -#endif - } - SvCUR_set(src, sdone); + } + ENCODE_SET_SRC: + if (check & ~ENCODE_LEAVE_SRC){ + sdone = SvCUR(src) - (slen+sdone); + if (sdone) { + sv_setpvn(src, (char*)s+slen, sdone); } + SvCUR_set(src, sdone); } - else { - SvCUR_set(dst, 0); - SvPOK_only(dst); + /* warn("check = 0x%X, code = 0x%d\n", check, code); */ + if (code && !(check & ENCODE_RETURN_ON_ERR)) { + return &PL_sv_undef; } + + SvCUR_set(dst, dlen+ddone); + SvPOK_only(dst); + #if ENCODE_XS_PROFILE if (SvCUR(dst) > SvCUR(src)){ - Perl_warn(aTHX_ - "SvLEN(dst)=%d, SvCUR(dst)=%d. " - "%d bytes unused(%f %%)\n", - SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), - (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); - + Perl_warn(aTHX_ + "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", + SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), + (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); } #endif + + ENCODE_END: *SvEND(dst) = '\0'; return dst; } @@ -291,11 +230,11 @@ void Method_name(obj) SV * obj CODE: - { - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); - XSRETURN(1); - } +{ + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); + XSRETURN(1); +} void Method_decode(obj,src,check = 0) @@ -303,181 +242,23 @@ SV * obj SV * src int check CODE: - { - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); - SvUTF8_on(ST(0)); - XSRETURN(1); - } - -void -Method_encode(obj,src,check = 0) -SV * obj -SV * src -int check -CODE: - { - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - sv_utf8_upgrade(src); - ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); - XSRETURN(1); - } - -MODULE = Encode PACKAGE = Encode::Unicode - -void -decode_xs(obj, str, chk = &PL_sv_undef) -SV * obj -SV * str -SV * chk -CODE: { - int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); - U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); - int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); - SV *result = newSVpvn("",0); - STRLEN ulen; - U8 *s = (U8 *)SvPVbyte(str,ulen); - U8 *e = (U8 *)SvEND(str); - ST(0) = sv_2mortal(result); - SvUTF8_on(result); - - if (!endian && s+size <= e) { - UV bom; - endian = (size == 4) ? 'N' : 'n'; - bom = enc_unpack(aTHX_ &s,e,size,endian); - if (bom != BOM_BE) { - if (bom == BOM16LE) { - endian = 'v'; - } - else if (bom == BOM32LE) { - endian = 'V'; - } - else { - croak("%s:Unregognised BOM %"UVxf, - SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom); - } - } -#if 0 - /* Update endian for this sequence */ - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); -#endif - } - while (s < e && s+size <= e) { - UV ord = enc_unpack(aTHX_ &s,e,size,endian); - U8 *d; - if (size != 4 && invalid_ucs2(ord)) { - if (ucs2) { - if (SvTRUE(chk)) { - croak("%s:no surrogates allowed %"UVxf, - SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); - } - if (s+size <= e) { - enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */ - } - ord = FBCHAR; - } - else { - UV lo; - if (!isHiSurrogate(ord)) { - croak("%s:Malformed HI surrogate %"UVxf, - SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); - } - if (s+size > e) { - /* Partial character */ - s -= size; /* back up to 1st half */ - break; /* And exit loop */ - } - lo = enc_unpack(aTHX_ &s,e,size,endian); - if (!isLoSurrogate(lo)){ - croak("%s:Malformed LO surrogate %"UVxf, - SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); - } - ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); - } - } - d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); - d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); - SvCUR_set(result,d - (U8 *)SvPVX(result)); - } - if (SvTRUE(chk)) { - if (s < e) { - Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", - SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); - Move(s,SvPVX(str),e-s,U8); - SvCUR_set(str,(e-s)); - } - else { - SvCUR_set(str,0); - } - *SvEND(str) = '\0'; - } + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); + SvUTF8_on(ST(0)); XSRETURN(1); } void -encode_xs(obj, utf8, chk = &PL_sv_undef) +Method_encode(obj,src,check = 0) SV * obj -SV * utf8 -SV * chk +SV * src +int check CODE: { - int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); - U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); - int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); - SV *result = newSVpvn("",0); - STRLEN ulen; - U8 *s = (U8 *)SvPVutf8(utf8,ulen); - U8 *e = (U8 *)SvEND(utf8); - ST(0) = sv_2mortal(result); - if (!endian) { - endian = (size == 4) ? 'N' : 'n'; - enc_pack(aTHX_ result,size,endian,BOM_BE); -#if 0 - /* Update endian for this sequence */ - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); -#endif - } - while (s < e && s+UTF8SKIP(s) <= e) { - STRLEN len; - UV ord = utf8n_to_uvuni(s, e-s, &len, 0); - s += len; - if (size != 4 && invalid_ucs2(ord)) { - if (!issurrogate(ord)){ - if (ucs2) { - if (SvTRUE(chk)) { - croak("%s:code point \"\\x{"UVxf"}\" too high", - SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord); - } - enc_pack(aTHX_ result,size,endian,FBCHAR); - }else{ - UV hi = ((ord - 0x10000) >> 10) + 0xD800; - UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; - enc_pack(aTHX_ result,size,endian,hi); - enc_pack(aTHX_ result,size,endian,lo); - } - } - else { - /* not supposed to happen */ - enc_pack(aTHX_ result,size,endian,FBCHAR); - } - } - else { - enc_pack(aTHX_ result,size,endian,ord); - } - } - if (SvTRUE(chk)) { - if (s < e) { - Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", - SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); - Move(s,SvPVX(utf8),e-s,U8); - SvCUR_set(utf8,(e-s)); - } - else { - SvCUR_set(utf8,0); - } - *SvEND(utf8) = '\0'; - } + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + sv_utf8_upgrade(src); + ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); XSRETURN(1); } @@ -487,152 +268,224 @@ PROTOTYPES: ENABLE I32 _bytes_to_utf8(sv, ...) - SV * sv - CODE: - { - SV * encoding = items == 2 ? ST(1) : Nullsv; - - if (encoding) - RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); - else { - STRLEN len; - U8* s = (U8*)SvPV(sv, len); - U8* converted; - - converted = bytes_to_utf8(s, &len); /* This allocs */ - sv_setpvn(sv, (char *)converted, len); - SvUTF8_on(sv); /* XXX Should we? */ - Safefree(converted); /* ... so free it */ - RETVAL = len; - } - } - OUTPUT: - RETVAL +SV * sv +CODE: +{ + SV * encoding = items == 2 ? ST(1) : Nullsv; + + if (encoding) + RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); + else { + STRLEN len; + U8* s = (U8*)SvPV(sv, len); + U8* converted; + + converted = bytes_to_utf8(s, &len); /* This allocs */ + sv_setpvn(sv, (char *)converted, len); + SvUTF8_on(sv); /* XXX Should we? */ + Safefree(converted); /* ... so free it */ + RETVAL = len; + } +} +OUTPUT: + RETVAL I32 _utf8_to_bytes(sv, ...) - SV * sv - CODE: - { - SV * to = items > 1 ? ST(1) : Nullsv; - SV * check = items > 2 ? ST(2) : Nullsv; - - if (to) - RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); - else { - STRLEN len; - U8 *s = (U8*)SvPV(sv, len); - - RETVAL = 0; - if (SvTRUE(check)) { - /* Must do things the slow way */ - U8 *dest; - U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */ - U8 *send = s + len; - - New(83, dest, len, U8); /* I think */ - - while (s < send) { - if (*s < 0x80) - *dest++ = *s++; - else { - STRLEN ulen; - UV uv = *s++; - - /* Have to do it all ourselves because of error routine, - aargh. */ - if (!(uv & 0x40)) - goto failure; - if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } - else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } - else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } - else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } - else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } - else if (!(uv & 0x01)) { ulen = 7; uv = 0; } - else { ulen = 13; uv = 0; } - - /* Note change to utf8.c variable naming, for variety */ - while (ulen--) { - if ((*s & 0xc0) != 0x80) - goto failure; +SV * sv +CODE: +{ + SV * to = items > 1 ? ST(1) : Nullsv; + SV * check = items > 2 ? ST(2) : Nullsv; + + if (to) { + RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); + } else { + STRLEN len; + U8 *s = (U8*)SvPV(sv, len); + + RETVAL = 0; + if (SvTRUE(check)) { + /* Must do things the slow way */ + U8 *dest; + /* We need a copy to pass to check() */ + U8 *src = (U8*)savepv((char *)s); + U8 *send = s + len; + + New(83, dest, len, U8); /* I think */ + + while (s < send) { + if (*s < 0x80){ + *dest++ = *s++; + } else { + STRLEN ulen; + UV uv = *s++; + + /* Have to do it all ourselves because of error routine, + aargh. */ + if (!(uv & 0x40)){ goto failure; } + if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } + else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } + else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } + else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } + else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } + else if (!(uv & 0x01)) { ulen = 7; uv = 0; } + else { ulen = 13; uv = 0; } - else - uv = (uv << 6) | (*s++ & 0x3f); + /* Note change to utf8.c variable naming, for variety */ + while (ulen--) { + if ((*s & 0xc0) != 0x80){ + goto failure; + } else { + uv = (uv << 6) | (*s++ & 0x3f); + } } if (uv > 256) { failure: - call_failure(check, s, dest, src); - /* Now what happens? */ + call_failure(check, s, dest, src); + /* Now what happens? */ } *dest++ = (U8)uv; - } - } - } else - RETVAL = (utf8_to_bytes(s, &len) ? len : 0); - } + } + } + } else { + RETVAL = (utf8_to_bytes(s, &len) ? len : 0); } - OUTPUT: - RETVAL + } +} +OUTPUT: + RETVAL bool is_utf8(sv, check = 0) SV * sv int check - CODE: - { - if (SvGMAGICAL(sv)) /* it could be $1, for example */ - sv = newSVsv(sv); /* GMAGIG will be done */ - if (SvPOK(sv)) { - RETVAL = SvUTF8(sv) ? TRUE : FALSE; - if (RETVAL && - check && - !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) - RETVAL = FALSE; - } else { +CODE: +{ + if (SvGMAGICAL(sv)) /* it could be $1, for example */ + sv = newSVsv(sv); /* GMAGIG will be done */ + if (SvPOK(sv)) { + RETVAL = SvUTF8(sv) ? TRUE : FALSE; + if (RETVAL && + check && + !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) RETVAL = FALSE; - } - if (sv != ST(0)) - SvREFCNT_dec(sv); /* it was a temp copy */ - } - OUTPUT: - RETVAL + } else { + RETVAL = FALSE; + } + if (sv != ST(0)) + SvREFCNT_dec(sv); /* it was a temp copy */ +} +OUTPUT: + RETVAL SV * _utf8_on(sv) - SV * sv - CODE: - { - if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - SvUTF8_on(sv); - } else { - RETVAL = &PL_sv_undef; - } - } - OUTPUT: - RETVAL +SV * sv +CODE: +{ + if (SvPOK(sv)) { + SV *rsv = newSViv(SvUTF8(sv)); + RETVAL = rsv; + SvUTF8_on(sv); + } else { + RETVAL = &PL_sv_undef; + } +} +OUTPUT: + RETVAL SV * _utf8_off(sv) - SV * sv - CODE: - { - if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - SvUTF8_off(sv); - } else { - RETVAL = &PL_sv_undef; - } - } - OUTPUT: - RETVAL +SV * sv +CODE: +{ + if (SvPOK(sv)) { + SV *rsv = newSViv(SvUTF8(sv)); + RETVAL = rsv; + SvUTF8_off(sv); + } else { + RETVAL = &PL_sv_undef; + } +} +OUTPUT: + RETVAL + +PROTOTYPES: DISABLE + + +int +DIE_ON_ERR() +CODE: + RETVAL = ENCODE_DIE_ON_ERR; +OUTPUT: + RETVAL + +int +WARN_ON_ERR() +CODE: + RETVAL = ENCODE_WARN_ON_ERR; +OUTPUT: + RETVAL + +int +LEAVE_SRC() +CODE: + RETVAL = ENCODE_LEAVE_SRC; +OUTPUT: + RETVAL + +int +RETURN_ON_ERR() +CODE: + RETVAL = ENCODE_RETURN_ON_ERR; +OUTPUT: + RETVAL + +int +PERLQQ() +CODE: + RETVAL = ENCODE_PERLQQ; +OUTPUT: + RETVAL + +int +FB_DEFAULT() +CODE: + RETVAL = ENCODE_FB_DEFAULT; +OUTPUT: + RETVAL + +int +FB_CROAK() +CODE: + RETVAL = ENCODE_FB_CROAK; +OUTPUT: + RETVAL + +int +FB_QUIET() +CODE: + RETVAL = ENCODE_FB_QUIET; +OUTPUT: + RETVAL + +int +FB_WARN() +CODE: + RETVAL = ENCODE_FB_WARN; +OUTPUT: + RETVAL + +int +FB_PERLQQ() +CODE: + RETVAL = ENCODE_FB_PERLQQ; +OUTPUT: + RETVAL BOOT: { -#if defined(USE_PERLIO) && !defined(USE_SFIO) -/* PerlIO_define_layer(aTHX_ &PerlIO_encode); */ -#endif +#include "def_t.h" #include "def_t.exh" } diff --git a/ext/Encode/Encode/Makefile_PL.e2x b/ext/Encode/Encode/Makefile_PL.e2x index 8571033..c55b6e3 100644 --- a/ext/Encode/Encode/Makefile_PL.e2x +++ b/ext/Encode/Encode/Makefile_PL.e2x @@ -14,9 +14,30 @@ my %tables = ( ); #### DO NOT EDIT BEYOND THIS POINT! -my $enc2xs = '$_Enc2xs_'; +require File::Spec; +my ($enc2xs, $encode_h) = (); +PATHLOOP: +for my $d (split /:/, $ENV{PATH}){ + for my $f (qw/enc2xs enc2xs5.7.3/){ + my $path = File::Spec->catfile($d, $f); + -x $path and $enc2xs = $path and last PATHLOOP; + } +} +$enc2xs or die "enc2xs not found!"; +print "enc2xs is $enc2xs\n"; +my %encode_h = (); +for my $d (@INC){ + my $dir = File::Spec->catfile($d, "Encode"); + my $file = File::Spec->catfile($dir, "encode.h"); + -f $file and $encode_h{$dir} = -M $file; +} +%encode_h or die "encode.h not found!"; +# find the latest one +($encode_h) = sort {$encode_h{$b} <=> $encode_h{$a}} keys %encode_h; +print "encode.h is at $encode_h\n"; + WriteMakefile( - INC => "-I$_E2X_", + INC => "-I$encode_h", #### END_OF_HEADER -- DO NOT EDIT THIS LINE BY HAND! #### NAME => 'Encode::'.$name, VERSION_FROM => "$name.pm", @@ -27,6 +48,9 @@ WriteMakefile( DIST_DEFAULT => 'all tardist', }, MAN3PODS => {}, + PREREQ_PM => { + 'Encode' => "1.41", + }, # OS 390 winges about line numbers > 64K ??? XSOPT => '-nolinenumbers', ); diff --git a/ext/Encode/Encode/encode.h b/ext/Encode/Encode/encode.h index f19cdc2..04df7f9 100644 --- a/ext/Encode/Encode/encode.h +++ b/ext/Encode/Encode/encode.h @@ -2,52 +2,57 @@ #define ENCODE_H #ifndef U8 -/* A tad devious this: - perl normally has a #define for U8 - if that isn't present - then we typedef it - leaving it #ifndef so we can do data parts without +/* + A tad devious this: + perl normally has a #define for U8 - if that isn't present then we + typedef it - leaving it #ifndef so we can do data parts without getting extern references to the code parts - */ +*/ typedef unsigned char U8; #endif typedef struct encpage_s encpage_t; - struct encpage_s { - /* fields ordered to pack nicely on 32-bit machines */ - const U8 *seq; /* Packed output sequences we generate if we match */ - encpage_t *next; /* Page to go to if we match */ - U8 min; /* Min value of octet to match this entry */ - U8 max; /* Max value of octet to match this entry */ - U8 dlen; /* destination length - size of entries in seq */ - U8 slen; /* source length - number of source octets needed */ + /* fields ordered to pack nicely on 32-bit machines */ + const U8 *seq; /* Packed output sequences we generate + if we match */ + encpage_t *next; /* Page to go to if we match */ + U8 min; /* Min value of octet to match this entry */ + U8 max; /* Max value of octet to match this entry */ + U8 dlen; /* destination length - + size of entries in seq */ + U8 slen; /* source length - + number of source octets needed */ }; /* - At any point in a translation there is a page pointer which points at an array - of the above structures. - - Basic operation : - get octet from source stream. - if (octet >= min && octet < max) { - if slen is 0 then we cannot represent this character. - if we have less than slen octets (including this one) then we have a partial character. - otherwise - copy dlen octets from seq + dlen*(octet-min) to output - (dlen may be zero if we don't know yet.) - load page pointer with next to continue. - (is slen is one this is end of a character) - get next octet. - } - else { - increment the page pointer to look at next slot in the array - } - - arrays SHALL be constructed so there is an entry which matches ..0xFF at the end, - and either maps it or indicates no representation. - - if MSB of slen is set then mapping is an approximate "FALLBACK" entry. + At any point in a translation there is a page pointer which points + at an array of the above structures. + + Basic operation : + get octet from source stream. + if (octet >= min && octet < max) { + if slen is 0 then we cannot represent this character. + if we have less than slen octets (including this one) then + we have a partial character. + otherwise + copy dlen octets from seq + dlen*(octet-min) to output + (dlen may be zero if we don't know yet.) + load page pointer with next to continue. + (is slen is one this is end of a character) + get next octet. + } + else { + increment the page pointer to look at next slot in the array + } + + arrays SHALL be constructed so there is an entry which matches + ..0xFF at the end, and either maps it or indicates no + representation. + + if MSB of slen is set then mapping is an approximate "FALLBACK" entry. */ @@ -55,13 +60,16 @@ struct encpage_s typedef struct encode_s encode_t; struct encode_s { - encpage_t *t_utf8; /* Starting table for translation from the encoding to UTF-8 form */ - encpage_t *f_utf8; /* Starting table for translation from UTF-8 to the encoding */ - const U8 *rep; /* Replacement character in this encoding e.g. "?" */ - int replen; /* Number of octets to represent replacement character */ - U8 min_el; /* Minimum octets to represent a character */ - U8 max_el; /* Maximum octets to represent a character */ - const char *name[2]; /* name(s) of this encoding */ + encpage_t *t_utf8; /* Starting table for translation from + the encoding to UTF-8 form */ + encpage_t *f_utf8; /* Starting table for translation + from UTF-8 to the encoding */ + const U8 *rep; /* Replacement character in this encoding + e.g. "?" */ + int replen; /* Number of octets in rep */ + U8 min_el; /* Minimum octets to represent a character */ + U8 max_el; /* Maximum octets to represent a character */ + const char *name[2]; /* name(s) of this encoding */ }; #ifdef U8 @@ -72,10 +80,25 @@ extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, extern void Encode_DefineEncoding(encode_t *enc); -#endif +#endif /* U8 */ #define ENCODE_NOSPACE 1 #define ENCODE_PARTIAL 2 #define ENCODE_NOREP 3 #define ENCODE_FALLBACK 4 -#endif + +#define FBCHAR_UTF8 "\xEF\xBF\xBD" + +#define ENCODE_DIE_ON_ERR 0x0001 /* croaks immediately */ +#define ENCODE_WARN_ON_ERR 0x0002 /* warn on error; may proceed */ +#define ENCODE_RETURN_ON_ERR 0x0004 /* immediately returns on NOREP */ +#define ENCODE_LEAVE_SRC 0x0008 /* $src updated unless set */ +#define ENCODE_PERLQQ 0x0100 /* perlqq fallback string */ + +#define ENCODE_FB_DEFAULT 0x0000 +#define ENCODE_FB_CROAK 0x0001 +#define ENCODE_FB_QUIET ENCODE_RETURN_ON_ERR +#define ENCODE_FB_WARN (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR) +#define ENCODE_FB_PERLQQ ENCODE_PERLQQ + +#endif /* ENCODE_H */ diff --git a/ext/Encode/JP/JP.pm b/ext/Encode/JP/JP.pm index 1a4d42e..27fca7d 100644 --- a/ext/Encode/JP/JP.pm +++ b/ext/Encode/JP/JP.pm @@ -5,7 +5,7 @@ BEGIN { } } use Encode; -our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use XSLoader; XSLoader::load(__PACKAGE__,$VERSION); @@ -89,6 +89,6 @@ to find why it is implemented that way. =head1 SEE ALSO -L,L +L =cut diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index 499998b..b203d86 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -26,6 +26,9 @@ Symbol/Makefile.PL Encode extension Symbol/Symbol.pm Encode extension TW/Makefile.PL Encode extension TW/TW.pm Encode extension +Unicode/Makefile.PL Encode extension +Unicode/Unicode.pm Encode extension +Unicode/Unicode.xs Encode extension bin/enc2xs Encode module generator bin/piconv iconv by perl bin/ucm2table Table Generator for testing @@ -42,8 +45,8 @@ lib/Encode/Encoding.pm Encode extension lib/Encode/JP/H2Z.pm Encode extension lib/Encode/JP/JIS7.pm Encode extension lib/Encode/KR/2022_KR.pm Encode extension -lib/Encode/Supported.pod Documents supported encodings -lib/Encode/Unicode.pm Encode extension +lib/Encode/PerlIO.pod Documents for Encode & PerlIO +lib/Encode/Supported.pod Documents for supported encodings t/Aliases.t Encode extension test t/CN.t Encode extension test t/Encode.t Encode extension test @@ -54,9 +57,12 @@ t/TW.t Encode extension test t/Unicode.t Encode extension test t/bogus.ucm Sample data for ucmlint t/encoding.t encoding extension test +t/fallback.t Encode extension test t/gb2312.euc test data t/gb2312.ref test data t/grow.t Encode extension test +t/jisx0201.euc test data +t/jisx0201.ref test data t/jisx0208.euc test data t/jisx0208.ref test data t/jisx0212.euc test data @@ -64,6 +70,7 @@ t/jisx0212.ref test data t/jperl.t encoding extension test t/ksc5601.euc test data t/ksc5601.ref test data +t/perlio.t t/unibench.pl Unicode benchmark ucm/8859-1.ucm Unicode Character Map ucm/8859-10.ucm Unicode Character Map diff --git a/ext/Encode/TW/TW.pm b/ext/Encode/TW/TW.pm index 294144a..21dab51 100644 --- a/ext/Encode/TW/TW.pm +++ b/ext/Encode/TW/TW.pm @@ -4,7 +4,7 @@ BEGIN { die "Encode::TW not supported on EBCDIC\n"; } } -our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use XSLoader; @@ -43,14 +43,14 @@ To find how to use this module in detail, see L. =head1 NOTES -Due to size concerns, C (Extended Unix Character) and C +Due to size concerns, C (Extended Unix Character), C +(Chinese Character Code for Information Interchange) and C (CMEX's Big5+) are distributed separately on CPAN, under the name L. That module also contains extra China-based encodings. =head1 BUGS -The C encoding files are not complete (only the first two planes, -C<11643-1> and C<11643-2>, exist in the distribution). For common CNS11643 +The C encoding files are not complete. For common C manipulation, please use C in L, which contains plane 1-7. @@ -63,6 +63,6 @@ to find why it is implemented that way. =head1 SEE ALSO -L,L +L =cut diff --git a/ext/Encode/Unicode/Makefile.PL b/ext/Encode/Unicode/Makefile.PL new file mode 100644 index 0000000..d2dfdff --- /dev/null +++ b/ext/Encode/Unicode/Makefile.PL @@ -0,0 +1,11 @@ +use 5.7.2; +use strict; +use ExtUtils::MakeMaker; + +WriteMakefile( + INC => "-I../Encode", + NAME => 'Encode::Unicode', + VERSION_FROM => "Unicode.pm", + MAN3PODS => {}, + ); + diff --git a/ext/Encode/lib/Encode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm similarity index 99% rename from ext/Encode/lib/Encode/Unicode.pm rename to ext/Encode/Unicode/Unicode.pm index 55ae975..257989a 100644 --- a/ext/Encode/lib/Encode/Unicode.pm +++ b/ext/Encode/Unicode/Unicode.pm @@ -3,41 +3,10 @@ package Encode::Unicode; use strict; use warnings; -our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; -# -# Aux. subs & constants -# - -sub FBCHAR(){ 0xFFFd } -sub BOM_BE(){ 0xFeFF } -sub BOM16LE(){ 0xFFFe } -sub BOM32LE(){ 0xFFFe0000 } - -sub valid_ucs2($){ - return - (0 <= $_[0] && $_[0] < 0xD800) - || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF); -} - -sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } -sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 } -sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF } - -sub ensurrogate($){ - use integer; # we have divisions - my $uni = shift; - my $hi = ($uni - 0x10000) / 0x400 + 0xD800; - my $lo = ($uni - 0x10000) % 0x400 + 0xDC00; - return ($hi, $lo); -} - -sub desurrogate($$){ - my ($hi, $lo) = @_; - return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00); -} - -sub Mask { {2 => 0xffff, 4 => 0xffffffff} } +use XSLoader; +XSLoader::load(__PACKAGE__,$VERSION); # # Object Generator 8 transcoders all at once! @@ -105,6 +74,40 @@ sub set_transcoder{ set_transcoder("xs"); # +# Aux. subs & constants +# + +sub FBCHAR(){ 0xFFFd } +sub BOM_BE(){ 0xFeFF } +sub BOM16LE(){ 0xFFFe } +sub BOM32LE(){ 0xFFFe0000 } + +sub valid_ucs2($){ + return + (0 <= $_[0] && $_[0] < 0xD800) + || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF); +} + +sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } +sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 } +sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF } + +sub ensurrogate($){ + use integer; # we have divisions + my $uni = shift; + my $hi = ($uni - 0x10000) / 0x400 + 0xD800; + my $lo = ($uni - 0x10000) % 0x400 + 0xDC00; + return ($hi, $lo); +} + +sub desurrogate($$){ + my ($hi, $lo) = @_; + return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00); +} + +sub Mask { {2 => 0xffff, 4 => 0xffffffff} } + +# # *_modern are much faster but guzzle more memory # diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs new file mode 100644 index 0000000..4e21de9 --- /dev/null +++ b/ext/Encode/Unicode/Unicode.xs @@ -0,0 +1,245 @@ +/* + $Id: Unicode.xs,v 1.2 2002/04/19 05:36:43 dankogai Exp $ + */ + +#define PERL_NO_GET_CONTEXT +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +#define FBCHAR 0xFFFd +#define BOM_BE 0xFeFF +#define BOM16LE 0xFFFe +#define BOM32LE 0xFFFe0000 +#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) +#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) +#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) +#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) + +static UV +enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) +{ + U8 *s = *sp; + UV v = 0; + if (s+size > e) { + croak("Partial character %c",(char) endian); + } + switch(endian) { + case 'N': + v = *s++; + v = (v << 8) | *s++; + case 'n': + v = (v << 8) | *s++; + v = (v << 8) | *s++; + break; + case 'V': + case 'v': + v |= *s++; + v |= (*s++ << 8); + if (endian == 'v') + break; + v |= (*s++ << 16); + v |= (*s++ << 24); + break; + default: + croak("Unknown endian %c",(char) endian); + break; + } + *sp = s; + return v; +} + +void +enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) +{ + U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size); + switch(endian) { + case 'v': + case 'V': + d += SvCUR(result); + SvCUR_set(result,SvCUR(result)+size); + while (size--) { + *d++ = value & 0xFF; + value >>= 8; + } + break; + case 'n': + case 'N': + SvCUR_set(result,SvCUR(result)+size); + d += SvCUR(result); + while (size--) { + *--d = value & 0xFF; + value >>= 8; + } + break; + default: + croak("Unknown endian %c",(char) endian); + break; + } +} + +MODULE = Encode::Unicode PACKAGE = Encode::Unicode + +void +decode_xs(obj, str, chk = &PL_sv_undef) +SV * obj +SV * str +SV * chk +CODE: +{ + int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); + U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); + int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); + SV *result = newSVpvn("",0); + STRLEN ulen; + U8 *s = (U8 *)SvPVbyte(str,ulen); + U8 *e = (U8 *)SvEND(str); + ST(0) = sv_2mortal(result); + SvUTF8_on(result); + + if (!endian && s+size <= e) { + UV bom; + endian = (size == 4) ? 'N' : 'n'; + bom = enc_unpack(aTHX_ &s,e,size,endian); + if (bom != BOM_BE) { + if (bom == BOM16LE) { + endian = 'v'; + } + else if (bom == BOM32LE) { + endian = 'V'; + } + else { + croak("%s:Unregognised BOM %"UVxf, + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), + bom); + } + } +#if 0 + /* Update endian for this sequence */ + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); +#endif + } + while (s < e && s+size <= e) { + UV ord = enc_unpack(aTHX_ &s,e,size,endian); + U8 *d; + if (size != 4 && invalid_ucs2(ord)) { + if (ucs2) { + if (SvTRUE(chk)) { + croak("%s:no surrogates allowed %"UVxf, + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), + ord); + } + if (s+size <= e) { + /* skip the next one as well */ + enc_unpack(aTHX_ &s,e,size,endian); + } + ord = FBCHAR; + } + else { + UV lo; + if (!isHiSurrogate(ord)) { + croak("%s:Malformed HI surrogate %"UVxf, + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), + ord); + } + if (s+size > e) { + /* Partial character */ + s -= size; /* back up to 1st half */ + break; /* And exit loop */ + } + lo = enc_unpack(aTHX_ &s,e,size,endian); + if (!isLoSurrogate(lo)){ + croak("%s:Malformed LO surrogate %"UVxf, + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), + ord); + } + ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); + } + } + d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); + d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); + SvCUR_set(result,d - (U8 *)SvPVX(result)); + } + if (SvTRUE(chk)) { + if (s < e) { + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); + Move(s,SvPVX(str),e-s,U8); + SvCUR_set(str,(e-s)); + } + else { + SvCUR_set(str,0); + } + *SvEND(str) = '\0'; + } + XSRETURN(1); +} + +void +encode_xs(obj, utf8, chk = &PL_sv_undef) + SV * obj +SV * utf8 +SV * chk +CODE: +{ + int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); + U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); + int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); + SV *result = newSVpvn("",0); + STRLEN ulen; + U8 *s = (U8 *)SvPVutf8(utf8,ulen); + U8 *e = (U8 *)SvEND(utf8); + ST(0) = sv_2mortal(result); + if (!endian) { + endian = (size == 4) ? 'N' : 'n'; + enc_pack(aTHX_ result,size,endian,BOM_BE); +#if 0 + /* Update endian for this sequence */ + hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); +#endif + } + while (s < e && s+UTF8SKIP(s) <= e) { + STRLEN len; + UV ord = utf8n_to_uvuni(s, e-s, &len, 0); + s += len; + if (size != 4 && invalid_ucs2(ord)) { + if (!issurrogate(ord)){ + if (ucs2) { + if (SvTRUE(chk)) { + croak("%s:code point \"\\x{"UVxf"}\" too high", + SvPV_nolen( + *hv_fetch((HV *)SvRV(obj),"Name",4,0)) + ,ord); + } + enc_pack(aTHX_ result,size,endian,FBCHAR); + }else{ + UV hi = ((ord - 0x10000) >> 10) + 0xD800; + UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; + enc_pack(aTHX_ result,size,endian,hi); + enc_pack(aTHX_ result,size,endian,lo); + } + } + else { + /* not supposed to happen */ + enc_pack(aTHX_ result,size,endian,FBCHAR); + } + } + else { + enc_pack(aTHX_ result,size,endian,ord); + } + } + if (SvTRUE(chk)) { + if (s < e) { + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); + Move(s,SvPVX(utf8),e-s,U8); + SvCUR_set(utf8,(e-s)); + } + else { + SvCUR_set(utf8,0); + } + *SvEND(utf8) = '\0'; + } + XSRETURN(1); +} + diff --git a/ext/Encode/bin/enc2xs b/ext/Encode/bin/enc2xs index aa99f12..9fb57bc 100644 --- a/ext/Encode/bin/enc2xs +++ b/ext/Encode/bin/enc2xs @@ -8,7 +8,7 @@ BEGIN { use strict; use Getopt::Std; my @orig_ARGV = @ARGV; -our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # These may get re-ordered. # RAW is a do_now as inserted by &enter @@ -261,7 +261,7 @@ if ($doC) #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); my $replen = 0; $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g); - my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8*)"$rep"),$replen,$min_el,$max_el); + my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el); my $sym = "${enc}_encoding"; $sym =~ s/\W+/_/g; print C "encode_t $sym = \n"; diff --git a/ext/Encode/bin/piconv b/ext/Encode/bin/piconv index 3880dea..050006e 100644 --- a/ext/Encode/bin/piconv +++ b/ext/Encode/bin/piconv @@ -1,5 +1,5 @@ #!./perl -# $Id: piconv,v 1.22 2002/04/16 23:35:00 dankogai Exp $ +# $Id: piconv,v 1.23 2002/04/19 05:36:43 dankogai Exp $ # use 5.7.3; use strict; @@ -19,7 +19,7 @@ my $to = $Opt{t} || $locale or help("to_encoding unspecified"); $Opt{s} and Encode::from_to($Opt{s}, $from, $to) and print $Opt{s} and exit; my $scheme = exists $Scheme{$Opt{S}} ? $Opt{S} : 'from_to'; $Opt{C} ||= $Opt{c}; -$Opt{p} and $Opt{C} = -1; +$Opt{p} and $Opt{C} = Encode::FB_PERLQQ; if ($Opt{D}){ my $cfrom = Encode->getEncoding($from)->name; diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index 618535f..f187324 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -1,5 +1,5 @@ package encoding; -our $VERSION = do { my @r = (q$Revision: 1.28 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode; use strict; @@ -11,15 +11,8 @@ BEGIN { } } -our $HAS_PERLIO_ENCODING; - -eval { require PerlIO::encoding; }; -if ($@){ - $HAS_PERLIO_ENCODING = 0; -}else{ - $HAS_PERLIO_ENCODING = 1; - binmode(STDIN); -} +our $HAS_PERLIO = exists $INC{"PerlIO/encoding.pm"}; +$HAS_PERLIO or binmode(STDIN); sub import { my $class = shift; @@ -34,7 +27,7 @@ sub import { } unless ($arg{Filter}){ ${^ENCODING} = $enc; # this is all you need, actually. - $HAS_PERLIO_ENCODING or return 1; + $HAS_PERLIO or return 1; for my $h (qw(STDIN STDOUT)){ if ($arg{$h}){ unless (defined find_encoding($arg{$h})) { @@ -85,6 +78,7 @@ sub unimport{ 1; __END__ + =pod =head1 NAME diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm index 8cc40ca..18d8b16 100644 --- a/ext/Encode/lib/Encode/JP/JIS7.pm +++ b/ext/Encode/lib/Encode/JP/JIS7.pm @@ -1,7 +1,7 @@ package Encode::JP::JIS7; use strict; -our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; require Encode; for my $name ('7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1'){ @@ -21,6 +21,8 @@ sub new_sequence { $_[0] }; use Encode::CJKConstants qw(:all); +our $DEBUG = 0; + # # decode is identical for all 2022 variants # @@ -28,8 +30,11 @@ use Encode::CJKConstants qw(:all); sub decode { my ($obj,$str,$chk) = @_; - jis_euc(\$str); - return Encode::decode('euc-jp', $str, $chk); + my $residue = jis_euc(\$str); + # This is for PerlIO + $_[1] = $residue if $chk; + # use perlqq fallback for euc-jp -> utf8 + return Encode::decode('euc-jp', $str, 0); } # @@ -39,12 +44,14 @@ sub decode sub encode { require Encode::JP::H2Z; - my ($obj,$str,$chk) = @_; + my ($obj, $utf8, $chk) = @_; + # empty the input string in the stack so perlio is ok + $_[1] = '' if $chk; my ($h2z, $jis0212) = @$obj{qw(h2z jis0212)}; - my $result = Encode::encode('euc-jp', $str, $chk); - $h2z and &Encode::JP::H2Z::h2z(\$result); - euc_jis(\$result, $jis0212); - return $result; + my $octet = Encode::encode('euc-jp', $utf8, 0) ; + $h2z and &Encode::JP::H2Z::h2z(\$octet); + euc_jis(\$octet, $jis0212); + return $octet; } @@ -57,19 +64,20 @@ sub jis_euc { ([^\e]*) ) { - my ($esc, $str) = ($1, $2); + my ($esc, $chunk) = ($1, $2); if ($esc !~ /$RE{ISO_ASC}/o) { - $str =~ tr/\x21-\x7e/\xa1-\xfe/; + $chunk =~ tr/\x21-\x7e/\xa1-\xfe/; if ($esc =~ /$RE{JIS_KANA}/o) { - $str =~ s/([\xa1-\xdf])/\x8e$1/og; + $chunk =~ s/([\xa1-\xdf])/\x8e$1/og; } elsif ($esc =~ /$RE{JIS_0212}/o) { - $str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; + $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; } } - $str; + $chunk; }geox; - $$r_str; + my ($residue) = ($$r_str =~ s/(\e.*)$//so); + return $residue; } sub euc_jis{ @@ -78,18 +86,18 @@ sub euc_jis{ $$r_str =~ s{ ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) }{ - my $str = $1; + my $chunk = $1; my $esc = - ( $str =~ tr/\x8E//d ) ? $ESC{KANA} : - ( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} : + ( $chunk =~ tr/\x8E//d ) ? $ESC{KANA} : + ( $chunk =~ tr/\x8F//d ) ? $ESC{JIS_0212} : $ESC{JIS_0208}; if ($esc eq $ESC{JIS_0212} && !$jis0212){ # fallback to '?' - $str =~ tr/\xA1-\xFE/\x3F/; + $chunk =~ tr/\xA1-\xFE/\x3F/; }else{ - $str =~ tr/\xA1-\xFE/\x21-\x7E/; + $chunk =~ tr/\xA1-\xFE/\x21-\x7E/; } - $esc . $str . $ESC{ASC}; + $esc . $chunk . $ESC{ASC}; }geox; $$r_str =~ s/\Q$ESC{ASC}\E diff --git a/ext/Encode/lib/Encode/PerlIO.pod b/ext/Encode/lib/Encode/PerlIO.pod new file mode 100644 index 0000000..c076b27 --- /dev/null +++ b/ext/Encode/lib/Encode/PerlIO.pod @@ -0,0 +1,169 @@ +=head1 NAME + +Encode::PerlIO -- a detailed document on Encode and PerlIO + +=head1 Overview + +It is very common to want to do encoding transformations when +reading or writing files, network connections, pipes etc. +If Perl is configured to use the new 'perlio' IO system then +C provides a "layer" (See L) which can transform +data as it is read or written. + +Here is how the blind poet would modernise the encoding: + + use Encode; + open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek'); + open(my $utf8,'>:utf8','iliad.utf8'); + my @epic = <$iliad>; + print $utf8 @epic; + close($utf8); + close($illiad); + +In addition the new IO system can also be configured to read/write +UTF-8 encoded characters (as noted above this is efficient): + + open(my $fh,'>:utf8','anything'); + print $fh "Any \x{0021} string \N{SMILEY FACE}\n"; + +Either of the above forms of "layer" specifications can be made the default +for a lexical scope with the C pragma. See L. + +Once a handle is open is layers can be altered using C. + +Without any such configuration, or if Perl itself is built using +system's own IO, then write operations assume that file handle accepts +only I and will C if a character larger than 255 is +written to the handle. When reading, each octet from the handle +becomes a byte-in-a-character. Note that this default is the same +behaviour as bytes-only languages (including Perl before v5.6) would +have, and is sufficient to handle native 8-bit encodings +e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling +other encodings and binary data. + +In other cases it is the programs responsibility to transform +characters into bytes using the API above before doing writes, and to +transform the bytes read from a handle into characters before doing +"character operations" (e.g. C, C, ...). + +You can also use PerlIO to convert larger amounts of data you don't +want to bring into memory. For example to convert between ISO-8859-1 +(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines): + + open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!; + open(G, ">:utf8", "data.utf") or die $!; + while () { print G } + + # Could also do "print G " but that would pull + # the whole file into memory just to write it out again. + +More examples: + + open(my $f, "<:encoding(cp1252)") + open(my $g, ">:encoding(iso-8859-2)") + open(my $h, ">:encoding(latin9)") # iso-8859-15 + +See also L for how to change the default encoding of the +data in your script. + +=head1 How does it work? + +Here is a crude diagram of how filehandle, PerlIO, and Encode +interact. + + filehandle <-> PerlIO PerlIO <-> scalar (read/printed) + \ / + Encode + +When PerlIO receives data from either direction, it fills in the buffer +(currently with 1024 bytes) and pass the buffer to Encode. Encode tries +to convert the valid part and pass it back to PerlIO, leaving invalid +parts (usually partial character) in buffer. PerlIO then appends more +data in buffer, call Encode, and so on until the data stream ends. + +To do so, PerlIO always calls (de|en)code methods with CHECK set to 1. +this ensures that the method stops at the right place when it +encounters partial character. The following is what happens when +PerlIO and Encode tries to encode (from utf8) more than 1024 bytes +long and the buffer boundary happens to be between a character. + + A B C .... ~ \x{3000} .... + 41 42 43 .... 7E e3 80 80 .... + <- buffer ---------------> + << encoded >>>>>>>>>> + <- next buffer ------ + +Encode converts from the beginning to \x7E, leaving \xe3 in the buffer +because it is invalid (partial character). + +Unfortunately, this scheme does not work well with escape-based +encoding such as ISO-2022-JP. Let's see what happens in that case +in the next chapter. + +=head1 BUGS + +Now let's see what happens when you try to decode form ISO-2022-JP and +the buffer cuts in the middle of a character + + JIS208-ESC \x{5f3e} + A B C .... ~ \e $ B |DAN | .... + 41 42 43 .... 7E 1b 24 41 43 46 .... + <- buffer ---------------------------> + << encoded >>>>>>>>>>>>>>>>>>>>>>> + +As you see, the next buffer begins with \x43. But \x43 is 'C' in +ASCII, which is wrong in this case because we are now in JISX 0208 +area so it has to convert \x43\x46, not \x43. Unlike utf8 and EUC, +in escape-based encoding you can't tell if it a given octed is a whole +character or just part of it. + +There are actually several ways to solve this problem but none of +which is fast enough to be practical. From Encode's point of view +the easiest solution is for PerlIO to implement line buffer instead +of fixed-length buffer but that makes PerlIO really complicated. + +So for the time being, using escape-based encodings in ":encoding()" +layer of PerlIO does not work well. + +=head2 Workaround + +If you still insist, you can at least use ":encoding()" by making sure +the buffer never gets full. Here is an example. + + use FileHandle; + binmode(STDOUT, ":encoding(7bit-jis)"); + STDOUT->autoflush(1); # don't forget this! + for my $l (@lines){ # $l cannot be longer than 1023 bytes + print $l; + } + +=head2 How can you tell my encoding fully supports PerlIO ? + +As of this writing, Any encoding which class belongs to Encode::XS and +Encode::Unicode works. Encode module has C method so you +can use it before appling PerlIO encoding to the filehandle. Here is +an example; + + my $use_perlio = perlio_ok($enc); + my $layer = $use_perlio ? "<:raw" : "<:encoding($enc)"; + open my $fh, $layer, $file or die "$file : $!"; + while(<$fh>){ + $_ = decode($enc, $_) unless $use_perlio; + # .... + } + +=head1 SEE ALSO + +L, +L, +L, +L, +L, +L, +L, +L, +the Perl Unicode Mailing List Eperl-unicode@perl.orgE + + +=cut + diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index debb06e..d292a01 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -105,7 +105,7 @@ L for details. ---------------------------------------------------------------- N. America (ASCII) cp437 AdobeStandardEncoding cp863 (DOSCanadaF) - W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep + W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep hp-roman8 cp860 (DOSPortuguese) Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman @@ -158,6 +158,7 @@ For gory details, see L koi8-f koi8-r cp878 [RFC1489] koi8-u [RFC2319] + ---------------------------------------------------------------- =item gsm0338 - Hentai Latin 1 @@ -594,8 +595,8 @@ Microsoft's understanding of C. JIS has not endorsed the full Microsoft standard however. The official C includes only JIS X 0201 and JIS X 0208 -subsets, while Microsoft has always been meaning C to -encode a wider character repertoire. See C registration for +character sets, while Microsoft has always been meaning C +to encode a wider character repertoire. See C registration for C. As a historical predecessor Microsoft's variant diff --git a/ext/Encode/t/JP.t b/ext/Encode/t/JP.t index f904986..4192a7c 100644 --- a/ext/Encode/t/JP.t +++ b/ext/Encode/t/JP.t @@ -8,10 +8,6 @@ BEGIN { print "1..0 # Skip: Encode was not built\n"; exit 0; } - unless (find PerlIO::Layer 'perlio') { - print "1..0 # Skip: PerlIO was not built\n"; - exit 0; - } if (ord("A") == 193) { print "1..0 # Skip: EBCDIC\n"; exit 0; @@ -19,7 +15,7 @@ BEGIN { $| = 1; } use strict; -use Test::More tests => 27; +use Test::More tests => 37; #use Test::More qw(no_plan); use Encode; use File::Basename; @@ -29,103 +25,65 @@ require_ok "Encode::JP"; my ($src, $uni, $dst, $txt, $euc, $utf, $ref, $rnd); -ok(defined(my $enc = find_encoding('euc-jp'))); -ok($enc->isa('Encode::XS')); -is($enc->name,'euc-jp'); +ok(defined(my $enc = find_encoding('euc-jp')), 'find_encoding'); +ok($enc->isa('Encode::XS'), 'ISA'); +is($enc->name,'euc-jp', '$enc->name'); my $dir = dirname(__FILE__); -my @subcodings = qw(jisx0212 jisx0208); - -for my $subcoding (@subcodings){ - $euc = File::Spec->catfile($dir,"$subcoding.euc"); +for my $charset (qw(jisx0201 jisx0212 jisx0208)){ + $euc = File::Spec->catfile($dir,"$charset.euc"); $utf = File::Spec->catfile($dir,"$$.utf8"); - $ref = File::Spec->catfile($dir,"$subcoding.ref"); + $ref = File::Spec->catfile($dir,"$charset.ref"); $rnd = File::Spec->catfile($dir,"$$.rnd"); - print "# Basic decode test\n"; - open($src,"<",$euc) || die "Cannot open $euc:$!"; + + open($src,"<",$euc) or die "Cannot open $euc:$!"; binmode($src); - ok(defined($src) && fileno($src)); $txt = join('',<$src>); - open($dst,">:utf8",$utf) || die "Cannot open $utf:$!"; - binmode($dst); - ok(defined($dst) && fileno($dst)); - eval{ $uni = $enc->decode($txt,1) }; - $@ and print $@; - ok(defined($uni)); - is(length($txt),0); - print $dst $uni; - close($dst); close($src); - ok(compare($utf,$ref) == 0); -} - -print "# Basic encode test\n"; -open($src,"<:utf8",$ref) || die "Cannot open $ref:$!"; -binmode($src); -ok(defined($src) && fileno($src)); -$uni = join('',<$src>); -open($dst,">",$rnd) || die "Cannot open $rnd:$!"; -binmode($dst); -ok(defined($dst) && fileno($dst)); -$txt = $enc->encode($uni,1); -ok(defined($txt)); -is(length($uni),0); -print $dst $txt; -close($dst); -close($src); -ok(compare($euc,$rnd) == 0); - -is($enc->name,'euc-jp'); - -my $skip_perlio; -eval { require PerlIO::encoding; }; -if ($@){ - $skip_perlio = 1; -}else{ - $skip_perlio = 0; - binmode(STDIN); -} - -$skip_perlio ||= (@ARGV and shift eq 'perlio'); + + eval{ $uni = $enc->decode($txt, 1) }; + $@ and print $@; + ok(defined($uni), "decode $charset"); + is(length($txt),0, "decode $charset completely"); -SKIP: { - skip "PerlIO Encoding Needed", 6 if $skip_perlio; - print "# src :encoding test\n"; - open($src,":utf8",$utf) || die "Cannot open $utf:$!"; + open($dst,">:utf8",$utf) or die "Cannot open $utf:$!"; binmode($dst); - ok(defined($dst) || fileno($dst)); - my $out = select($dst); - while (<$src>){ print; } - close($dst); - close($src); + print $dst $uni; + close($dst); + is(compare($utf, $ref), 0, "$utf eq $ref"); + + open $src, "<:utf8", $ref or die "$ref : $!"; + $uni = join('', <$src>); + close $src; - TODO: - { - local $TODO = 'needs debugging on VMS' if $^O eq 'VMS'; - ok(compare($utf,$ref) == 0); - } - select($out); + for my $canon (qw(euc-jp shiftjis + 7bit-jis iso-2022-jp iso-2022-jp-1)){ + my $test = \&is; + if ($charset eq 'jisx0201'){ + $canon eq 'iso-2022-jp' and $test = \&isnt; + $canon eq 'iso-2022-jp-1' and $test = \&isnt; + }elsif($charset eq 'jisx0212'){ + $canon eq 'shiftjis' and $test = \&isnt; + $canon eq 'iso-2022-jp' and $test = \&isnt; + } + my $rt = ($test eq \&is) ? 'RT' : 'non-RT'; + $test->($uni, decode($canon, encode($canon, $uni)), + "$rt $charset $canon"); + + } - print "# dst :encoding test\n"; - open($src,"<:utf8",$ref) || die "Cannot open $ref:$!"; - binmode($src); - ok(defined($src) || fileno($src)); - open($dst,">encoding(euc-jp)",$rnd) || die "Cannot open $rnd:$!"; + eval{ $txt = $enc->encode($uni,1) }; + $@ and print $@; + ok(defined($txt), "encode $charset"); + is(length($uni), 0, "encode $charset completely"); + + open($dst,">", $rnd) or die "Cannot open $utf:$!"; binmode($dst); - ok(defined($dst) || fileno($dst)); - $out = select($dst); - while (<$src>) { print; } - close($dst); - close($src); - ok(compare($euc,$rnd) == 0); - select($out); + print $dst $txt; + close($dst); + is(compare($euc, $rnd), 0 => "$rnd eq $euc"); } -is($enc->name,'euc-jp'); - END { 1 while unlink($utf,$rnd); } diff --git a/ext/Encode/t/KR.t b/ext/Encode/t/KR.t index e42271b..0cf1908 100644 --- a/ext/Encode/t/KR.t +++ b/ext/Encode/t/KR.t @@ -19,7 +19,7 @@ BEGIN { $| = 1; } use strict; -use Test::More tests => 22; +use Test::More tests => 15; #use Test::More qw(no_plan); use Encode; use File::Basename; @@ -77,55 +77,6 @@ ok(compare($euc,$rnd) == 0); is($enc->name,'euc-kr'); -my $skip_perlio; -eval { require PerlIO::encoding; }; -if ($@){ - $skip_perlio = 1; -}else{ - $skip_perlio = 0; - binmode(STDIN); -} - -$skip_perlio ||= (@ARGV and shift eq 'perlio'); - -SKIP: { - skip "PerlIO Encoding Needed", 6 if $skip_perlio; - print "# src :encoding test\n"; - open($src,":utf8",$utf) || die "Cannot open $utf:$!"; - binmode($dst); - ok(defined($dst) || fileno($dst)); - my $out = select($dst); - while (<$src>) { print; } - close($dst); - close($src); - - TODO: - { - local $TODO = 'needs debugging on VMS' if $^O eq 'VMS'; - ok(compare($utf,$ref) == 0); - } - select($out); - - print "# dst :encoding test\n"; - open($src,"<:utf8",$ref) || die "Cannot open $ref:$!"; - binmode($src); - ok(defined($src) || fileno($src)); - open($dst,">encoding(euc-kr)",$rnd) || die "Cannot open $rnd:$!"; - binmode($dst); - ok(defined($dst) || fileno($dst)); - $out = select($dst); - while (<$src>) { print; } - close($dst); - close($src); - ok(compare($euc,$rnd) == 0); - select($out); -} - -is($enc->name,'euc-kr'); - END { 1 while unlink($utf,$rnd); } diff --git a/ext/Encode/t/fallback.t b/ext/Encode/t/fallback.t new file mode 100644 index 0000000..cf867be --- /dev/null +++ b/ext/Encode/t/fallback.t @@ -0,0 +1,77 @@ +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + $| = 1; +} + +use strict; +#use Test::More qw(no_plan); +use Test::More tests => 15; +use Encode q(:all); + + +my $original = ''; +my $nofallback = ''; +my ($fallenback, $quiet, $perlqq); +for my $i (0x20..0x7e){ + $original .= chr($i); +} +$fallenback = $quiet = $perlqq = $nofallback = $original; + +my $residue = ''; +for my $i (0x80..0xff){ + $original .= chr($i); + $residue .= chr($i); + $fallenback .= '?'; + $perlqq .= sprintf("\\x{%04x}", $i); +} +utf8::upgrade($original); +my $meth = find_encoding('ascii'); + +my $src = $original; +my $dst = $meth->encode($src, FB_DEFAULT); +is($dst, $fallenback, "FB_DEFAULT"); +is($src, $original, "FB_DEFAULT residue"); + +$src = $original; +eval{ $dst = $meth->encode($src, FB_CROAK) }; +like($@, qr/does not map to ascii/o, "FB_CROAK"); +is($src, $original, "FB_CROAK residue"); + +$src = $original; +eval{ $dst = $meth->encode($src, FB_CROAK) }; +like($@, qr/does not map to ascii/o, "FB_CROAK"); +is($src, $original, "FB_CROAK residue"); + + +$src = $nofallback; +eval{ $dst = $meth->encode($src, FB_CROAK) }; +is($@, '', "FB_CROAK on success"); +is($src, '', "FB_CROAK on success residue"); + +$src = $original; +$dst = $meth->encode($src, FB_QUIET); +is($dst, $quiet, "FB_QUIET"); +is($src, $residue, "FB_QUIET residue"); + +{ + my $message; + local $SIG{__WARN__} = sub { $message = $_[0] }; + $src = $original; + $dst = $meth->encode($src, FB_WARN); + is($dst, $quiet, "FB_WARN"); + is($src, $residue, "FB_WARN residue"); + like($message, qr/does not map to ascii/o, "FB_WARN message"); +} + +$src = $original; +$dst = $meth->encode($src, FB_PERLQQ); +is($dst, $perlqq, "FB_PERLQQ"); +is($src, '', "FB_PERLQQ residue"); diff --git a/ext/Encode/t/jisx0201.euc b/ext/Encode/t/jisx0201.euc new file mode 100644 index 0000000..55ed5fc --- /dev/null +++ b/ext/Encode/t/jisx0201.euc @@ -0,0 +1,6 @@ +0x0020: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > +0x0040: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ +0x0060: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ +0x8ea0: Ž¡ Ž¢ Ž£ Ž¤ Ž¥ Ž¦ Ž§ Ž¨ Ž© Žª Ž« Ž¬ Ž­ Ž® Ž¯ Ž° Ž± Ž² Ž³ Ž´ Žµ Ž¶ Ž· Ž¸ Ž¹ Žº Ž» Ž¼ Ž½ Ž¾ +0x8ec0: ŽÀ ŽÁ ŽÂ ŽÃ ŽÄ ŽÅ ŽÆ ŽÇ ŽÈ ŽÉ ŽÊ ŽË ŽÌ ŽÍ ŽÎ ŽÏ ŽÐ ŽÑ ŽÒ ŽÓ ŽÔ ŽÕ ŽÖ Ž× ŽØ ŽÙ ŽÚ ŽÛ ŽÜ ŽÝ ŽÞ +0x8ee0: diff --git a/ext/Encode/t/jisx0201.ref b/ext/Encode/t/jisx0201.ref new file mode 100644 index 0000000..3380453 --- /dev/null +++ b/ext/Encode/t/jisx0201.ref @@ -0,0 +1,6 @@ +0x0020: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > +0x0040: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ +0x0060: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ +0x8ea0: 。 ï½¢ ï½£ 、 ï½¥ ヲ ァ ィ ゥ ェ ォ ャ ï½­ ï½® ッ ï½° ï½± ï½² ï½³ ï½´ ï½µ カ ï½· ク ï½¹ コ ï½» ï½¼ ï½½ ï½¾ +0x8ec0: タ チ ツ テ ト ナ ニ ヌ ネ ノ ハ ヒ フ ヘ ホ マ ミ ム メ モ ヤ ユ ヨ ラ リ ル レ ロ ワ ン ゙ +0x8ee0: diff --git a/ext/Encode/t/jisx0208.euc b/ext/Encode/t/jisx0208.euc index 8178409..72d9263 100644 --- a/ext/Encode/t/jisx0208.euc +++ b/ext/Encode/t/jisx0208.euc @@ -1,9 +1,3 @@ -0x0020: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > -0x0040: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ -0x0060: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ -0x8ea0: Ž¡ Ž¢ Ž£ Ž¤ Ž¥ Ž¦ Ž§ Ž¨ Ž© Žª Ž« Ž¬ Ž­ Ž® Ž¯ Ž° Ž± Ž² Ž³ Ž´ Žµ Ž¶ Ž· Ž¸ Ž¹ Žº Ž» Ž¼ Ž½ Ž¾ -0x8ec0: ŽÀ ŽÁ ŽÂ ŽÃ ŽÄ ŽÅ ŽÆ ŽÇ ŽÈ ŽÉ ŽÊ ŽË ŽÌ ŽÍ ŽÎ ŽÏ ŽÐ ŽÑ ŽÒ ŽÓ ŽÔ ŽÕ ŽÖ Ž× ŽØ ŽÙ ŽÚ ŽÛ ŽÜ ŽÝ ŽÞ -0x8ee0: 0xa0a0: 0xa0c0: 0xa0e0: diff --git a/ext/Encode/t/jisx0208.ref b/ext/Encode/t/jisx0208.ref index 1401b89..733427b 100644 --- a/ext/Encode/t/jisx0208.ref +++ b/ext/Encode/t/jisx0208.ref @@ -1,9 +1,3 @@ -0x0020: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > -0x0040: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ -0x0060: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ -0x8ea0: 。 ï½¢ ï½£ 、 ï½¥ ヲ ァ ィ ゥ ェ ォ ャ ï½­ ï½® ッ ï½° ï½± ï½² ï½³ ï½´ ï½µ カ ï½· ク ï½¹ コ ï½» ï½¼ ï½½ ï½¾ -0x8ec0: タ チ ツ テ ト ナ ニ ヌ ネ ノ ハ ヒ フ ヘ ホ マ ミ ム メ モ ヤ ユ ヨ ラ リ ル レ ロ ワ ン ゙ -0x8ee0: 0xa0a0: 0xa0c0: 0xa0e0: diff --git a/ext/Encode/t/perlio.t b/ext/Encode/t/perlio.t new file mode 100644 index 0000000..74e3e7b --- /dev/null +++ b/ext/Encode/t/perlio.t @@ -0,0 +1,101 @@ +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + require Encode; + unless ($INC{"PerlIO/encoding.pm"} + and PerlIO::encoding->VERSION >= 0.02 + ){ + print "1..0 # Skip:: PerlIO::encoding 0.02 or better required\n"; + exit 0; + } + # warn "PerlIO::encoding->VERSION == ", PerlIO::encoding->VERSION, "\n"; + $| = 1; +} + +use strict; +use File::Basename; +use File::Spec; +use File::Compare; +use FileHandle; + +#use Test::More qw(no_plan); +use Test::More tests => 20; + +our $DEBUG = 0; + +{ + no warnings; + @ARGV and $DEBUG = shift; + require Encode::JP::JIS7; + $Encode::JP::JIS7::DEBUG = $DEBUG; +} + +Encode->import(":all"); + +my $dir = dirname(__FILE__); +my $ufile = File::Spec->catfile($dir,"jisx0208.ref"); +open my $fh, "<:utf8", $ufile or die "$ufile : $!"; +my @uline = <$fh>; +my $utext = join('' => @uline); +close $fh; + +for my $e (qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/){ + my $sfile = File::Spec->catfile($dir,"$$.sio"); + my $pfile = File::Spec->catfile($dir,"$$.pio"); + + # first create a file without perlio + open $fh, ">", $sfile or die "$sfile :$!"; + binmode $fh; + print $fh &encode($e, $utext, 0); + close $fh; + + # then create a file via perlio without autoflush + + TODO:{ + todo_skip "$e: !perlio_ok", 1 unless perlio_ok($e); + open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; + $fh->autoflush(0); + print $fh $utext; + close $fh; + ok(compare($sfile, $pfile) == 0 => ">:encoding($e)"); + } + + # this time print line by line. + # works even for ISO-2022! + open $fh, ">:encoding($e)", $pfile or die "$sfile : $!"; + $fh->autoflush(1); + for my $l (@uline) { + print $fh $l; + } + close $fh; + is(compare($sfile, $pfile), 0 => ">:encoding($e); line-by-line"); + + TODO:{ + todo_skip "$e: !perlio_ok", 2 unless perlio_ok($e); + open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; + $fh->autoflush(0); + my $dtext = join('' => <$fh>); + close $fh; + ok($utext eq $dtext, "<:encoding($e)"); + $dtext = ''; + open $fh, "<:encoding($e)", $pfile or die "$pfile : $!"; + while(defined(my $l = <$fh>)) { + $dtext .= $l; + } + close $fh; + ok($utext eq $dtext, "<:encoding($e); line-by-line"); + } + $DEBUG or unlink ($sfile, $pfile); +} +