ext/Encode/Symbol/Symbol.pm Encode extension
ext/Encode/TW/Makefile.PL Encode extension
ext/Encode/TW/TW.pm Encode extension
+ext/Encode/Unicode/Makefile.PL Encode extension
+ext/Encode/Unicode/Unicode.pm Encode extension
+ext/Encode/Unicode/Unicode.xs Encode extension
ext/Encode/bin/enc2xs Encode module generator
ext/Encode/bin/piconv iconv by perl
ext/Encode/bin/ucm2table Table Generator for testing
ext/Encode/lib/Encode/JP/H2Z.pm Encode extension
ext/Encode/lib/Encode/JP/JIS7.pm Encode extension
ext/Encode/lib/Encode/KR/2022_KR.pm Encode extension
-ext/Encode/lib/Encode/Supported.pod Documents supported encodings
-ext/Encode/lib/Encode/Unicode.pm Encode extension
+ext/Encode/lib/Encode/PerlIO.pod Documents for Encode & PerlIO
+ext/Encode/lib/Encode/Supported.pod Documents for supported encodings
ext/Encode/t/Aliases.t Encode extension test
ext/Encode/t/CN.t Encode extension test
ext/Encode/t/Encode.t Encode extension test
ext/Encode/t/Unicode.t Encode extension test
ext/Encode/t/bogus.ucm Sample data for ucmlint
ext/Encode/t/encoding.t encoding extension test
+ext/Encode/t/fallback.t Encode extension test
ext/Encode/t/gb2312.euc test data
ext/Encode/t/gb2312.ref test data
ext/Encode/t/grow.t Encode extension test
+ext/Encode/t/jisx0201.euc test data
+ext/Encode/t/jisx0201.ref test data
ext/Encode/t/jisx0208.euc test data
ext/Encode/t/jisx0208.ref test data
ext/Encode/t/jisx0212.euc test data
ext/Encode/t/jperl.t encoding extension test
ext/Encode/t/ksc5601.euc test data
ext/Encode/t/ksc5601.ref test data
+ext/Encode/t/perlio.t
ext/Encode/t/unibench.pl Unicode benchmark
ext/Encode/ucm/8859-1.ucm Unicode Character Map
ext/Encode/ucm/8859-10.ucm Unicode Character Map
die "Encode::CN not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
=head1 BUGS
When you see C<charset=gb2312> on mails and web pages, they really
-mean "euc-cn" encodings. To fix that, gb2312 is aliased to euc-cn. Use
-gb2312-raw when you really mean it.
+mean C<euc-cn> encodings. To fix that, C<gb2312> is aliased to C<euc-cn>.
+Use C<gb2312-raw> when you really mean it.
ASCII part (0x00-0x7f) is preserved for all encodings, even though it
conflicts with mappings by the Unicode Consortium. See
=head1 SEE ALSO
-L<Encode>,L<Encode::CJKguide>
+L<Encode>
=cut
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 1.42 2002/04/17 03:01:20 dankogai Exp dankogai $
+# $Id: Changes,v 1.50 2002/04/19 06:13:02 dankogai Exp $
#
+1.50 $Date: 2002/04/19 06:13:02 $
+! ! Encode.pm Encode.xs Encode/encoding.h
++ t/fallback.pm
+ New Fallback API imlemented and documented. See "perldoc Encode"
+ for details
+! lib/Encode/JP/JIS7.pm Encode.pm
++ lib/Encode/PerlIO.pod t/perlio.t
+ API compliance met. However, it still does not work unless perlio
+ implements line buffer. See BUGS section in perldoc Encode::PerlIO
+ As a sensible workaround, perlio_ok() added to Encode.
+! encoding.pm
+! lib/Encode/Supported.pod
+ Doc fixes from jhi
+ Message-Id: <20020418174647.J8466@alpha.hut.fi>
+! CN/CN.pm
+ Doc fixes from Autrijus
+ Message-Id: <20020418144131.GA10987@not.autrijus.org>
+! Encode.pm
+ perlqq mode documented
+! t/JP.t
++ t/jisx0201.euc t/jisx0201.ref
+! t/jisx0208.euc t/jisx0208.ref
+ t/JP.t tests more rigorously and with other encodings
+ t/jisx0201.* added to test JIS7 encodings. jisx0208 is now PURELY
+ in jis0208 (used to contain jisx0201 part).
+! Encode/Makefile_PL.e2x
+ The resulting Makefile.PL that "enc2xs -M" creates now auto-discovers
+ enc2xs and encode.h rather than hard-coded. This allows the resulting
+ module fully CPANizable.
+! encoding.pm t/JP.t t/KR.t
+ PerlIO detection simplified (checks %INC instead of eval{})
+! Encode.xs Encode/encode.h
++ Unicode/Makefile.PL Unicode/Unicode.pm Unicode/Unicode.xs
+- lib/Encode/Unicode.pm
+ (en|de)code_xs relocated to where it belongs. Source reindented
+ to my taste
+! bin/enc2xs
+ Additional (U8 *) cast added as suggested by jhi
+ Message-Id: <20020417165916.A28599@alpha.hut.fi>
-1.42 $Date: 2002/04/17 03:01:20 $
+1.42 Date: 2002/04/17
- lib/Encode/XS.pm
no-op module; Thought of adding a pod there but enc2xs has
one so gone.
Typo fixes and improvements by jhi
Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al.
-1.11 $Date: 2002/04/17 03:01:20 $
+1.11 $Date: 2002/04/19 06:13:02 $
+ t/encoding.t
+ t/jperl.t
! MANIFEST
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.42 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.50 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
require DynaLoader;
our @ISA = qw(Exporter DynaLoader);
# Public, encouraged API is exported by default
-our @EXPORT = qw (
- decode
- decode_utf8
- encode
- encode_utf8
- encodings
- find_encoding
+
+our @EXPORT = qw(
+ decode decode_utf8 encode encode_utf8
+ encodings find_encoding
);
+our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ);
+our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK);
+
our @EXPORT_OK =
- qw(
- _utf8_off
- _utf8_on
- define_encoding
- from_to
- is_16bit
- is_8bit
- is_utf8
- resolve_alias
- utf8_downgrade
- utf8_upgrade
- );
+ (
+ qw(
+ _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
+ is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
+ ),
+ @FB_FLAGS, @FB_CONSTS,
+ );
+
+our %EXPORT_TAGS =
+ (
+ all => [ @EXPORT, @EXPORT_OK ],
+ fallbacks => [ @FB_CONSTS ],
+ fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
+ );
+
bootstrap Encode ();
grep {!/^(?:Internal|Unicode)$/o} keys %Encoding;
}
+sub perlio_ok{
+ exists $INC{"PerlIO/encoding.pm"} or return 0;
+ my $stash = ref($_[0]);
+ $stash ||= ref(find_encoding($_[0]));
+ return ($stash eq "Encode::XS" || $stash eq "Encode::Unicode");
+}
+
sub define_encoding
{
my $obj = shift;
require Encode::Encoding;
-eval { require PerlIO::encoding };
+eval qq{ use PerlIO::encoding 0.02 };
+# warn $@ if $@;
1;
=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK])
-Convert B<in-place> the data between two encodings. How did the data
-in $string originally get to be in FROM_ENCODING? Either using
-encode() or through PerlIO: See L</"Encoding and IO">.
-For encoding names and aliases, see L</"Defining Aliases">.
-For CHECK see L</"Handling Malformed Data">.
-
+Convert B<in-place> the data between two encodings.
For example to convert ISO-8859-1 data to UTF-8:
from_to($data, "iso-8859-1", "utf-8");
See L<Encode::Alias> on details.
-=head1 Encoding and IO
+=head1 Encoding via PerlIO
-It is very common to want to do encoding transformations when
-reading or writing files, network connections, pipes etc.
-If Perl is configured to use the new 'perlio' IO system then
-C<Encode> provides a "layer" (See L<perliol>) which can transform
-data as it is read or written.
+If your perl supports I<PerlIO>, you can use PerlIO layer to directly
+decode and encode via filehandle. The following two examples are
+totally identical by functionality.
-Here is how the blind poet would modernise the encoding:
+ # via PerlIO
+ open my $in, "<:encoding(shiftjis)", $infile or die;
+ open my $out, ">:encoding(euc-jp)", $outfile or die;
+ while(<>){ print; }
- use Encode;
- open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
- open(my $utf8,'>:utf8','iliad.utf8');
- my @epic = <$iliad>;
- print $utf8 @epic;
- close($utf8);
- close($illiad);
+ # via from_to
+ open my $in, $infile or die;
+ open my $out, $outfile or die;
+ while(<>){
+ from_to($_, "shiftjis", "euc", 1);
+ }
-In addition the new IO system can also be configured to read/write
-UTF-8 encoded characters (as noted above this is efficient):
+Unfortunately, not all encodings are PerlIO-savvy. You can check if
+your encoding is supported by PerlIO by C<perlio_ok> method.
- open(my $fh,'>:utf8','anything');
- print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
+ Encode::perlio_ok("iso-20220jp"); # false
+ find_encoding("iso-2022-jp")->perlio_ok; # false
+ use Encode qw(perlio_ok); # exported upon request
+ perlio_ok("euc-jp") # true if PerlIO is enabled
-Either of the above forms of "layer" specifications can be made the default
-for a lexical scope with the C<use open ...> pragma. See L<open>.
+For gory details, see L<Encode::PerlIO>;
-Once a handle is open is layers can be altered using C<binmode>.
+=head1 Handling Malformed Data
-Without any such configuration, or if Perl itself is built using
-system's own IO, then write operations assume that file handle accepts
-only I<bytes> and will C<die> if a character larger than 255 is
-written to the handle. When reading, each octet from the handle
-becomes a byte-in-a-character. Note that this default is the same
-behaviour as bytes-only languages (including Perl before v5.6) would
-have, and is sufficient to handle native 8-bit encodings
-e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
-other encodings and binary data.
+=over 4
-In other cases it is the programs responsibility to transform
-characters into bytes using the API above before doing writes, and to
-transform the bytes read from a handle into characters before doing
-"character operations" (e.g. C<lc>, C</\W+/>, ...).
+THE I<CHECK> argument is used as follows. When you omit it, it is
+identical to I<CHECK> = 0.
-You can also use PerlIO to convert larger amounts of data you don't
-want to bring into memory. For example to convert between ISO-8859-1
-(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
+=item I<CHECK> = Encode::FB_DEFAULT ( == 0)
- open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
- open(G, ">:utf8", "data.utf") or die $!;
- while (<F>) { print G }
+If I<CHECK> is 0, (en|de)code will put I<substitution character> in
+place of the malformed character. for UCM-based encodings,
+E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
+data is supposed to be UTF-8, an optional lexical warning (category
+utf8) is given.
- # Could also do "print G <F>" but that would pull
- # the whole file into memory just to write it out again.
+=item I<CHECK> = Encode::DIE_ON_ERROR (== 1)
-More examples:
+If I<CHECK> is 1, methods will die immediately with an error
+message. so when I<CHECK> is set, you should trap the fatal error
+with eval{} unless you really want to let it die on error.
- open(my $f, "<:encoding(cp1252)")
- open(my $g, ">:encoding(iso-8859-2)")
- open(my $h, ">:encoding(latin9)") # iso-8859-15
+=item I<CHECK> = Encode::FB_QUIET
-See L<PerlIO> for more information.
+If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
+return proccessed part on error, with data passed via argument
+overwritten with unproccessed part. This is handy when have to
+repeatedly call because the source data is chopped in the middle for
+some reasons, such as fixed-width buffer. Here is a sample code that
+just does this.
-See also L<encoding> for how to change the default encoding of the
-data in your script.
+ my $data = '';
+ while(defined(read $fh, $buffer, 256)){
+ # buffer may end in partial character so we append
+ $data .= $buffer;
+ $utf8 .= decode($encoding, $data, ENCODE::FB_QUIET);
+ # $data now contains unprocessed partial character
+ }
-=head1 Handling Malformed Data
+=item I<CHECK> = Encode::FB_WARN
-If I<CHECK> is not set, (en|de)code will put I<substitution character> in
-place of the malformed character. for UCM-based encodings,
-E<lt>subcharE<gt> will be used. For Unicode, \xFFFD is used. If the
-data is supposed to be UTF-8, an optional lexical warning (category
-utf8) is given.
+This is the same as above, except it warns on error. Handy when you
+are debugging the mode above.
+
+=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
+
+For encodings that are implemented by Encode::XS, CHECK ==
+Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
+
+When you decode, '\xI<XX>' will be placed where I<XX> is the hex
+representation of the octet that could not be decoded to utf8. And
+when you encode, '\x{I<xxxx>}' will be placed where I<xxxx> is the
+Unicode ID of the character that cannot be found in the character
+repartoire of the encoding.
+
+=item The bitmask
+
+These modes are actually set via bitmask. here is how FB_XX are laid
+out. for FB_XX you can import via C<use Encode qw(:fallbacks)> for
+generic bitmask constants, you can import via
+ C<use Encode qw(:fallback_all)>.
+
+ FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
+ DIE_ON_ERR 0x0001 X
+ WARN_ON_ERR 0x0002 X
+ RETURN_ON_ERR 0x0004 X X
+ LEAVE_SRC 0x0008
+ PERLQQ 0x0100 X
-If I<CHECK> is true but not a code reference, dies with an error message.
+=head2 Unemplemented fallback schemes
In future you will be able to use a code reference to a callback
function for the value of I<CHECK> but its API is still undecided.
L<Encode::Encoding>,
L<Encode::Supported>,
-L<PerlIO>,
+L<Encode::PerlIO>,
L<encoding>,
L<perlebcdic>,
L<perlfunc/open>,
L<utf8>,
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
-head2 MAINTAINER
+=head1 MAINTAINER
This project was originated by Nick Ing-Simmons and later maintained
by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for full list
+/*
+ $Id: Encode.xs,v 1.29 2002/04/19 05:36:43 dankogai Exp $
+ */
+
#define PERL_NO_GET_CONTEXT
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#define U8 U8
#include "encode.h"
-#include "def_t.h"
-
-#define FBCHAR 0xFFFd
-#define FBCHAR_UTF8 "\xEF\xBF\xBD"
-#define BOM_BE 0xFeFF
-#define BOM16LE 0xFFFe
-#define BOM32LE 0xFFFe0000
-#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
-#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
-#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
-#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
-
-static UV
-enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
-{
- U8 *s = *sp;
- UV v = 0;
- if (s+size > e) {
- croak("Partial character %c",(char) endian);
- }
- switch(endian) {
- case 'N':
- v = *s++;
- v = (v << 8) | *s++;
- case 'n':
- v = (v << 8) | *s++;
- v = (v << 8) | *s++;
- break;
- case 'V':
- case 'v':
- v |= *s++;
- v |= (*s++ << 8);
- if (endian == 'v')
- break;
- v |= (*s++ << 16);
- v |= (*s++ << 24);
- break;
- default:
- croak("Unknown endian %c",(char) endian);
- break;
- }
- *sp = s;
- return v;
-}
-
-void
-enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
-{
- U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size);
- switch(endian) {
- case 'v':
- case 'V':
- d += SvCUR(result);
- SvCUR_set(result,SvCUR(result)+size);
- while (size--) {
- *d++ = value & 0xFF;
- value >>= 8;
- }
- break;
- case 'n':
- case 'N':
- SvCUR_set(result,SvCUR(result)+size);
- d += SvCUR(result);
- while (size--) {
- *--d = value & 0xFF;
- value >>= 8;
- }
- break;
- default:
- croak("Unknown endian %c",(char) endian);
- break;
- }
-}
-#define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
- t/encoding.t dumps core because of
- Perl_warner and PerlIO don't work well */
+/* set 1 or more to profile. t/encoding.t dumps core because of
+ Perl_warner and PerlIO don't work well */
+#define ENCODE_XS_PROFILE 0
-#define ENCODE_XS_USEFP 1 /* set 0 to disable floating point to calculate
- buffer size for encode_method().
- 1 is recommended. 2 restores NI-S original */
+/* set 0 to disable floating point to calculate buffer size for
+ encode_method(). 1 is recommended. 2 restores NI-S original */
+#define ENCODE_XS_USEFP 1
#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \
Perl_croak(aTHX_ "panic_unimplemented"); \
return (y)0; /* fool picky compilers */ \
}
+/**/
UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
- UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
+UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
-void
+ void
Encode_XSEncoding(pTHX_ encode_t * enc)
{
dSP;
void
call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
{
- /* Exists for breakpointing */
+ /* Exists for breakpointing */
}
+
static SV *
encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
- int check)
+ int check)
{
STRLEN slen;
U8 *s = (U8 *) SvPV(src, slen);
STRLEN sdone = 0;
/* We allocate slen+1.
- PerlIO dumps core if this value is smaller than this. */
+ PerlIO dumps core if this value is smaller than this. */
SV *dst = sv_2mortal(newSV(slen+1));
- if (slen) {
- U8 *d = (U8 *) SvPVX(dst);
- STRLEN dlen = SvLEN(dst)-1;
- int code;
- while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
- SvCUR_set(dst, dlen+ddone);
- SvPOK_only(dst);
-
-#if ENCODE_XS_PROFILE >= 3
- Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
-#endif
+ U8 *d = (U8 *)SvPVX(dst);
+ STRLEN dlen = SvLEN(dst)-1;
+ int code;
+
+ if (!slen){
+ SvCUR_set(dst, 0);
+ SvPOK_only(dst);
+ goto ENCODE_END;
+ }
+
+ while (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))
+ {
+ SvCUR_set(dst, dlen+ddone);
+ SvPOK_only(dst);
- if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
- break;
-
- switch (code) {
- case ENCODE_NOSPACE:
- {
- STRLEN more = 0; /* make sure you initialize! */
- STRLEN sleft;
- sdone += slen;
- ddone += dlen;
- sleft = tlen - sdone;
+ if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL){
+ break;
+ }
+ switch (code) {
+ case ENCODE_NOSPACE:
+ {
+ STRLEN more = 0; /* make sure you initialize! */
+ STRLEN sleft;
+ sdone += slen;
+ ddone += dlen;
+ sleft = tlen - sdone;
#if ENCODE_XS_PROFILE >= 2
- Perl_warn(aTHX_
- "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
- more, sdone, sleft, SvLEN(dst));
+ Perl_warn(aTHX_
+ "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
+ more, sdone, sleft, SvLEN(dst));
#endif
- if (sdone != 0) { /* has src ever been processed ? */
+ if (sdone != 0) { /* has src ever been processed ? */
#if ENCODE_XS_USEFP == 2
- more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
- - SvLEN(dst);
+ more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
+ - SvLEN(dst);
#elif ENCODE_XS_USEFP
- more = (1.0*SvLEN(dst)+1)/sdone * sleft;
+ more = (1.0*SvLEN(dst)+1)/sdone * sleft;
#else
- /* safe until SvLEN(dst) == MAX_INT/16 */
- more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
-#endif
- }
- more += UTF8_MAXLEN; /* insurance policy */
-#if ENCODE_XS_PROFILE >= 2
- Perl_warn(aTHX_
- "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
- more, sdone, sleft, SvLEN(dst));
+ /* safe until SvLEN(dst) == MAX_INT/16 */
+ more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
#endif
- d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
- /* dst need to grow need MORE bytes! */
- if (ddone >= SvLEN(dst)) {
- Perl_croak(aTHX_ "Destination couldn't be grown.");
- }
- dlen = SvLEN(dst)-ddone-1;
- d += ddone;
- s += slen;
- slen = tlen-sdone;
- continue;
}
-
- case ENCODE_NOREP:
- if (dir == enc->f_utf8) {
- STRLEN clen;
- UV ch =
- utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
- &clen, 0);
- if (!check) { /* fallback char */
+ more += UTF8_MAXLEN; /* insurance policy */
+ d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
+ /* dst need to grow need MORE bytes! */
+ if (ddone >= SvLEN(dst)) {
+ Perl_croak(aTHX_ "Destination couldn't be grown.");
+ }
+ dlen = SvLEN(dst)-ddone-1;
+ d += ddone;
+ s += slen;
+ slen = tlen-sdone;
+ continue;
+ }
+ case ENCODE_NOREP:
+ /* encoding */
+ if (dir == enc->f_utf8) {
+ STRLEN clen;
+ UV ch =
+ utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), &clen, 0);
+ if (check & ENCODE_DIE_ON_ERR) {
+ Perl_croak(
+ aTHX_ "\"\\N{U+%" UVxf "}\" does not map to %s, %d",
+ ch, enc->name[0], __LINE__);
+ }else{
+ if (check & ENCODE_RETURN_ON_ERR){
+ if (check & ENCODE_WARN_ON_ERR){
+ Perl_warner(
+ aTHX_ packWARN(WARN_UTF8),
+ "\"\\N{U+%" UVxf "}\" does not map to %s",
+ ch,enc->name[0]);
+ }
+ goto ENCODE_SET_SRC;
+ }else if (check & ENCODE_PERLQQ){
+ SV* perlqq =
+ sv_2mortal(newSVpvf("\\x{%04x}", ch));
sdone += slen + clen;
- ddone += dlen + enc->replen;
- sv_catpvn(dst, (char*)enc->rep, enc->replen);
- }
- else if (check == -1){ /* perlqq */
- SV* perlqq =
- sv_2mortal(newSVpvf("\\x{%x}", ch));
- sdone += slen + clen;
- ddone += dlen + SvLEN(perlqq);
- sv_catsv(dst, perlqq);
+ ddone += dlen + SvCUR(perlqq);
+ sv_catsv(dst, perlqq);
+ } else {
+ /* fallback char */
+ sdone += slen + clen;
+ ddone += dlen + enc->replen;
+ sv_catpvn(dst, (char*)enc->rep, enc->replen);
}
- else {
- Perl_croak(aTHX_
- "\"\\N{U+%" UVxf
- "}\" does not map to %s", ch,
- enc->name[0]);
- }
+ }
}
- else {
- if (!check){ /* fallback char */
- sdone += slen + 1;
- ddone += dlen + strlen(FBCHAR_UTF8);
- sv_catpv(dst, FBCHAR_UTF8);
- }
- else if (check == -1){ /* perlqq */
- SV* perlqq =
+ /* decoding */
+ else {
+ if (check & ENCODE_DIE_ON_ERR){
+ Perl_croak(
+ aTHX_ "%s \"\\x%02X\" does not map to Unicode (%d)",
+ enc->name[0], (U8) s[slen], code);
+ }else{
+ if (check & ENCODE_RETURN_ON_ERR){
+ if (check & ENCODE_WARN_ON_ERR){
+ Perl_warner(
+ aTHX_ packWARN(WARN_UTF8),
+ "%s \"\\x%02X\" does not map to Unicode (%d)",
+ enc->name[0], (U8) s[slen], code);
+ }
+ goto ENCODE_SET_SRC;
+ }else if (check & ENCODE_PERLQQ){
+ SV* perlqq =
sv_2mortal(newSVpvf("\\x%02X", s[slen]));
- sdone += slen + 1;
- ddone += dlen + SvLEN(perlqq);
- sv_catsv(dst, perlqq);
- }
- else {
- /* UTF-8 is supposed to be "Universal" so should not
- happen for real characters, but some encodings
- have non-assigned codes which may occur. */
- Perl_croak(aTHX_ "%s \"\\x%02X\" "
- "does not map to Unicode (%d)",
- enc->name[0], (U8) s[slen], code);
+ sdone += slen + 1;
+ ddone += dlen + SvCUR(perlqq);
+ sv_catsv(dst, perlqq);
+ } else {
+ sdone += slen + 1;
+ ddone += dlen + strlen(FBCHAR_UTF8);
+ sv_catpv(dst, FBCHAR_UTF8);
+ }
}
}
+ /* settle variables when fallback */
dlen = SvCUR(dst);
d = (U8*)SvPVX(dst) + dlen;
s = (U8*)SvPVX(src) + sdone;
slen = tlen - sdone;
break;
- default:
- Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
- code, (dir == enc->f_utf8) ? "to" : "from",
- enc->name[0]);
- return &PL_sv_undef;
- }
+ default:
+ Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
+ code, (dir == enc->f_utf8) ? "to" : "from",
+ enc->name[0]);
+ return &PL_sv_undef;
}
- SvCUR_set(dst, dlen+ddone);
- SvPOK_only(dst);
- if (check) {
- sdone = SvCUR(src) - (slen+sdone);
- if (sdone) {
-#if 1
- /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly
- SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0
- type SVs and sv_clear() calls it ...
- */
- sv_setpvn(src, (char*)s+slen, sdone);
-#else
- Move(s + slen, SvPVX(src), sdone , U8);
-#endif
- }
- SvCUR_set(src, sdone);
+ }
+ ENCODE_SET_SRC:
+ if (check & ~ENCODE_LEAVE_SRC){
+ sdone = SvCUR(src) - (slen+sdone);
+ if (sdone) {
+ sv_setpvn(src, (char*)s+slen, sdone);
}
+ SvCUR_set(src, sdone);
}
- else {
- SvCUR_set(dst, 0);
- SvPOK_only(dst);
+ /* warn("check = 0x%X, code = 0x%d\n", check, code); */
+ if (code && !(check & ENCODE_RETURN_ON_ERR)) {
+ return &PL_sv_undef;
}
+
+ SvCUR_set(dst, dlen+ddone);
+ SvPOK_only(dst);
+
#if ENCODE_XS_PROFILE
if (SvCUR(dst) > SvCUR(src)){
- Perl_warn(aTHX_
- "SvLEN(dst)=%d, SvCUR(dst)=%d. "
- "%d bytes unused(%f %%)\n",
- SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
- (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
-
+ Perl_warn(aTHX_
+ "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n",
+ SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
+ (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
}
#endif
+
+ ENCODE_END:
*SvEND(dst) = '\0';
return dst;
}
Method_name(obj)
SV * obj
CODE:
- {
- encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
- ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
- XSRETURN(1);
- }
+{
+ encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
+ ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
+ XSRETURN(1);
+}
void
Method_decode(obj,src,check = 0)
SV * src
int check
CODE:
- {
- encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
- ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
- SvUTF8_on(ST(0));
- XSRETURN(1);
- }
-
-void
-Method_encode(obj,src,check = 0)
-SV * obj
-SV * src
-int check
-CODE:
- {
- encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
- sv_utf8_upgrade(src);
- ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
- XSRETURN(1);
- }
-
-MODULE = Encode PACKAGE = Encode::Unicode
-
-void
-decode_xs(obj, str, chk = &PL_sv_undef)
-SV * obj
-SV * str
-SV * chk
-CODE:
{
- int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
- U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
- int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
- SV *result = newSVpvn("",0);
- STRLEN ulen;
- U8 *s = (U8 *)SvPVbyte(str,ulen);
- U8 *e = (U8 *)SvEND(str);
- ST(0) = sv_2mortal(result);
- SvUTF8_on(result);
-
- if (!endian && s+size <= e) {
- UV bom;
- endian = (size == 4) ? 'N' : 'n';
- bom = enc_unpack(aTHX_ &s,e,size,endian);
- if (bom != BOM_BE) {
- if (bom == BOM16LE) {
- endian = 'v';
- }
- else if (bom == BOM32LE) {
- endian = 'V';
- }
- else {
- croak("%s:Unregognised BOM %"UVxf,
- SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom);
- }
- }
-#if 0
- /* Update endian for this sequence */
- hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
-#endif
- }
- while (s < e && s+size <= e) {
- UV ord = enc_unpack(aTHX_ &s,e,size,endian);
- U8 *d;
- if (size != 4 && invalid_ucs2(ord)) {
- if (ucs2) {
- if (SvTRUE(chk)) {
- croak("%s:no surrogates allowed %"UVxf,
- SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
- }
- if (s+size <= e) {
- enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */
- }
- ord = FBCHAR;
- }
- else {
- UV lo;
- if (!isHiSurrogate(ord)) {
- croak("%s:Malformed HI surrogate %"UVxf,
- SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
- }
- if (s+size > e) {
- /* Partial character */
- s -= size; /* back up to 1st half */
- break; /* And exit loop */
- }
- lo = enc_unpack(aTHX_ &s,e,size,endian);
- if (!isLoSurrogate(lo)){
- croak("%s:Malformed LO surrogate %"UVxf,
- SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
- }
- ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
- }
- }
- d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
- d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
- SvCUR_set(result,d - (U8 *)SvPVX(result));
- }
- if (SvTRUE(chk)) {
- if (s < e) {
- Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
- SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
- Move(s,SvPVX(str),e-s,U8);
- SvCUR_set(str,(e-s));
- }
- else {
- SvCUR_set(str,0);
- }
- *SvEND(str) = '\0';
- }
+ encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
+ ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
+ SvUTF8_on(ST(0));
XSRETURN(1);
}
void
-encode_xs(obj, utf8, chk = &PL_sv_undef)
+Method_encode(obj,src,check = 0)
SV * obj
-SV * utf8
-SV * chk
+SV * src
+int check
CODE:
{
- int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
- U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
- int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
- SV *result = newSVpvn("",0);
- STRLEN ulen;
- U8 *s = (U8 *)SvPVutf8(utf8,ulen);
- U8 *e = (U8 *)SvEND(utf8);
- ST(0) = sv_2mortal(result);
- if (!endian) {
- endian = (size == 4) ? 'N' : 'n';
- enc_pack(aTHX_ result,size,endian,BOM_BE);
-#if 0
- /* Update endian for this sequence */
- hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
-#endif
- }
- while (s < e && s+UTF8SKIP(s) <= e) {
- STRLEN len;
- UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
- s += len;
- if (size != 4 && invalid_ucs2(ord)) {
- if (!issurrogate(ord)){
- if (ucs2) {
- if (SvTRUE(chk)) {
- croak("%s:code point \"\\x{"UVxf"}\" too high",
- SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
- }
- enc_pack(aTHX_ result,size,endian,FBCHAR);
- }else{
- UV hi = ((ord - 0x10000) >> 10) + 0xD800;
- UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
- enc_pack(aTHX_ result,size,endian,hi);
- enc_pack(aTHX_ result,size,endian,lo);
- }
- }
- else {
- /* not supposed to happen */
- enc_pack(aTHX_ result,size,endian,FBCHAR);
- }
- }
- else {
- enc_pack(aTHX_ result,size,endian,ord);
- }
- }
- if (SvTRUE(chk)) {
- if (s < e) {
- Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
- SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
- Move(s,SvPVX(utf8),e-s,U8);
- SvCUR_set(utf8,(e-s));
- }
- else {
- SvCUR_set(utf8,0);
- }
- *SvEND(utf8) = '\0';
- }
+ encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
+ sv_utf8_upgrade(src);
+ ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
XSRETURN(1);
}
I32
_bytes_to_utf8(sv, ...)
- SV * sv
- CODE:
- {
- SV * encoding = items == 2 ? ST(1) : Nullsv;
-
- if (encoding)
- RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
- else {
- STRLEN len;
- U8* s = (U8*)SvPV(sv, len);
- U8* converted;
-
- converted = bytes_to_utf8(s, &len); /* This allocs */
- sv_setpvn(sv, (char *)converted, len);
- SvUTF8_on(sv); /* XXX Should we? */
- Safefree(converted); /* ... so free it */
- RETVAL = len;
- }
- }
- OUTPUT:
- RETVAL
+SV * sv
+CODE:
+{
+ SV * encoding = items == 2 ? ST(1) : Nullsv;
+
+ if (encoding)
+ RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
+ else {
+ STRLEN len;
+ U8* s = (U8*)SvPV(sv, len);
+ U8* converted;
+
+ converted = bytes_to_utf8(s, &len); /* This allocs */
+ sv_setpvn(sv, (char *)converted, len);
+ SvUTF8_on(sv); /* XXX Should we? */
+ Safefree(converted); /* ... so free it */
+ RETVAL = len;
+ }
+}
+OUTPUT:
+ RETVAL
I32
_utf8_to_bytes(sv, ...)
- SV * sv
- CODE:
- {
- SV * to = items > 1 ? ST(1) : Nullsv;
- SV * check = items > 2 ? ST(2) : Nullsv;
-
- if (to)
- RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
- else {
- STRLEN len;
- U8 *s = (U8*)SvPV(sv, len);
-
- RETVAL = 0;
- if (SvTRUE(check)) {
- /* Must do things the slow way */
- U8 *dest;
- U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
- U8 *send = s + len;
-
- New(83, dest, len, U8); /* I think */
-
- while (s < send) {
- if (*s < 0x80)
- *dest++ = *s++;
- else {
- STRLEN ulen;
- UV uv = *s++;
-
- /* Have to do it all ourselves because of error routine,
- aargh. */
- if (!(uv & 0x40))
- goto failure;
- if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
- else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
- else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
- else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
- else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
- else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
- else { ulen = 13; uv = 0; }
-
- /* Note change to utf8.c variable naming, for variety */
- while (ulen--) {
- if ((*s & 0xc0) != 0x80)
- goto failure;
+SV * sv
+CODE:
+{
+ SV * to = items > 1 ? ST(1) : Nullsv;
+ SV * check = items > 2 ? ST(2) : Nullsv;
+
+ if (to) {
+ RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
+ } else {
+ STRLEN len;
+ U8 *s = (U8*)SvPV(sv, len);
+
+ RETVAL = 0;
+ if (SvTRUE(check)) {
+ /* Must do things the slow way */
+ U8 *dest;
+ /* We need a copy to pass to check() */
+ U8 *src = (U8*)savepv((char *)s);
+ U8 *send = s + len;
+
+ New(83, dest, len, U8); /* I think */
+
+ while (s < send) {
+ if (*s < 0x80){
+ *dest++ = *s++;
+ } else {
+ STRLEN ulen;
+ UV uv = *s++;
+
+ /* Have to do it all ourselves because of error routine,
+ aargh. */
+ if (!(uv & 0x40)){ goto failure; }
+ if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
+ else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
+ else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
+ else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
+ else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
+ else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
+ else { ulen = 13; uv = 0; }
- else
- uv = (uv << 6) | (*s++ & 0x3f);
+ /* Note change to utf8.c variable naming, for variety */
+ while (ulen--) {
+ if ((*s & 0xc0) != 0x80){
+ goto failure;
+ } else {
+ uv = (uv << 6) | (*s++ & 0x3f);
+ }
}
if (uv > 256) {
failure:
- call_failure(check, s, dest, src);
- /* Now what happens? */
+ call_failure(check, s, dest, src);
+ /* Now what happens? */
}
*dest++ = (U8)uv;
- }
- }
- } else
- RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
- }
+ }
+ }
+ } else {
+ RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
}
- OUTPUT:
- RETVAL
+ }
+}
+OUTPUT:
+ RETVAL
bool
is_utf8(sv, check = 0)
SV * sv
int check
- CODE:
- {
- if (SvGMAGICAL(sv)) /* it could be $1, for example */
- sv = newSVsv(sv); /* GMAGIG will be done */
- if (SvPOK(sv)) {
- RETVAL = SvUTF8(sv) ? TRUE : FALSE;
- if (RETVAL &&
- check &&
- !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
- RETVAL = FALSE;
- } else {
+CODE:
+{
+ if (SvGMAGICAL(sv)) /* it could be $1, for example */
+ sv = newSVsv(sv); /* GMAGIG will be done */
+ if (SvPOK(sv)) {
+ RETVAL = SvUTF8(sv) ? TRUE : FALSE;
+ if (RETVAL &&
+ check &&
+ !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
RETVAL = FALSE;
- }
- if (sv != ST(0))
- SvREFCNT_dec(sv); /* it was a temp copy */
- }
- OUTPUT:
- RETVAL
+ } else {
+ RETVAL = FALSE;
+ }
+ if (sv != ST(0))
+ SvREFCNT_dec(sv); /* it was a temp copy */
+}
+OUTPUT:
+ RETVAL
SV *
_utf8_on(sv)
- SV * sv
- CODE:
- {
- if (SvPOK(sv)) {
- SV *rsv = newSViv(SvUTF8(sv));
- RETVAL = rsv;
- SvUTF8_on(sv);
- } else {
- RETVAL = &PL_sv_undef;
- }
- }
- OUTPUT:
- RETVAL
+SV * sv
+CODE:
+{
+ if (SvPOK(sv)) {
+ SV *rsv = newSViv(SvUTF8(sv));
+ RETVAL = rsv;
+ SvUTF8_on(sv);
+ } else {
+ RETVAL = &PL_sv_undef;
+ }
+}
+OUTPUT:
+ RETVAL
SV *
_utf8_off(sv)
- SV * sv
- CODE:
- {
- if (SvPOK(sv)) {
- SV *rsv = newSViv(SvUTF8(sv));
- RETVAL = rsv;
- SvUTF8_off(sv);
- } else {
- RETVAL = &PL_sv_undef;
- }
- }
- OUTPUT:
- RETVAL
+SV * sv
+CODE:
+{
+ if (SvPOK(sv)) {
+ SV *rsv = newSViv(SvUTF8(sv));
+ RETVAL = rsv;
+ SvUTF8_off(sv);
+ } else {
+ RETVAL = &PL_sv_undef;
+ }
+}
+OUTPUT:
+ RETVAL
+
+PROTOTYPES: DISABLE
+
+
+int
+DIE_ON_ERR()
+CODE:
+ RETVAL = ENCODE_DIE_ON_ERR;
+OUTPUT:
+ RETVAL
+
+int
+WARN_ON_ERR()
+CODE:
+ RETVAL = ENCODE_WARN_ON_ERR;
+OUTPUT:
+ RETVAL
+
+int
+LEAVE_SRC()
+CODE:
+ RETVAL = ENCODE_LEAVE_SRC;
+OUTPUT:
+ RETVAL
+
+int
+RETURN_ON_ERR()
+CODE:
+ RETVAL = ENCODE_RETURN_ON_ERR;
+OUTPUT:
+ RETVAL
+
+int
+PERLQQ()
+CODE:
+ RETVAL = ENCODE_PERLQQ;
+OUTPUT:
+ RETVAL
+
+int
+FB_DEFAULT()
+CODE:
+ RETVAL = ENCODE_FB_DEFAULT;
+OUTPUT:
+ RETVAL
+
+int
+FB_CROAK()
+CODE:
+ RETVAL = ENCODE_FB_CROAK;
+OUTPUT:
+ RETVAL
+
+int
+FB_QUIET()
+CODE:
+ RETVAL = ENCODE_FB_QUIET;
+OUTPUT:
+ RETVAL
+
+int
+FB_WARN()
+CODE:
+ RETVAL = ENCODE_FB_WARN;
+OUTPUT:
+ RETVAL
+
+int
+FB_PERLQQ()
+CODE:
+ RETVAL = ENCODE_FB_PERLQQ;
+OUTPUT:
+ RETVAL
BOOT:
{
-#if defined(USE_PERLIO) && !defined(USE_SFIO)
-/* PerlIO_define_layer(aTHX_ &PerlIO_encode); */
-#endif
+#include "def_t.h"
#include "def_t.exh"
}
);
#### DO NOT EDIT BEYOND THIS POINT!
-my $enc2xs = '$_Enc2xs_';
+require File::Spec;
+my ($enc2xs, $encode_h) = ();
+PATHLOOP:
+for my $d (split /:/, $ENV{PATH}){
+ for my $f (qw/enc2xs enc2xs5.7.3/){
+ my $path = File::Spec->catfile($d, $f);
+ -x $path and $enc2xs = $path and last PATHLOOP;
+ }
+}
+$enc2xs or die "enc2xs not found!";
+print "enc2xs is $enc2xs\n";
+my %encode_h = ();
+for my $d (@INC){
+ my $dir = File::Spec->catfile($d, "Encode");
+ my $file = File::Spec->catfile($dir, "encode.h");
+ -f $file and $encode_h{$dir} = -M $file;
+}
+%encode_h or die "encode.h not found!";
+# find the latest one
+($encode_h) = sort {$encode_h{$b} <=> $encode_h{$a}} keys %encode_h;
+print "encode.h is at $encode_h\n";
+
WriteMakefile(
- INC => "-I$_E2X_",
+ INC => "-I$encode_h",
#### END_OF_HEADER -- DO NOT EDIT THIS LINE BY HAND! ####
NAME => 'Encode::'.$name,
VERSION_FROM => "$name.pm",
DIST_DEFAULT => 'all tardist',
},
MAN3PODS => {},
+ PREREQ_PM => {
+ 'Encode' => "1.41",
+ },
# OS 390 winges about line numbers > 64K ???
XSOPT => '-nolinenumbers',
);
#define ENCODE_H
#ifndef U8
-/* A tad devious this:
- perl normally has a #define for U8 - if that isn't present
- then we typedef it - leaving it #ifndef so we can do data parts without
+/*
+ A tad devious this:
+ perl normally has a #define for U8 - if that isn't present then we
+ typedef it - leaving it #ifndef so we can do data parts without
getting extern references to the code parts
- */
+*/
typedef unsigned char U8;
#endif
typedef struct encpage_s encpage_t;
-
struct encpage_s
{
- /* fields ordered to pack nicely on 32-bit machines */
- const U8 *seq; /* Packed output sequences we generate if we match */
- encpage_t *next; /* Page to go to if we match */
- U8 min; /* Min value of octet to match this entry */
- U8 max; /* Max value of octet to match this entry */
- U8 dlen; /* destination length - size of entries in seq */
- U8 slen; /* source length - number of source octets needed */
+ /* fields ordered to pack nicely on 32-bit machines */
+ const U8 *seq; /* Packed output sequences we generate
+ if we match */
+ encpage_t *next; /* Page to go to if we match */
+ U8 min; /* Min value of octet to match this entry */
+ U8 max; /* Max value of octet to match this entry */
+ U8 dlen; /* destination length -
+ size of entries in seq */
+ U8 slen; /* source length -
+ number of source octets needed */
};
/*
- At any point in a translation there is a page pointer which points at an array
- of the above structures.
-
- Basic operation :
- get octet from source stream.
- if (octet >= min && octet < max) {
- if slen is 0 then we cannot represent this character.
- if we have less than slen octets (including this one) then we have a partial character.
- otherwise
- copy dlen octets from seq + dlen*(octet-min) to output
- (dlen may be zero if we don't know yet.)
- load page pointer with next to continue.
- (is slen is one this is end of a character)
- get next octet.
- }
- else {
- increment the page pointer to look at next slot in the array
- }
-
- arrays SHALL be constructed so there is an entry which matches ..0xFF at the end,
- and either maps it or indicates no representation.
-
- if MSB of slen is set then mapping is an approximate "FALLBACK" entry.
+ At any point in a translation there is a page pointer which points
+ at an array of the above structures.
+
+ Basic operation :
+ get octet from source stream.
+ if (octet >= min && octet < max) {
+ if slen is 0 then we cannot represent this character.
+ if we have less than slen octets (including this one) then
+ we have a partial character.
+ otherwise
+ copy dlen octets from seq + dlen*(octet-min) to output
+ (dlen may be zero if we don't know yet.)
+ load page pointer with next to continue.
+ (is slen is one this is end of a character)
+ get next octet.
+ }
+ else {
+ increment the page pointer to look at next slot in the array
+ }
+
+ arrays SHALL be constructed so there is an entry which matches
+ ..0xFF at the end, and either maps it or indicates no
+ representation.
+
+ if MSB of slen is set then mapping is an approximate "FALLBACK" entry.
*/
typedef struct encode_s encode_t;
struct encode_s
{
- encpage_t *t_utf8; /* Starting table for translation from the encoding to UTF-8 form */
- encpage_t *f_utf8; /* Starting table for translation from UTF-8 to the encoding */
- const U8 *rep; /* Replacement character in this encoding e.g. "?" */
- int replen; /* Number of octets to represent replacement character */
- U8 min_el; /* Minimum octets to represent a character */
- U8 max_el; /* Maximum octets to represent a character */
- const char *name[2]; /* name(s) of this encoding */
+ encpage_t *t_utf8; /* Starting table for translation from
+ the encoding to UTF-8 form */
+ encpage_t *f_utf8; /* Starting table for translation
+ from UTF-8 to the encoding */
+ const U8 *rep; /* Replacement character in this encoding
+ e.g. "?" */
+ int replen; /* Number of octets in rep */
+ U8 min_el; /* Minimum octets to represent a character */
+ U8 max_el; /* Maximum octets to represent a character */
+ const char *name[2]; /* name(s) of this encoding */
};
#ifdef U8
extern void Encode_DefineEncoding(encode_t *enc);
-#endif
+#endif /* U8 */
#define ENCODE_NOSPACE 1
#define ENCODE_PARTIAL 2
#define ENCODE_NOREP 3
#define ENCODE_FALLBACK 4
-#endif
+
+#define FBCHAR_UTF8 "\xEF\xBF\xBD"
+
+#define ENCODE_DIE_ON_ERR 0x0001 /* croaks immediately */
+#define ENCODE_WARN_ON_ERR 0x0002 /* warn on error; may proceed */
+#define ENCODE_RETURN_ON_ERR 0x0004 /* immediately returns on NOREP */
+#define ENCODE_LEAVE_SRC 0x0008 /* $src updated unless set */
+#define ENCODE_PERLQQ 0x0100 /* perlqq fallback string */
+
+#define ENCODE_FB_DEFAULT 0x0000
+#define ENCODE_FB_CROAK 0x0001
+#define ENCODE_FB_QUIET ENCODE_RETURN_ON_ERR
+#define ENCODE_FB_WARN (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR)
+#define ENCODE_FB_PERLQQ ENCODE_PERLQQ
+
+#endif /* ENCODE_H */
}
}
use Encode;
-our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use XSLoader;
XSLoader::load(__PACKAGE__,$VERSION);
=head1 SEE ALSO
-L<Encode>,L<Encode::CJKguide>
+L<Encode>
=cut
Symbol/Symbol.pm Encode extension
TW/Makefile.PL Encode extension
TW/TW.pm Encode extension
+Unicode/Makefile.PL Encode extension
+Unicode/Unicode.pm Encode extension
+Unicode/Unicode.xs Encode extension
bin/enc2xs Encode module generator
bin/piconv iconv by perl
bin/ucm2table Table Generator for testing
lib/Encode/JP/H2Z.pm Encode extension
lib/Encode/JP/JIS7.pm Encode extension
lib/Encode/KR/2022_KR.pm Encode extension
-lib/Encode/Supported.pod Documents supported encodings
-lib/Encode/Unicode.pm Encode extension
+lib/Encode/PerlIO.pod Documents for Encode & PerlIO
+lib/Encode/Supported.pod Documents for supported encodings
t/Aliases.t Encode extension test
t/CN.t Encode extension test
t/Encode.t Encode extension test
t/Unicode.t Encode extension test
t/bogus.ucm Sample data for ucmlint
t/encoding.t encoding extension test
+t/fallback.t Encode extension test
t/gb2312.euc test data
t/gb2312.ref test data
t/grow.t Encode extension test
+t/jisx0201.euc test data
+t/jisx0201.ref test data
t/jisx0208.euc test data
t/jisx0208.ref test data
t/jisx0212.euc test data
t/jperl.t encoding extension test
t/ksc5601.euc test data
t/ksc5601.ref test data
+t/perlio.t
t/unibench.pl Unicode benchmark
ucm/8859-1.ucm Unicode Character Map
ucm/8859-10.ucm Unicode Character Map
die "Encode::TW not supported on EBCDIC\n";
}
}
-our $VERSION = do { my @r = (q$Revision: 1.21 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use XSLoader;
=head1 NOTES
-Due to size concerns, C<EUC-TW> (Extended Unix Character) and C<BIG5PLUS>
+Due to size concerns, C<EUC-TW> (Extended Unix Character), C<CCCII>
+(Chinese Character Code for Information Interchange) and C<BIG5PLUS>
(CMEX's Big5+) are distributed separately on CPAN, under the name
L<Encode::HanExtra>. That module also contains extra China-based encodings.
=head1 BUGS
-The C<CNS11643> encoding files are not complete (only the first two planes,
-C<11643-1> and C<11643-2>, exist in the distribution). For common CNS11643
+The C<CNS11643> encoding files are not complete. For common C<CNS11643>
manipulation, please use C<EUC-TW> in L<Encode::HanExtra>, which contains
plane 1-7.
=head1 SEE ALSO
-L<Encode>,L<Encode::CJKguide>
+L<Encode>
=cut
--- /dev/null
+use 5.7.2;
+use strict;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+ INC => "-I../Encode",
+ NAME => 'Encode::Unicode',
+ VERSION_FROM => "Unicode.pm",
+ MAN3PODS => {},
+ );
+
use strict;
use warnings;
-our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.32 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
-#
-# Aux. subs & constants
-#
-
-sub FBCHAR(){ 0xFFFd }
-sub BOM_BE(){ 0xFeFF }
-sub BOM16LE(){ 0xFFFe }
-sub BOM32LE(){ 0xFFFe0000 }
-
-sub valid_ucs2($){
- return
- (0 <= $_[0] && $_[0] < 0xD800)
- || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
-}
-
-sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
-sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
-sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
-
-sub ensurrogate($){
- use integer; # we have divisions
- my $uni = shift;
- my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
- my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
- return ($hi, $lo);
-}
-
-sub desurrogate($$){
- my ($hi, $lo) = @_;
- return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
-}
-
-sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
+use XSLoader;
+XSLoader::load(__PACKAGE__,$VERSION);
#
# Object Generator 8 transcoders all at once!
set_transcoder("xs");
#
+# Aux. subs & constants
+#
+
+sub FBCHAR(){ 0xFFFd }
+sub BOM_BE(){ 0xFeFF }
+sub BOM16LE(){ 0xFFFe }
+sub BOM32LE(){ 0xFFFe0000 }
+
+sub valid_ucs2($){
+ return
+ (0 <= $_[0] && $_[0] < 0xD800)
+ || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
+}
+
+sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
+sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
+sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
+
+sub ensurrogate($){
+ use integer; # we have divisions
+ my $uni = shift;
+ my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
+ my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
+ return ($hi, $lo);
+}
+
+sub desurrogate($$){
+ my ($hi, $lo) = @_;
+ return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
+}
+
+sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
+
+#
# *_modern are much faster but guzzle more memory
#
--- /dev/null
+/*
+ $Id: Unicode.xs,v 1.2 2002/04/19 05:36:43 dankogai Exp $
+ */
+
+#define PERL_NO_GET_CONTEXT
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+#define FBCHAR 0xFFFd
+#define BOM_BE 0xFeFF
+#define BOM16LE 0xFFFe
+#define BOM32LE 0xFFFe0000
+#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
+#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
+#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
+#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
+
+static UV
+enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
+{
+ U8 *s = *sp;
+ UV v = 0;
+ if (s+size > e) {
+ croak("Partial character %c",(char) endian);
+ }
+ switch(endian) {
+ case 'N':
+ v = *s++;
+ v = (v << 8) | *s++;
+ case 'n':
+ v = (v << 8) | *s++;
+ v = (v << 8) | *s++;
+ break;
+ case 'V':
+ case 'v':
+ v |= *s++;
+ v |= (*s++ << 8);
+ if (endian == 'v')
+ break;
+ v |= (*s++ << 16);
+ v |= (*s++ << 24);
+ break;
+ default:
+ croak("Unknown endian %c",(char) endian);
+ break;
+ }
+ *sp = s;
+ return v;
+}
+
+void
+enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
+{
+ U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size);
+ switch(endian) {
+ case 'v':
+ case 'V':
+ d += SvCUR(result);
+ SvCUR_set(result,SvCUR(result)+size);
+ while (size--) {
+ *d++ = value & 0xFF;
+ value >>= 8;
+ }
+ break;
+ case 'n':
+ case 'N':
+ SvCUR_set(result,SvCUR(result)+size);
+ d += SvCUR(result);
+ while (size--) {
+ *--d = value & 0xFF;
+ value >>= 8;
+ }
+ break;
+ default:
+ croak("Unknown endian %c",(char) endian);
+ break;
+ }
+}
+
+MODULE = Encode::Unicode PACKAGE = Encode::Unicode
+
+void
+decode_xs(obj, str, chk = &PL_sv_undef)
+SV * obj
+SV * str
+SV * chk
+CODE:
+{
+ int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
+ U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
+ int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
+ SV *result = newSVpvn("",0);
+ STRLEN ulen;
+ U8 *s = (U8 *)SvPVbyte(str,ulen);
+ U8 *e = (U8 *)SvEND(str);
+ ST(0) = sv_2mortal(result);
+ SvUTF8_on(result);
+
+ if (!endian && s+size <= e) {
+ UV bom;
+ endian = (size == 4) ? 'N' : 'n';
+ bom = enc_unpack(aTHX_ &s,e,size,endian);
+ if (bom != BOM_BE) {
+ if (bom == BOM16LE) {
+ endian = 'v';
+ }
+ else if (bom == BOM32LE) {
+ endian = 'V';
+ }
+ else {
+ croak("%s:Unregognised BOM %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
+ bom);
+ }
+ }
+#if 0
+ /* Update endian for this sequence */
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#endif
+ }
+ while (s < e && s+size <= e) {
+ UV ord = enc_unpack(aTHX_ &s,e,size,endian);
+ U8 *d;
+ if (size != 4 && invalid_ucs2(ord)) {
+ if (ucs2) {
+ if (SvTRUE(chk)) {
+ croak("%s:no surrogates allowed %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
+ ord);
+ }
+ if (s+size <= e) {
+ /* skip the next one as well */
+ enc_unpack(aTHX_ &s,e,size,endian);
+ }
+ ord = FBCHAR;
+ }
+ else {
+ UV lo;
+ if (!isHiSurrogate(ord)) {
+ croak("%s:Malformed HI surrogate %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
+ ord);
+ }
+ if (s+size > e) {
+ /* Partial character */
+ s -= size; /* back up to 1st half */
+ break; /* And exit loop */
+ }
+ lo = enc_unpack(aTHX_ &s,e,size,endian);
+ if (!isLoSurrogate(lo)){
+ croak("%s:Malformed LO surrogate %"UVxf,
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),
+ ord);
+ }
+ ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
+ }
+ }
+ d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
+ d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
+ SvCUR_set(result,d - (U8 *)SvPVX(result));
+ }
+ if (SvTRUE(chk)) {
+ if (s < e) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
+ Move(s,SvPVX(str),e-s,U8);
+ SvCUR_set(str,(e-s));
+ }
+ else {
+ SvCUR_set(str,0);
+ }
+ *SvEND(str) = '\0';
+ }
+ XSRETURN(1);
+}
+
+void
+encode_xs(obj, utf8, chk = &PL_sv_undef)
+ SV * obj
+SV * utf8
+SV * chk
+CODE:
+{
+ int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
+ U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
+ int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
+ SV *result = newSVpvn("",0);
+ STRLEN ulen;
+ U8 *s = (U8 *)SvPVutf8(utf8,ulen);
+ U8 *e = (U8 *)SvEND(utf8);
+ ST(0) = sv_2mortal(result);
+ if (!endian) {
+ endian = (size == 4) ? 'N' : 'n';
+ enc_pack(aTHX_ result,size,endian,BOM_BE);
+#if 0
+ /* Update endian for this sequence */
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#endif
+ }
+ while (s < e && s+UTF8SKIP(s) <= e) {
+ STRLEN len;
+ UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
+ s += len;
+ if (size != 4 && invalid_ucs2(ord)) {
+ if (!issurrogate(ord)){
+ if (ucs2) {
+ if (SvTRUE(chk)) {
+ croak("%s:code point \"\\x{"UVxf"}\" too high",
+ SvPV_nolen(
+ *hv_fetch((HV *)SvRV(obj),"Name",4,0))
+ ,ord);
+ }
+ enc_pack(aTHX_ result,size,endian,FBCHAR);
+ }else{
+ UV hi = ((ord - 0x10000) >> 10) + 0xD800;
+ UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
+ enc_pack(aTHX_ result,size,endian,hi);
+ enc_pack(aTHX_ result,size,endian,lo);
+ }
+ }
+ else {
+ /* not supposed to happen */
+ enc_pack(aTHX_ result,size,endian,FBCHAR);
+ }
+ }
+ else {
+ enc_pack(aTHX_ result,size,endian,ord);
+ }
+ }
+ if (SvTRUE(chk)) {
+ if (s < e) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
+ SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
+ Move(s,SvPVX(utf8),e-s,U8);
+ SvCUR_set(utf8,(e-s));
+ }
+ else {
+ SvCUR_set(utf8,0);
+ }
+ *SvEND(utf8) = '\0';
+ }
+ XSRETURN(1);
+}
+
use strict;
use Getopt::Std;
my @orig_ARGV = @ARGV;
-our $VERSION = do { my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.23 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
#my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
my $replen = 0;
$replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
- my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8*)"$rep"),$replen,$min_el,$max_el);
+ my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
my $sym = "${enc}_encoding";
$sym =~ s/\W+/_/g;
print C "encode_t $sym = \n";
#!./perl
-# $Id: piconv,v 1.22 2002/04/16 23:35:00 dankogai Exp $
+# $Id: piconv,v 1.23 2002/04/19 05:36:43 dankogai Exp $
#
use 5.7.3;
use strict;
$Opt{s} and Encode::from_to($Opt{s}, $from, $to) and print $Opt{s} and exit;
my $scheme = exists $Scheme{$Opt{S}} ? $Opt{S} : 'from_to';
$Opt{C} ||= $Opt{c};
-$Opt{p} and $Opt{C} = -1;
+$Opt{p} and $Opt{C} = Encode::FB_PERLQQ;
if ($Opt{D}){
my $cfrom = Encode->getEncoding($from)->name;
package encoding;
-our $VERSION = do { my @r = (q$Revision: 1.28 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode;
use strict;
}
}
-our $HAS_PERLIO_ENCODING;
-
-eval { require PerlIO::encoding; };
-if ($@){
- $HAS_PERLIO_ENCODING = 0;
-}else{
- $HAS_PERLIO_ENCODING = 1;
- binmode(STDIN);
-}
+our $HAS_PERLIO = exists $INC{"PerlIO/encoding.pm"};
+$HAS_PERLIO or binmode(STDIN);
sub import {
my $class = shift;
}
unless ($arg{Filter}){
${^ENCODING} = $enc; # this is all you need, actually.
- $HAS_PERLIO_ENCODING or return 1;
+ $HAS_PERLIO or return 1;
for my $h (qw(STDIN STDOUT)){
if ($arg{$h}){
unless (defined find_encoding($arg{$h})) {
1;
__END__
+
=pod
=head1 NAME
package Encode::JP::JIS7;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
require Encode;
for my $name ('7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1'){
use Encode::CJKConstants qw(:all);
+our $DEBUG = 0;
+
#
# decode is identical for all 2022 variants
#
sub decode
{
my ($obj,$str,$chk) = @_;
- jis_euc(\$str);
- return Encode::decode('euc-jp', $str, $chk);
+ my $residue = jis_euc(\$str);
+ # This is for PerlIO
+ $_[1] = $residue if $chk;
+ # use perlqq fallback for euc-jp -> utf8
+ return Encode::decode('euc-jp', $str, 0);
}
#
sub encode
{
require Encode::JP::H2Z;
- my ($obj,$str,$chk) = @_;
+ my ($obj, $utf8, $chk) = @_;
+ # empty the input string in the stack so perlio is ok
+ $_[1] = '' if $chk;
my ($h2z, $jis0212) = @$obj{qw(h2z jis0212)};
- my $result = Encode::encode('euc-jp', $str, $chk);
- $h2z and &Encode::JP::H2Z::h2z(\$result);
- euc_jis(\$result, $jis0212);
- return $result;
+ my $octet = Encode::encode('euc-jp', $utf8, 0) ;
+ $h2z and &Encode::JP::H2Z::h2z(\$octet);
+ euc_jis(\$octet, $jis0212);
+ return $octet;
}
([^\e]*)
)
{
- my ($esc, $str) = ($1, $2);
+ my ($esc, $chunk) = ($1, $2);
if ($esc !~ /$RE{ISO_ASC}/o) {
- $str =~ tr/\x21-\x7e/\xa1-\xfe/;
+ $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
if ($esc =~ /$RE{JIS_KANA}/o) {
- $str =~ s/([\xa1-\xdf])/\x8e$1/og;
+ $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
}
elsif ($esc =~ /$RE{JIS_0212}/o) {
- $str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
+ $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
}
}
- $str;
+ $chunk;
}geox;
- $$r_str;
+ my ($residue) = ($$r_str =~ s/(\e.*)$//so);
+ return $residue;
}
sub euc_jis{
$$r_str =~ s{
((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+)
}{
- my $str = $1;
+ my $chunk = $1;
my $esc =
- ( $str =~ tr/\x8E//d ) ? $ESC{KANA} :
- ( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} :
+ ( $chunk =~ tr/\x8E//d ) ? $ESC{KANA} :
+ ( $chunk =~ tr/\x8F//d ) ? $ESC{JIS_0212} :
$ESC{JIS_0208};
if ($esc eq $ESC{JIS_0212} && !$jis0212){
# fallback to '?'
- $str =~ tr/\xA1-\xFE/\x3F/;
+ $chunk =~ tr/\xA1-\xFE/\x3F/;
}else{
- $str =~ tr/\xA1-\xFE/\x21-\x7E/;
+ $chunk =~ tr/\xA1-\xFE/\x21-\x7E/;
}
- $esc . $str . $ESC{ASC};
+ $esc . $chunk . $ESC{ASC};
}geox;
$$r_str =~
s/\Q$ESC{ASC}\E
--- /dev/null
+=head1 NAME
+
+Encode::PerlIO -- a detailed document on Encode and PerlIO
+
+=head1 Overview
+
+It is very common to want to do encoding transformations when
+reading or writing files, network connections, pipes etc.
+If Perl is configured to use the new 'perlio' IO system then
+C<Encode> provides a "layer" (See L<PerlIO>) which can transform
+data as it is read or written.
+
+Here is how the blind poet would modernise the encoding:
+
+ use Encode;
+ open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
+ open(my $utf8,'>:utf8','iliad.utf8');
+ my @epic = <$iliad>;
+ print $utf8 @epic;
+ close($utf8);
+ close($illiad);
+
+In addition the new IO system can also be configured to read/write
+UTF-8 encoded characters (as noted above this is efficient):
+
+ open(my $fh,'>:utf8','anything');
+ print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
+
+Either of the above forms of "layer" specifications can be made the default
+for a lexical scope with the C<use open ...> pragma. See L<open>.
+
+Once a handle is open is layers can be altered using C<binmode>.
+
+Without any such configuration, or if Perl itself is built using
+system's own IO, then write operations assume that file handle accepts
+only I<bytes> and will C<die> if a character larger than 255 is
+written to the handle. When reading, each octet from the handle
+becomes a byte-in-a-character. Note that this default is the same
+behaviour as bytes-only languages (including Perl before v5.6) would
+have, and is sufficient to handle native 8-bit encodings
+e.g. iso-8859-1, EBCDIC etc. and any legacy mechanisms for handling
+other encodings and binary data.
+
+In other cases it is the programs responsibility to transform
+characters into bytes using the API above before doing writes, and to
+transform the bytes read from a handle into characters before doing
+"character operations" (e.g. C<lc>, C</\W+/>, ...).
+
+You can also use PerlIO to convert larger amounts of data you don't
+want to bring into memory. For example to convert between ISO-8859-1
+(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
+
+ open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
+ open(G, ">:utf8", "data.utf") or die $!;
+ while (<F>) { print G }
+
+ # Could also do "print G <F>" but that would pull
+ # the whole file into memory just to write it out again.
+
+More examples:
+
+ open(my $f, "<:encoding(cp1252)")
+ open(my $g, ">:encoding(iso-8859-2)")
+ open(my $h, ">:encoding(latin9)") # iso-8859-15
+
+See also L<encoding> for how to change the default encoding of the
+data in your script.
+
+=head1 How does it work?
+
+Here is a crude diagram of how filehandle, PerlIO, and Encode
+interact.
+
+ filehandle <-> PerlIO PerlIO <-> scalar (read/printed)
+ \ /
+ Encode
+
+When PerlIO receives data from either direction, it fills in the buffer
+(currently with 1024 bytes) and pass the buffer to Encode. Encode tries
+to convert the valid part and pass it back to PerlIO, leaving invalid
+parts (usually partial character) in buffer. PerlIO then appends more
+data in buffer, call Encode, and so on until the data stream ends.
+
+To do so, PerlIO always calls (de|en)code methods with CHECK set to 1.
+this ensures that the method stops at the right place when it
+encounters partial character. The following is what happens when
+PerlIO and Encode tries to encode (from utf8) more than 1024 bytes
+long and the buffer boundary happens to be between a character.
+
+ A B C .... ~ \x{3000} ....
+ 41 42 43 .... 7E e3 80 80 ....
+ <- buffer --------------->
+ << encoded >>>>>>>>>>
+ <- next buffer ------
+
+Encode converts from the beginning to \x7E, leaving \xe3 in the buffer
+because it is invalid (partial character).
+
+Unfortunately, this scheme does not work well with escape-based
+encoding such as ISO-2022-JP. Let's see what happens in that case
+in the next chapter.
+
+=head1 BUGS
+
+Now let's see what happens when you try to decode form ISO-2022-JP and
+the buffer cuts in the middle of a character
+
+ JIS208-ESC \x{5f3e}
+ A B C .... ~ \e $ B |DAN | ....
+ 41 42 43 .... 7E 1b 24 41 43 46 ....
+ <- buffer --------------------------->
+ << encoded >>>>>>>>>>>>>>>>>>>>>>>
+
+As you see, the next buffer begins with \x43. But \x43 is 'C' in
+ASCII, which is wrong in this case because we are now in JISX 0208
+area so it has to convert \x43\x46, not \x43. Unlike utf8 and EUC,
+in escape-based encoding you can't tell if it a given octed is a whole
+character or just part of it.
+
+There are actually several ways to solve this problem but none of
+which is fast enough to be practical. From Encode's point of view
+the easiest solution is for PerlIO to implement line buffer instead
+of fixed-length buffer but that makes PerlIO really complicated.
+
+So for the time being, using escape-based encodings in ":encoding()"
+layer of PerlIO does not work well.
+
+=head2 Workaround
+
+If you still insist, you can at least use ":encoding()" by making sure
+the buffer never gets full. Here is an example.
+
+ use FileHandle;
+ binmode(STDOUT, ":encoding(7bit-jis)");
+ STDOUT->autoflush(1); # don't forget this!
+ for my $l (@lines){ # $l cannot be longer than 1023 bytes
+ print $l;
+ }
+
+=head2 How can you tell my encoding fully supports PerlIO ?
+
+As of this writing, Any encoding which class belongs to Encode::XS and
+Encode::Unicode works. Encode module has C<perlio_ok> method so you
+can use it before appling PerlIO encoding to the filehandle. Here is
+an example;
+
+ my $use_perlio = perlio_ok($enc);
+ my $layer = $use_perlio ? "<:raw" : "<:encoding($enc)";
+ open my $fh, $layer, $file or die "$file : $!";
+ while(<$fh>){
+ $_ = decode($enc, $_) unless $use_perlio;
+ # ....
+ }
+
+=head1 SEE ALSO
+
+L<Encode::Encoding>,
+L<Encode::Supported>,
+L<Encode::PerlIO>,
+L<encoding>,
+L<perlebcdic>,
+L<perlfunc/open>,
+L<perlunicode>,
+L<utf8>,
+the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
+
+
+=cut
+
----------------------------------------------------------------
N. America (ASCII) cp437 AdobeStandardEncoding
cp863 (DOSCanadaF)
- W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep
+ W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep
hp-roman8
cp860 (DOSPortuguese)
Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman
koi8-f
koi8-r cp878 [RFC1489]
koi8-u [RFC2319]
+ ----------------------------------------------------------------
=item gsm0338 - Hentai Latin 1
JIS has not endorsed the full Microsoft standard however.
The official C<Shift_JIS> includes only JIS X 0201 and JIS X 0208
-subsets, while Microsoft has always been meaning C<Shift_JIS> to
-encode a wider character repertoire. See C<IANA> registration for
+character sets, while Microsoft has always been meaning C<Shift_JIS>
+to encode a wider character repertoire. See C<IANA> registration for
C<Windows-31J>.
As a historical predecessor Microsoft's variant
print "1..0 # Skip: Encode was not built\n";
exit 0;
}
- unless (find PerlIO::Layer 'perlio') {
- print "1..0 # Skip: PerlIO was not built\n";
- exit 0;
- }
if (ord("A") == 193) {
print "1..0 # Skip: EBCDIC\n";
exit 0;
$| = 1;
}
use strict;
-use Test::More tests => 27;
+use Test::More tests => 37;
#use Test::More qw(no_plan);
use Encode;
use File::Basename;
my ($src, $uni, $dst, $txt, $euc, $utf, $ref, $rnd);
-ok(defined(my $enc = find_encoding('euc-jp')));
-ok($enc->isa('Encode::XS'));
-is($enc->name,'euc-jp');
+ok(defined(my $enc = find_encoding('euc-jp')), 'find_encoding');
+ok($enc->isa('Encode::XS'), 'ISA');
+is($enc->name,'euc-jp', '$enc->name');
my $dir = dirname(__FILE__);
-my @subcodings = qw(jisx0212 jisx0208);
-
-for my $subcoding (@subcodings){
- $euc = File::Spec->catfile($dir,"$subcoding.euc");
+for my $charset (qw(jisx0201 jisx0212 jisx0208)){
+ $euc = File::Spec->catfile($dir,"$charset.euc");
$utf = File::Spec->catfile($dir,"$$.utf8");
- $ref = File::Spec->catfile($dir,"$subcoding.ref");
+ $ref = File::Spec->catfile($dir,"$charset.ref");
$rnd = File::Spec->catfile($dir,"$$.rnd");
- print "# Basic decode test\n";
- open($src,"<",$euc) || die "Cannot open $euc:$!";
+
+ open($src,"<",$euc) or die "Cannot open $euc:$!";
binmode($src);
- ok(defined($src) && fileno($src));
$txt = join('',<$src>);
- open($dst,">:utf8",$utf) || die "Cannot open $utf:$!";
- binmode($dst);
- ok(defined($dst) && fileno($dst));
- eval{ $uni = $enc->decode($txt,1) };
- $@ and print $@;
- ok(defined($uni));
- is(length($txt),0);
- print $dst $uni;
- close($dst);
close($src);
- ok(compare($utf,$ref) == 0);
-}
-
-print "# Basic encode test\n";
-open($src,"<:utf8",$ref) || die "Cannot open $ref:$!";
-binmode($src);
-ok(defined($src) && fileno($src));
-$uni = join('',<$src>);
-open($dst,">",$rnd) || die "Cannot open $rnd:$!";
-binmode($dst);
-ok(defined($dst) && fileno($dst));
-$txt = $enc->encode($uni,1);
-ok(defined($txt));
-is(length($uni),0);
-print $dst $txt;
-close($dst);
-close($src);
-ok(compare($euc,$rnd) == 0);
-
-is($enc->name,'euc-jp');
-
-my $skip_perlio;
-eval { require PerlIO::encoding; };
-if ($@){
- $skip_perlio = 1;
-}else{
- $skip_perlio = 0;
- binmode(STDIN);
-}
-
-$skip_perlio ||= (@ARGV and shift eq 'perlio');
+
+ eval{ $uni = $enc->decode($txt, 1) };
+ $@ and print $@;
+ ok(defined($uni), "decode $charset");
+ is(length($txt),0, "decode $charset completely");
-SKIP: {
- skip "PerlIO Encoding Needed", 6 if $skip_perlio;
- print "# src :encoding test\n";
- open($src,"<encoding(euc-jp)",$euc) || die "Cannot open $euc:$!";
- binmode($src);
- ok(defined($src) && fileno($src));
- open($dst,">:utf8",$utf) || die "Cannot open $utf:$!";
+ open($dst,">:utf8",$utf) or die "Cannot open $utf:$!";
binmode($dst);
- ok(defined($dst) || fileno($dst));
- my $out = select($dst);
- while (<$src>){ print; }
- close($dst);
- close($src);
+ print $dst $uni;
+ close($dst);
+ is(compare($utf, $ref), 0, "$utf eq $ref");
+
+ open $src, "<:utf8", $ref or die "$ref : $!";
+ $uni = join('', <$src>);
+ close $src;
- TODO:
- {
- local $TODO = 'needs debugging on VMS' if $^O eq 'VMS';
- ok(compare($utf,$ref) == 0);
- }
- select($out);
+ for my $canon (qw(euc-jp shiftjis
+ 7bit-jis iso-2022-jp iso-2022-jp-1)){
+ my $test = \&is;
+ if ($charset eq 'jisx0201'){
+ $canon eq 'iso-2022-jp' and $test = \&isnt;
+ $canon eq 'iso-2022-jp-1' and $test = \&isnt;
+ }elsif($charset eq 'jisx0212'){
+ $canon eq 'shiftjis' and $test = \&isnt;
+ $canon eq 'iso-2022-jp' and $test = \&isnt;
+ }
+ my $rt = ($test eq \&is) ? 'RT' : 'non-RT';
+ $test->($uni, decode($canon, encode($canon, $uni)),
+ "$rt $charset $canon");
+
+ }
- print "# dst :encoding test\n";
- open($src,"<:utf8",$ref) || die "Cannot open $ref:$!";
- binmode($src);
- ok(defined($src) || fileno($src));
- open($dst,">encoding(euc-jp)",$rnd) || die "Cannot open $rnd:$!";
+ eval{ $txt = $enc->encode($uni,1) };
+ $@ and print $@;
+ ok(defined($txt), "encode $charset");
+ is(length($uni), 0, "encode $charset completely");
+
+ open($dst,">", $rnd) or die "Cannot open $utf:$!";
binmode($dst);
- ok(defined($dst) || fileno($dst));
- $out = select($dst);
- while (<$src>) { print; }
- close($dst);
- close($src);
- ok(compare($euc,$rnd) == 0);
- select($out);
+ print $dst $txt;
+ close($dst);
+ is(compare($euc, $rnd), 0 => "$rnd eq $euc");
}
-is($enc->name,'euc-jp');
-
END {
1 while unlink($utf,$rnd);
}
$| = 1;
}
use strict;
-use Test::More tests => 22;
+use Test::More tests => 15;
#use Test::More qw(no_plan);
use Encode;
use File::Basename;
is($enc->name,'euc-kr');
-my $skip_perlio;
-eval { require PerlIO::encoding; };
-if ($@){
- $skip_perlio = 1;
-}else{
- $skip_perlio = 0;
- binmode(STDIN);
-}
-
-$skip_perlio ||= (@ARGV and shift eq 'perlio');
-
-SKIP: {
- skip "PerlIO Encoding Needed", 6 if $skip_perlio;
- print "# src :encoding test\n";
- open($src,"<encoding(euc-kr)",$euc) || die "Cannot open $euc:$!";
- binmode($src);
- ok(defined($src) && fileno($src));
- open($dst,">:utf8",$utf) || die "Cannot open $utf:$!";
- binmode($dst);
- ok(defined($dst) || fileno($dst));
- my $out = select($dst);
- while (<$src>) { print; }
- close($dst);
- close($src);
-
- TODO:
- {
- local $TODO = 'needs debugging on VMS' if $^O eq 'VMS';
- ok(compare($utf,$ref) == 0);
- }
- select($out);
-
- print "# dst :encoding test\n";
- open($src,"<:utf8",$ref) || die "Cannot open $ref:$!";
- binmode($src);
- ok(defined($src) || fileno($src));
- open($dst,">encoding(euc-kr)",$rnd) || die "Cannot open $rnd:$!";
- binmode($dst);
- ok(defined($dst) || fileno($dst));
- $out = select($dst);
- while (<$src>) { print; }
- close($dst);
- close($src);
- ok(compare($euc,$rnd) == 0);
- select($out);
-}
-
-is($enc->name,'euc-kr');
-
END {
1 while unlink($utf,$rnd);
}
--- /dev/null
+BEGIN {
+ if ($ENV{'PERL_CORE'}){
+ chdir 't';
+ unshift @INC, '../lib';
+ }
+ require Config; import Config;
+ if ($Config{'extensions'} !~ /\bEncode\b/) {
+ print "1..0 # Skip: Encode was not built\n";
+ exit 0;
+ }
+ $| = 1;
+}
+
+use strict;
+#use Test::More qw(no_plan);
+use Test::More tests => 15;
+use Encode q(:all);
+
+
+my $original = '';
+my $nofallback = '';
+my ($fallenback, $quiet, $perlqq);
+for my $i (0x20..0x7e){
+ $original .= chr($i);
+}
+$fallenback = $quiet = $perlqq = $nofallback = $original;
+
+my $residue = '';
+for my $i (0x80..0xff){
+ $original .= chr($i);
+ $residue .= chr($i);
+ $fallenback .= '?';
+ $perlqq .= sprintf("\\x{%04x}", $i);
+}
+utf8::upgrade($original);
+my $meth = find_encoding('ascii');
+
+my $src = $original;
+my $dst = $meth->encode($src, FB_DEFAULT);
+is($dst, $fallenback, "FB_DEFAULT");
+is($src, $original, "FB_DEFAULT residue");
+
+$src = $original;
+eval{ $dst = $meth->encode($src, FB_CROAK) };
+like($@, qr/does not map to ascii/o, "FB_CROAK");
+is($src, $original, "FB_CROAK residue");
+
+$src = $original;
+eval{ $dst = $meth->encode($src, FB_CROAK) };
+like($@, qr/does not map to ascii/o, "FB_CROAK");
+is($src, $original, "FB_CROAK residue");
+
+
+$src = $nofallback;
+eval{ $dst = $meth->encode($src, FB_CROAK) };
+is($@, '', "FB_CROAK on success");
+is($src, '', "FB_CROAK on success residue");
+
+$src = $original;
+$dst = $meth->encode($src, FB_QUIET);
+is($dst, $quiet, "FB_QUIET");
+is($src, $residue, "FB_QUIET residue");
+
+{
+ my $message;
+ local $SIG{__WARN__} = sub { $message = $_[0] };
+ $src = $original;
+ $dst = $meth->encode($src, FB_WARN);
+ is($dst, $quiet, "FB_WARN");
+ is($src, $residue, "FB_WARN residue");
+ like($message, qr/does not map to ascii/o, "FB_WARN message");
+}
+
+$src = $original;
+$dst = $meth->encode($src, FB_PERLQQ);
+is($dst, $perlqq, "FB_PERLQQ");
+is($src, '', "FB_PERLQQ residue");
--- /dev/null
+0x0020: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = >
+0x0040: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^
+0x0060: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
+0x8ea0: ¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ ® ¯ ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾
+0x8ec0: À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ
+0x8ee0:
--- /dev/null
+0x0020: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = >
+0x0040: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^
+0x0060: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
+0x8ea0: 。 「 」 、 ・ ヲ ァ ィ ゥ ェ ォ ャ ュ ョ ッ ー ア イ ウ エ オ カ キ ク ケ コ サ シ ス セ
+0x8ec0: タ チ ツ テ ト ナ ニ ヌ ネ ノ ハ ヒ フ ヘ ホ マ ミ ム メ モ ヤ ユ ヨ ラ リ ル レ ロ ワ ン ゙
+0x8ee0:
-0x0020: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = >
-0x0040: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^
-0x0060: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
-0x8ea0: ¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ ® ¯ ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾
-0x8ec0: À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ
-0x8ee0:
0xa0a0:
0xa0c0:
0xa0e0:
-0x0020: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = >
-0x0040: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^
-0x0060: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
-0x8ea0: 。 「 」 、 ・ ヲ ァ ィ ゥ ェ ォ ャ ュ ョ ッ ー ア イ ウ エ オ カ キ ク ケ コ サ シ ス セ
-0x8ec0: タ チ ツ テ ト ナ ニ ヌ ネ ノ ハ ヒ フ ヘ ホ マ ミ ム メ モ ヤ ユ ヨ ラ リ ル レ ロ ワ ン ゙
-0x8ee0:
0xa0a0:
0xa0c0:
0xa0e0:
--- /dev/null
+BEGIN {
+ if ($ENV{'PERL_CORE'}){
+ chdir 't';
+ unshift @INC, '../lib';
+ }
+ require Config; import Config;
+ if ($Config{'extensions'} !~ /\bEncode\b/) {
+ print "1..0 # Skip: Encode was not built\n";
+ exit 0;
+ }
+ if (ord("A") == 193) {
+ print "1..0 # Skip: EBCDIC\n";
+ exit 0;
+ }
+ require Encode;
+ unless ($INC{"PerlIO/encoding.pm"}
+ and PerlIO::encoding->VERSION >= 0.02
+ ){
+ print "1..0 # Skip:: PerlIO::encoding 0.02 or better required\n";
+ exit 0;
+ }
+ # warn "PerlIO::encoding->VERSION == ", PerlIO::encoding->VERSION, "\n";
+ $| = 1;
+}
+
+use strict;
+use File::Basename;
+use File::Spec;
+use File::Compare;
+use FileHandle;
+
+#use Test::More qw(no_plan);
+use Test::More tests => 20;
+
+our $DEBUG = 0;
+
+{
+ no warnings;
+ @ARGV and $DEBUG = shift;
+ require Encode::JP::JIS7;
+ $Encode::JP::JIS7::DEBUG = $DEBUG;
+}
+
+Encode->import(":all");
+
+my $dir = dirname(__FILE__);
+my $ufile = File::Spec->catfile($dir,"jisx0208.ref");
+open my $fh, "<:utf8", $ufile or die "$ufile : $!";
+my @uline = <$fh>;
+my $utext = join('' => @uline);
+close $fh;
+
+for my $e (qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/){
+ my $sfile = File::Spec->catfile($dir,"$$.sio");
+ my $pfile = File::Spec->catfile($dir,"$$.pio");
+
+ # first create a file without perlio
+ open $fh, ">", $sfile or die "$sfile :$!";
+ binmode $fh;
+ print $fh &encode($e, $utext, 0);
+ close $fh;
+
+ # then create a file via perlio without autoflush
+
+ TODO:{
+ todo_skip "$e: !perlio_ok", 1 unless perlio_ok($e);
+ open $fh, ">:encoding($e)", $pfile or die "$sfile : $!";
+ $fh->autoflush(0);
+ print $fh $utext;
+ close $fh;
+ ok(compare($sfile, $pfile) == 0 => ">:encoding($e)");
+ }
+
+ # this time print line by line.
+ # works even for ISO-2022!
+ open $fh, ">:encoding($e)", $pfile or die "$sfile : $!";
+ $fh->autoflush(1);
+ for my $l (@uline) {
+ print $fh $l;
+ }
+ close $fh;
+ is(compare($sfile, $pfile), 0 => ">:encoding($e); line-by-line");
+
+ TODO:{
+ todo_skip "$e: !perlio_ok", 2 unless perlio_ok($e);
+ open $fh, "<:encoding($e)", $pfile or die "$pfile : $!";
+ $fh->autoflush(0);
+ my $dtext = join('' => <$fh>);
+ close $fh;
+ ok($utext eq $dtext, "<:encoding($e)");
+ $dtext = '';
+ open $fh, "<:encoding($e)", $pfile or die "$pfile : $!";
+ while(defined(my $l = <$fh>)) {
+ $dtext .= $l;
+ }
+ close $fh;
+ ok($utext eq $dtext, "<:encoding($e); line-by-line");
+ }
+ $DEBUG or unlink ($sfile, $pfile);
+}
+