From: Nick Ing-Simmons Date: Sat, 20 Apr 2002 18:37:38 +0000 (+0000) Subject: Various tweaks to Encode X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ca777f1ce1ff5cad164a769e07ab2e03b14695b6;p=p5sagit%2Fp5-mst-13.2.git Various tweaks to Encode p4raw-id: //depot/perlio@16022 --- diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index d1c5494..3b3fd97 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -2,11 +2,11 @@ package Encode; use strict; our $VERSION = do { my @r = (q$Revision: 1.50 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; +use XSLoader (); +XSLoader::load 'Encode'; -require DynaLoader; require Exporter; - -our @ISA = qw(Exporter DynaLoader); +our @ISA = qw(Exporter); # Public, encouraged API is exported by default @@ -19,7 +19,7 @@ our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ); our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK); our @EXPORT_OK = - ( + ( qw( _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade @@ -27,16 +27,13 @@ our @EXPORT_OK = @FB_FLAGS, @FB_CONSTS, ); -our %EXPORT_TAGS = +our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ], fallbacks => [ @FB_CONSTS ], fallback_all => [ @FB_CONSTS, @FB_FLAGS ], ); - -bootstrap Encode (); - # Documentation moved after __END__ for speed - NI-S use Carp; @@ -57,7 +54,7 @@ sub encodings my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_; for my $mod (@modules){ $mod =~ s,::,/,g or $mod = "Encode/$mod"; - $mod .= '.pm'; + $mod .= '.pm'; $DEBUG and warn "about to require $mod;"; eval { require $mod; }; } @@ -193,7 +190,7 @@ predefine_encodings(); # This is to restore %Encoding if really needed; # sub predefine_encodings{ - if ($ON_EBCDIC) { + if ($ON_EBCDIC) { # was in Encode::UTF_EBCDIC package Encode::UTF_EBCDIC; *name = sub{ shift->{'Name'} }; @@ -202,7 +199,7 @@ sub predefine_encodings{ my ($obj,$str,$chk) = @_; my $res = ''; for (my $i = 0; $i < length($str); $i++) { - $res .= + $res .= chr(utf8::unicode_to_native(ord(substr($str,$i,1)))); } $_[1] = '' if $chk; @@ -212,15 +209,15 @@ sub predefine_encodings{ my ($obj,$str,$chk) = @_; my $res = ''; for (my $i = 0; $i < length($str); $i++) { - $res .= + $res .= chr(utf8::native_to_unicode(ord(substr($str,$i,1)))); } $_[1] = '' if $chk; return $res; }; - $Encode::Encoding{Unicode} = + $Encode::Encoding{Unicode} = bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC"; - } else { + } else { # was in Encode::UTF_EBCDIC package Encode::Internal; *name = sub{ shift->{'Name'} }; @@ -232,7 +229,7 @@ sub predefine_encodings{ return $str; }; *encode = \&decode; - $Encode::Encoding{Unicode} = + $Encode::Encoding{Unicode} = bless {Name => "Internal"} => "Encode::Internal"; } @@ -256,15 +253,14 @@ sub predefine_encodings{ $_[1] = '' if $chk; return $octets; }; - $Encode::Encoding{utf8} = + $Encode::Encoding{utf8} = bless {Name => "utf8"} => "Encode::utf8"; } } require Encode::Encoding; +@Encode::XS::ISA = qw(Encode::Encoding); -eval qq{ use PerlIO::encoding 0.02 }; -# warn $@ if $@; 1; @@ -281,14 +277,14 @@ Encode - character encodings =head2 Table of Contents -Encode consists of a collection of modules which details are too big +Encode consists of a collection of modules which details are too big to fit in one document. This POD itself explains the top-level APIs -and general topics at a glance. For other topics and more details, +and general topics at a glance. For other topics and more details, see the PODs below; Name Description -------------------------------------------------------- - Encode::Alias Alias defintions to encodings + Encode::Alias Alias definitions to encodings Encode::Encoding Encode Implementation Base Class Encode::Supported List of Supported Encodings Encode::CN Simplified Chinese Encodings @@ -359,7 +355,7 @@ alias. For encoding names and aliases, see L. For CHECK see L. For example to convert (internally UTF-8 encoded) Unicode string to -iso-8859-1 (also known as Latin1), +iso-8859-1 (also known as Latin1), $octets = encode("iso-8859-1", $unicode); @@ -439,7 +435,7 @@ When "::" is not in the name, "Encode::" is assumed. @ebcdic = Encode->encodings("EBCDIC"); -To find which encodings are supported by this package in details, +To find which encodings are supported by this package in details, see L. =head2 Defining Aliases @@ -462,7 +458,7 @@ i.e. Encode::resolve_alias("iso-8859-12") # false; nonexistent Encode::resolve_alias($name) eq $name # true if $name is canonical -This resolve_alias() does not need C and is +This resolve_alias() does not need C and is exported via C. See L on details. @@ -481,7 +477,7 @@ totally identical by functionality. # via from_to open my $in, $infile or die; open my $out, $outfile or die; - while(<>){ + while(<>){ from_to($_, "shiftjis", "euc", 1); } @@ -508,7 +504,7 @@ If I is 0, (en|de)code will put I in place of the malformed character. for UCM-based encodings, EsubcharE will be used. For Unicode, \xFFFD is used. If the data is supposed to be UTF-8, an optional lexical warning (category -utf8) is given. +utf8) is given. =item I = Encode::DIE_ON_ERROR (== 1) @@ -519,10 +515,10 @@ with eval{} unless you really want to let it die on error. =item I = Encode::FB_QUIET If I is set to Encode::FB_QUIET, (en|de)code will immediately -return proccessed part on error, with data passed via argument -overwritten with unproccessed part. This is handy when have to +return processed part on error, with data passed via argument +overwritten with unprocessed part. This is handy when have to repeatedly call because the source data is chopped in the middle for -some reasons, such as fixed-width buffer. Here is a sample code that +some reasons, such as fixed-width buffer. Here is a sample code that just does this. my $data = ''; @@ -547,7 +543,7 @@ When you decode, '\xI' will be placed where I is the hex representation of the octet that could not be decoded to utf8. And when you encode, '\x{I}' will be placed where I is the Unicode ID of the character that cannot be found in the character -repartoire of the encoding. +repertoire of the encoding. =item The bitmask @@ -616,12 +612,12 @@ not a string. L, L, -L, +L, L, -L, -L, -L, -L, +L, +L, +L, +L, the Perl Unicode Mailing List Eperl-unicode@perl.orgE =head1 MAINTAINER diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index b42668b..2796316 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -193,8 +193,8 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, } } ENCODE_SET_SRC: - if (check & ~ENCODE_LEAVE_SRC){ - sdone = SvCUR(src) - (slen+sdone); + if (check && !(check & ENCODE_LEAVE_SRC)){ + sdone = SvCUR(src) - (slen+sdone); if (sdone) { sv_setpvn(src, (char*)s+slen, sdone); } diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs index 4e21de9..9924ae2 100644 --- a/ext/Encode/Unicode/Unicode.xs +++ b/ext/Encode/Unicode/Unicode.xs @@ -6,6 +6,8 @@ #include "EXTERN.h" #include "perl.h" #include "XSUB.h" +#define U8 U8 +#include "../Encode/encode.h" #define FBCHAR 0xFFFd #define BOM_BE 0xFeFF @@ -80,11 +82,13 @@ enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) MODULE = Encode::Unicode PACKAGE = Encode::Unicode +PROTOTYPES: DISABLE + void -decode_xs(obj, str, chk = &PL_sv_undef) +decode_xs(obj, str, check = 0) SV * obj SV * str -SV * chk +IV check CODE: { int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); @@ -124,14 +128,14 @@ CODE: U8 *d; if (size != 4 && invalid_ucs2(ord)) { if (ucs2) { - if (SvTRUE(chk)) { + if (check) { croak("%s:no surrogates allowed %"UVxf, SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)), ord); } if (s+size <= e) { /* skip the next one as well */ - enc_unpack(aTHX_ &s,e,size,endian); + enc_unpack(aTHX_ &s,e,size,endian); } ord = FBCHAR; } @@ -160,10 +164,12 @@ CODE: d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); SvCUR_set(result,d - (U8 *)SvPVX(result)); } - if (SvTRUE(chk)) { - if (s < e) { + if (s < e) { Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); + } + if (check && !(check & ENCODE_LEAVE_SRC)){ + if (s < e) { Move(s,SvPVX(str),e-s,U8); SvCUR_set(str,(e-s)); } @@ -176,10 +182,10 @@ CODE: } void -encode_xs(obj, utf8, chk = &PL_sv_undef) - SV * obj +encode_xs(obj, utf8, check = 0) +SV * obj SV * utf8 -SV * chk +IV check CODE: { int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); @@ -205,7 +211,7 @@ CODE: if (size != 4 && invalid_ucs2(ord)) { if (!issurrogate(ord)){ if (ucs2) { - if (SvTRUE(chk)) { + if (check) { croak("%s:code point \"\\x{"UVxf"}\" too high", SvPV_nolen( *hv_fetch((HV *)SvRV(obj),"Name",4,0)) @@ -228,10 +234,12 @@ CODE: enc_pack(aTHX_ result,size,endian,ord); } } - if (SvTRUE(chk)) { + if (s < e) { + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", + SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); + } + if (check && !(check & ENCODE_LEAVE_SRC)){ if (s < e) { - Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character", - SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0))); Move(s,SvPVX(utf8),e-s,U8); SvCUR_set(utf8,(e-s)); } diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm index 88594d1..73334d7 100644 --- a/ext/Encode/lib/Encode/Encoding.pm +++ b/ext/Encode/lib/Encode/Encoding.pm @@ -20,6 +20,8 @@ sub fromUnicode { shift->encode(@_) } sub new_sequence { return $_[0] } +sub needs_lines { 0 } + sub DESTROY {} 1; diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm index 18d8b16..a31ae2e 100644 --- a/ext/Encode/lib/Encode/JP/JIS7.pm +++ b/ext/Encode/lib/Encode/JP/JIS7.pm @@ -7,8 +7,8 @@ require Encode; for my $name ('7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1'){ my $h2z = ($name eq '7bit-jis') ? 0 : 1; my $jis0212 = ($name eq 'iso-2022-jp') ? 0 : 1; - - $Encode::Encoding{$name} = + + $Encode::Encoding{$name} = bless { Name => $name, h2z => $h2z, @@ -17,7 +17,10 @@ for my $name ('7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1'){ } sub name { shift->{'Name'} } -sub new_sequence { $_[0] }; + +sub new_sequence { $_[0] } + +sub needs_lines { 1 } use Encode::CJKConstants qw(:all); @@ -87,7 +90,7 @@ sub euc_jis{ ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) }{ my $chunk = $1; - my $esc = + my $esc = ( $chunk =~ tr/\x8E//d ) ? $ESC{KANA} : ( $chunk =~ tr/\x8F//d ) ? $ESC{JIS_0212} : $ESC{JIS_0208}; diff --git a/ext/Encode/lib/Encode/KR/2022_KR.pm b/ext/Encode/lib/Encode/KR/2022_KR.pm index c71f0e4..f668b56 100644 --- a/ext/Encode/lib/Encode/KR/2022_KR.pm +++ b/ext/Encode/lib/Encode/KR/2022_KR.pm @@ -13,6 +13,8 @@ $obj->Define($canon); sub name { return $_[0]->{name}; } +sub needs_lines { 1 } + sub decode { my ($obj,$str,$chk) = @_; @@ -35,14 +37,14 @@ use Encode::CJKConstants qw(:all); sub iso_euc{ my $r_str = shift; - $$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator + $$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator $$r_str =~ s{ # replace chars. in GL \x0e # between SO(\x0e) and SI(\x0f) ([^\x0f]*) # with chars. in GR \x0f } { - my $out= $1; + my $out= $1; $out =~ tr/\x21-\x7e/\xa1-\xfe/; $out; }geox; @@ -51,7 +53,7 @@ sub iso_euc{ sub euc_iso{ my $r_str = shift; - substr($$r_str,0,0)=$ESC{'2022_KR'}; # put the designator at the beg. + substr($$r_str,0,0)=$ESC{'2022_KR'}; # put the designator at the beg. $$r_str =~ s{ # move KS X 1001 chars. in GR to GL ($RE{EUC_C}+) # and enclose them with SO and SI }{ diff --git a/ext/Encode/t/perlio.t b/ext/Encode/t/perlio.t index 74e3e7b..8d55d85 100644 --- a/ext/Encode/t/perlio.t +++ b/ext/Encode/t/perlio.t @@ -13,7 +13,8 @@ BEGIN { exit 0; } require Encode; - unless ($INC{"PerlIO/encoding.pm"} + eval { require PerlIO::encoding }; + unless ($INC{"PerlIO/encoding.pm"} and PerlIO::encoding->VERSION >= 0.02 ){ print "1..0 # Skip:: PerlIO::encoding 0.02 or better required\n"; @@ -95,7 +96,7 @@ for my $e (qw/euc-jp shiftjis 7bit-jis iso-2022-jp iso-2022-jp-1/){ } close $fh; ok($utext eq $dtext, "<:encoding($e); line-by-line"); - } + } $DEBUG or unlink ($sfile, $pfile); }