From: Jarkko Hietaniemi Date: Thu, 16 Jan 2003 20:36:23 +0000 (+0000) Subject: Fix 'use encoding' I/O for code points 0x80..0xFF; X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=88632417a970dff8f92718b0800b1aa1400cb4ae;p=p5sagit%2Fp5-mst-13.2.git Fix 'use encoding' I/O for code points 0x80..0xFF; code changes from Inaba Hiroto; test tweaks by jhi. p4raw-id: //depot/perl@18496 --- diff --git a/MANIFEST b/MANIFEST index 9ebb6a6..be7882c 100644 --- a/MANIFEST +++ b/MANIFEST @@ -259,6 +259,7 @@ ext/Encode/t/big5-hkscs.utf test data ext/Encode/t/CJKT.t test script ext/Encode/t/Encode.t test script ext/Encode/t/Encoder.t test script +ext/Encode/t/enc_eucjp.t test script ext/Encode/t/enc_utf8.t test script ext/Encode/t/encoding.t test script ext/Encode/t/fallback.t test script diff --git a/doio.c b/doio.c index e23a2ca..3ae3764 100644 --- a/doio.c +++ b/doio.c @@ -1268,7 +1268,8 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp) default: if (PerlIO_isutf8(fp)) { if (!SvUTF8(sv)) - sv_utf8_upgrade(sv = sv_mortalcopy(sv)); + sv_utf8_upgrade_flags(sv = sv_mortalcopy(sv), + SV_GMAGIC|SV_UTF8_NO_ENCODING); } else if (DO_UTF8(sv)) { if (!sv_utf8_downgrade((sv = sv_mortalcopy(sv)), TRUE) diff --git a/ext/Encode/encoding.pm b/ext/Encode/encoding.pm index e8aa737..1a43790 100644 --- a/ext/Encode/encoding.pm +++ b/ext/Encode/encoding.pm @@ -29,8 +29,7 @@ sub import { Carp::croak("Unknown encoding '$name'"); } unless ($arg{Filter}) { - ${^ENCODING} = $enc # this is all you need, actually. - unless $name =~ /^(?:utf-?(?:8|16|32)|ucs-?(?:2|4))(?:[bl]e)?$/i; + ${^ENCODING} = $enc; $HAS_PERLIO or return 1; for my $h (qw(STDIN STDOUT)){ if ($arg{$h}){ diff --git a/ext/Encode/t/enc_eucjp.t b/ext/Encode/t/enc_eucjp.t new file mode 100644 index 0000000..019b426 --- /dev/null +++ b/ext/Encode/t/enc_eucjp.t @@ -0,0 +1,66 @@ +# This is the twin of enc_utf8.t, the only difference is that +# this has "use encoding 'euc-jp'". + +BEGIN { + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + unless (find PerlIO::Layer 'perlio') { + print "1..0 # Skip: PerlIO was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + exit(0); + } +} + +use encoding 'euc-jp'; + +my @c = (127, 128, 255, 256); + +print "1.." . (scalar @c + 1) . "\n"; + +my @f; + +for my $i (0..$#c) { + push @f, "f$i"; + open(F, ">f$i") or die "$0: failed to open 'f$i' for writing: $!"; + binmode(F, ":utf8"); + print F chr($c[$i]); + close F; +} + +my $t = 1; + +for my $i (0..$#c) { + open(F, "; + my $o = ord($c); + print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$i]: $o != $c[$i]\n"; + $t++; +} + +my $f = "f" . @f; + +push @f, $f; +open(F, ">$f") or die "$0: failed to open '$f' for writing: $!"; +binmode(F, ":raw"); # Output raw bytes. +print F chr(128); # Output illegal UTF-8. +close F; +open(F, $f) or die "$0: failed to open '$f' for reading: $!"; +binmode(F, ":encoding(utf-8)"); +{ + local $^W = 1; + local $SIG{__WARN__} = sub { $a = shift }; + eval { }; # This should get caught. +} +print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? + "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n"; + +END { + 1 while unlink @f; +} diff --git a/ext/Encode/t/enc_utf8.t b/ext/Encode/t/enc_utf8.t index 20eb288..6271fe6 100644 --- a/ext/Encode/t/enc_utf8.t +++ b/ext/Encode/t/enc_utf8.t @@ -1,3 +1,6 @@ +# This is the twin of enc_eucjp.t, the only difference is that +# this has "use encoding 'utf8'". + BEGIN { require Config; import Config; if ($Config{'extensions'} !~ /\bEncode\b/) { @@ -37,11 +40,11 @@ for my $i (0..$#c) { binmode(F, ":utf8"); my $c = ; my $o = ord($c); - print $o == $c[$i] ? "ok $t\n" : "not ok $t # $o != $c[$i]\n"; + print $o == $c[$i] ? "ok $t - utf8 I/O $c[$i]\n" : "not ok $t - utf8 I/O $c[$$i]: $o != $c[$i]\n"; $t++; } -my $f = "f4"; +my $f = "f" . @f; push @f, $f; open(F, ">$f") or die "$0: failed to open '$f' for writing: $!"; @@ -56,7 +59,7 @@ binmode(F, ":encoding(utf-8)"); eval { }; # This should get caught. } print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? - "ok $t\n" : "not ok $t: $a\n"; + "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n"; END { 1 while unlink @f; diff --git a/pp.c b/pp.c index c78246e..c9d1dc6 100644 --- a/pp.c +++ b/pp.c @@ -3278,8 +3278,19 @@ PP(pp_chr) *tmps++ = (char)value; *tmps = '\0'; (void)SvPOK_only(TARG); - if (PL_encoding) + if (PL_encoding && !IN_BYTES) { sv_recode_to_utf8(TARG, PL_encoding); + tmps = SvPVX(TARG); + if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) || + memEQ(tmps, "\xef\xbf\xbd\0", 4)) { + SvGROW(TARG,3); + SvCUR_set(TARG, 2); + *tmps++ = (U8)UTF8_EIGHT_BIT_HI(value); + *tmps++ = (U8)UTF8_EIGHT_BIT_LO(value); + *tmps = '\0'; + SvUTF8_on(TARG); + } + } XPUSHs(TARG); RETURN; } diff --git a/sv.c b/sv.c index ffa71e1..33e2202 100644 --- a/sv.c +++ b/sv.c @@ -3395,7 +3395,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) sv_force_normal_flags(sv, 0); } - if (PL_encoding) + if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING)) sv_recode_to_utf8(sv, PL_encoding); else { /* Assume Latin-1/EBCDIC */ /* This function could be much more efficient if we diff --git a/sv.h b/sv.h index 7c5e6dc..598397e 100644 --- a/sv.h +++ b/sv.h @@ -1030,6 +1030,7 @@ otherwise. #define SV_IMMEDIATE_UNREF 1 #define SV_GMAGIC 2 #define SV_COW_DROP_PV 4 +#define SV_UTF8_NO_ENCODING 8 /* We are about to replace the SV's current value. So if it's copy on write we need to normalise it. Use the SV_COW_DROP_PV flag hint to say that diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t index ee95c36..12f3516 100644 --- a/t/uni/tr_utf8.t +++ b/t/uni/tr_utf8.t @@ -62,7 +62,6 @@ is($str, $hiragana, "s/// # hiragana -> katakana"); # [perl 16843] my $line = 'abcdefghijklmnopqrstuvwxyz$0123456789'; $line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/; -# is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]"); - ok(1, "TODO: Encode 1.84 broke the test for perl #16843"); + is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]"); } __END__