From: SADAHIRO Tomoyuki Date: Sun, 14 May 2006 19:57:28 +0000 (+0900) Subject: strange encodings upsets pp_chr X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4c5ed6e2fe45844ca952edb0ad5be618e204247b;p=p5sagit%2Fp5-mst-13.2.git strange encodings upsets pp_chr Message-Id: <20060514195532.5422.BQW10602@nifty.com> p4raw-id: //depot/perl@28193 --- diff --git a/MANIFEST b/MANIFEST index e6873e3..5fc8a5d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3538,6 +3538,7 @@ t/TestInit.pm Preamble library for core tests t/test.pl Simple testing library t/uni/case.pl See if Unicode casing works t/uni/chomp.t See if Unicode chomp works +t/uni/chr.t See if Unicode chr works t/uni/class.t See if Unicode classes work (\p) t/uni/fold.t See if Unicode folding works t/uni/lower.t See if Unicode casing works diff --git a/pp.c b/pp.c index 7540c99..fb220a0 100644 --- a/pp.c +++ b/pp.c @@ -3367,20 +3367,21 @@ PP(pp_chr) *tmps++ = (char)value; *tmps = '\0'; (void)SvPOK_only(TARG); + if (PL_encoding && !IN_BYTES) { sv_recode_to_utf8(TARG, PL_encoding); tmps = SvPVX(TARG); if (SvCUR(TARG) == 0 || !is_utf8_string((U8*)tmps, SvCUR(TARG)) || - memEQ(tmps, "\xef\xbf\xbd\0", 4)) { - SvGROW(TARG, 3); + UNICODE_IS_REPLACEMENT(utf8_to_uvchr((U8*)tmps, NULL))) { + SvGROW(TARG, 2); tmps = SvPVX(TARG); - SvCUR_set(TARG, 2); - *tmps++ = (U8)UTF8_EIGHT_BIT_HI(value); - *tmps++ = (U8)UTF8_EIGHT_BIT_LO(value); + SvCUR_set(TARG, 1); + *tmps++ = (char)value; *tmps = '\0'; - SvUTF8_on(TARG); + SvUTF8_off(TARG); } } + XPUSHs(TARG); RETURN; } diff --git a/t/op/chr.t b/t/op/chr.t index e63c3b5..056f11a 100644 --- a/t/op/chr.t +++ b/t/op/chr.t @@ -31,7 +31,9 @@ is(chr(-3.0), "\x{FFFD}"); is(chr(-3.0), "\xFD"); } -# Check UTF-8. +# Check UTF-8 (not UTF-EBCDIC). +SKIP: { + skip "no UTF-8 on EBCDIC", 21 if chr(193) eq 'A'; sub hexes { no warnings 'utf8'; # avoid surrogate and beyond Unicode warnings @@ -39,25 +41,25 @@ sub hexes { } # The following code points are some interesting steps in UTF-8. -is(hexes( 0x100), "c4 80"); -is(hexes( 0x7FF), "df bf"); -is(hexes( 0x800), "e0 a0 80"); -is(hexes( 0xFFF), "e0 bf bf"); -is(hexes( 0x1000), "e1 80 80"); -is(hexes( 0xCFFF), "ec bf bf"); -is(hexes( 0xD000), "ed 80 80"); -is(hexes( 0xD7FF), "ed 9f bf"); -is(hexes( 0xD800), "ed a0 80"); # not strict utf-8 (surrogate area begin) -is(hexes( 0xDFFF), "ed bf bf"); # not strict utf-8 (surrogate area end) -is(hexes( 0xE000), "ee 80 80"); -is(hexes( 0xFFFF), "ef bf bf"); -is(hexes( 0x10000), "f0 90 80 80"); -is(hexes( 0x3FFFF), "f0 bf bf bf"); -is(hexes( 0x40000), "f1 80 80 80"); -is(hexes( 0xFFFFF), "f3 bf bf bf"); -is(hexes(0x100000), "f4 80 80 80"); -is(hexes(0x10FFFF), "f4 8f bf bf"); # Unicode (4.1) last code point -is(hexes(0x110000), "f4 90 80 80"); -is(hexes(0x1FFFFF), "f7 bf bf bf"); # last four byte encoding -is(hexes(0x200000), "f8 88 80 80 80"); - + is(hexes( 0x100), "c4 80"); + is(hexes( 0x7FF), "df bf"); + is(hexes( 0x800), "e0 a0 80"); + is(hexes( 0xFFF), "e0 bf bf"); + is(hexes( 0x1000), "e1 80 80"); + is(hexes( 0xCFFF), "ec bf bf"); + is(hexes( 0xD000), "ed 80 80"); + is(hexes( 0xD7FF), "ed 9f bf"); + is(hexes( 0xD800), "ed a0 80"); # not strict utf-8 (surrogate area begin) + is(hexes( 0xDFFF), "ed bf bf"); # not strict utf-8 (surrogate area end) + is(hexes( 0xE000), "ee 80 80"); + is(hexes( 0xFFFF), "ef bf bf"); + is(hexes( 0x10000), "f0 90 80 80"); + is(hexes( 0x3FFFF), "f0 bf bf bf"); + is(hexes( 0x40000), "f1 80 80 80"); + is(hexes( 0xFFFFF), "f3 bf bf bf"); + is(hexes(0x100000), "f4 80 80 80"); + is(hexes(0x10FFFF), "f4 8f bf bf"); # Unicode (4.1) last code point + is(hexes(0x110000), "f4 90 80 80"); + is(hexes(0x1FFFFF), "f7 bf bf bf"); # last four byte encoding + is(hexes(0x200000), "f8 88 80 80 80"); +} diff --git a/t/op/ord.t b/t/op/ord.t index 4556664..1c82262 100755 --- a/t/op/ord.t +++ b/t/op/ord.t @@ -6,7 +6,7 @@ BEGIN { require "test.pl"; } -plan tests => 7; +plan tests => 35; # compile time evaluation @@ -33,3 +33,36 @@ is(ord("\x{1234}"), 0x1234, 'compile time ord \x{....}'); $x = "\x{1234}"; is(ord($x), 0x1234, 'runtime ord \x{....}'); +{ + no warnings 'utf8'; # avoid Unicode warnings + +# The following code points are some interesting steps. + is(ord(chr( 0x100)), 0x100, '0x0100'); + is(ord(chr( 0x3FF)), 0x3FF, 'last two-byte char in UTF-EBCDIC'); + is(ord(chr( 0x400)), 0x400, 'first three-byte char in UTF-EBCDIC'); + is(ord(chr( 0x7FF)), 0x7FF, 'last two-byte char in UTF-8'); + is(ord(chr( 0x800)), 0x800, 'first three-byte char in UTF-8'); + is(ord(chr( 0xFFF)), 0xFFF, '0x0FFF'); + is(ord(chr( 0x1000)), 0x1000, '0x1000'); + is(ord(chr( 0x3FFF)), 0x3FFF, 'last three-byte char in UTF-EBCDIC'); + is(ord(chr( 0x4000)), 0x4000, 'first four-byte char in UTF-EBCDIC'); + is(ord(chr( 0xCFFF)), 0xCFFF, '0xCFFF'); + is(ord(chr( 0xD000)), 0xD000, '0xD000'); + is(ord(chr( 0xD7FF)), 0xD7FF, '0xD7FF'); + is(ord(chr( 0xD800)), 0xD800, 'surrogate begin (not strict utf-8)'); + is(ord(chr( 0xDFFF)), 0xDFFF, 'surrogate end (not strict utf-8)'); + is(ord(chr( 0xE000)), 0xE000, '0xE000'); + is(ord(chr( 0xFDD0)), 0xFDD0, 'first additional noncharacter in BMP'); + is(ord(chr( 0xFDEF)), 0xFDEF, 'last additional noncharacter in BMP'); + is(ord(chr( 0xFFFE)), 0xFFFE, '0xFFFE'); + is(ord(chr( 0xFFFF)), 0xFFFF, 'last three-byte char in UTF-8'); + is(ord(chr( 0x10000)), 0x10000, 'first four-byte char in UTF-8'); + is(ord(chr( 0x3FFFF)), 0x3FFFF, 'last four-byte char in UTF-EBCDIC'); + is(ord(chr( 0x40000)), 0x40000, 'first five-byte char in UTF-EBCDIC'); + is(ord(chr( 0xFFFFF)), 0xFFFFF, '0xFFFFF'); + is(ord(chr(0x100000)), 0x100000, '0x100000'); + is(ord(chr(0x10FFFF)), 0x10FFFF, 'Unicode last code point'); + is(ord(chr(0x110000)), 0x110000, '0x110000'); + is(ord(chr(0x1FFFFF)), 0x1FFFFF, 'last four-byte char in UTF-8'); + is(ord(chr(0x200000)), 0x200000, 'first five-byte char in UTF-8'); +} diff --git a/t/uni/chr.t b/t/uni/chr.t new file mode 100644 index 0000000..ab710d9 --- /dev/null +++ b/t/uni/chr.t @@ -0,0 +1,41 @@ + +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + @INC = '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + unless (PerlIO::Layer->find('perlio')){ + print "1..0 # Skip: PerlIO required\n"; + exit 0; + } + if ($ENV{PERL_CORE_MINITEST}) { + print "1..0 # Skip: no dynamic loading on miniperl, no Encode\n"; + exit 0; + } + $| = 1; +} + +use strict; +use Test::More tests => 6; +use Encode; + +use encoding 'johab'; + +ok(chr(0x7f) eq "\x7f"); +ok(chr(0x80) eq "\x80"); +ok(chr(0xff) eq "\xff"); + +for my $i (127, 128, 255) { + ok(chr($i) eq pack('C', $i)); +} + +__END__