From: Nicholas Clark Date: Thu, 22 Oct 2009 18:39:30 +0000 (+0100) Subject: S_utf16_textfilter() needs to avoid splitting UTF-16 surrogate pairs. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6;p=p5sagit%2Fp5-mst-13.2.git S_utf16_textfilter() needs to avoid splitting UTF-16 surrogate pairs. Easier said than done. --- diff --git a/t/comp/utf.t b/t/comp/utf.t index 00523f9..d59ba2d 100644 --- a/t/comp/utf.t +++ b/t/comp/utf.t @@ -1,6 +1,6 @@ #!./perl -w -print "1..100\n"; +print "1..3980\n"; my $test = 0; my %templates = ( @@ -62,6 +62,8 @@ for my $bom (0, 1) { for my $expect ("N", "\xFF", "\x{100}", "\x{010a}", "\x{0a23}", "\x{10000}", "\x{64321}", "\x{10FFFD}", + "\x{1000a}", # 0xD800 0xDC0A + "\x{12800}", # 0xD80A 0xDC00 ) { # A space so that the UTF-16 heuristic triggers - " '" gives two # characters of ASCII. @@ -69,6 +71,21 @@ for my $bom (0, 1) { my $name = 'chrs ' . join ', ', map {ord $_} split '', $expect; test($enc, $write, $expect, $bom, $nl, $name); } + + # This is designed to try to trip over the end of the buffer, + # with similar results to U-1000A and U-12800 above. + for my $pad (2 .. 162) { + for my $chr ("\x{10000}", "\x{1000a}", "\x{12800}") { + my $padding = ' ' x $pad; + # Need 4 octets that were from 2 ASCII characters to trigger + # the heuristic that detects UTF-16 without a BOM. For + # UTF-16BE, one space and the newline will do, as the + # newline's high octet comes first. But for UTF-16LE, a + # newline is "\n\0", so it doesn't trigger it. + test($enc, " \n$padding'$chr'", $chr, $bom, $nl, + sprintf "'\\x{%x}' with $pad spaces before it", ord $chr); + } + } } } } diff --git a/toke.c b/toke.c index f795707..f105505 100644 --- a/toke.c +++ b/toke.c @@ -12822,13 +12822,32 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen) sv_chop(utf8_buffer, nl); break; } + /* OK, not a complete line there, so need to read some more UTF-16. Read an extra octect if the buffer currently has an odd number. */ + while (1) { + if (status <= 0) + break; + if (SvCUR(utf16_buffer) >= 2) { + /* Location of the high octet of the last complete code point. + Gosh, UTF-16 is a pain. All the benefits of variable length, + *coupled* with all the benefits of partial reads and + endianness. */ + const U8 *const last_hi = (U8*)SvPVX(utf16_buffer) + + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2)); + + if (*last_hi < 0xd8 || *last_hi > 0xdb) { + break; + } + + /* We have the first half of a surrogate. Read more. */ + DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi)); + } - while(SvCUR(utf16_buffer) < 2 && status > 0) { status = FILTER_READ(idx + 1, utf16_buffer, 160 + (SvCUR(utf16_buffer) & 1)); DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer))); + DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);}); if (status < 0) { /* Error */ IoPAGE(filter) = status;