utf16_to_utf8() should croak if the buffer ends without the second surrogate.
Nicholas Clark [Sun, 18 Oct 2009 20:55:52 +0000 (21:55 +0100)]
ext/XS-APItest/t/utf16_to_utf8.t
utf8.c

index 83add20..3f6f798 100644 (file)
@@ -54,3 +54,10 @@ like($@, qr/^panic: utf16_to_utf8_reversed: odd bytelen 1 at/,
      'Odd byte length panics');
 is($got, undef, 'hence eval returns undef');
 is($in, "NA", 'and input unchanged');
+
+$in = "\xD8\0\xDC\0";
+$got = eval {utf16_to_utf8($in, 2)};
+like($@, qr/^Malformed UTF-16 surrogate at/, 'Lone surrogate croaks');
+(ok(!defined $got, 'hence eval returns undef')) or
+    diag(join ', ', map {ord $_} split //, $got);
+
diff --git a/utf8.c b/utf8.c
index 455078d..4a728aa 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -986,11 +986,15 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
            continue;
        }
        if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
-           UV low = (p[0] << 8) + p[1];
-           p += 2;
-           if (low < 0xdc00 || low >= 0xdfff)
+           if (p >= pend) {
                Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
-           uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
+           } else {
+               UV low = (p[0] << 8) + p[1];
+               p += 2;
+               if (low < 0xdc00 || low >= 0xdfff)
+                   Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+               uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
+           }
        }
        if (uv < 0x10000) {
            *d++ = (U8)(( uv >> 12)         | 0xe0);