From: Yitzchak Scott-Thoennes Date: Mon, 13 Nov 2000 14:49:40 +0000 (-0800) Subject: [ID 20001113.003] utf8_to_uv on malformed utf returns wrong values X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=cc366d4bfec1d12643c0fb8ca45074934bcc46bb;p=p5sagit%2Fp5-mst-13.2.git [ID 20001113.003] utf8_to_uv on malformed utf returns wrong values Message-Id: <200011132249.eADMnek09679@garcia.efn.org> p4raw-id: //depot/perl@7677 --- diff --git a/doop.c b/doop.c index 3d22eb4..a2990ce 100644 --- a/doop.c +++ b/doop.c @@ -968,10 +968,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) switch (optype) { case OP_BIT_AND: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY); + luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY); + ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc & ruc; @@ -983,10 +983,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) break; case OP_BIT_XOR: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY); + luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY); + ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc ^ ruc; @@ -995,10 +995,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) goto mop_up_utf; case OP_BIT_OR: while (lulen && rulen) { - luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANY); + luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANY); + ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV); rc += ulen; rulen -= ulen; duc = luc | ruc; diff --git a/pp.c b/pp.c index 2a414b8..6001165 100644 --- a/pp.c +++ b/pp.c @@ -1486,7 +1486,7 @@ PP(pp_complement) send = tmps + len; while (tmps < send) { - UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); + UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV); tmps += UTF8SKIP(tmps); targlen += UNISKIP(~c); nchar++; @@ -1500,7 +1500,7 @@ PP(pp_complement) if (nwide) { Newz(0, result, targlen + 1, U8); while (tmps < send) { - UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); + UV c = utf8_to_uv(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV); tmps += UTF8SKIP(tmps); result = uv_to_utf8(result, ~c); } diff --git a/toke.c b/toke.c index 458e258..b48577e 100644 --- a/toke.c +++ b/toke.c @@ -1332,7 +1332,7 @@ S_scan_const(pTHX_ char *start) UV uv; uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY); - if (len == 1) { + if (len == (STRLEN)-1) { /* Illegal UTF8 (a high-bit byte), make it valid. */ char *old_pvx = SvPVX(sv); /* need space for one extra char (NOTE: SvCUR() not set here) */ diff --git a/utf8.c b/utf8.c index 6ddf42b..a54726f 100644 --- a/utf8.c +++ b/utf8.c @@ -312,12 +312,12 @@ malformed: if (flags & UTF8_CHECK_ONLY) { if (retlen) - *retlen = len; + *retlen = -1; return 0; } if (retlen) - *retlen = -1; + *retlen = expectlen ? expectlen : len; return UNICODE_REPLACEMENT_CHARACTER; } diff --git a/utf8.h b/utf8.h index dc93e95..269ad3e 100644 --- a/utf8.h +++ b/utf8.h @@ -41,6 +41,8 @@ END_EXTERN_C #define UTF8_ALLOW_BOM 0x0020 #define UTF8_ALLOW_FFFF 0x0040 #define UTF8_ALLOW_LONG 0x0080 +#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF \ + |UTF8_ALLOW_BOM|UTF8_ALLOW_SURROGATE) #define UTF8_ALLOW_ANY 0x00ff #define UTF8_CHECK_ONLY 0x0100