From: Jarkko Hietaniemi Date: Fri, 5 Jan 2001 00:47:23 +0000 (+0000) Subject: Unify UTF-8 malformedness handling. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a0dbb04553005cdbafdeb7435a79ae500b5aa8ff;p=p5sagit%2Fp5-mst-13.2.git Unify UTF-8 malformedness handling. p4raw-id: //depot/perl@8323 --- diff --git a/doop.c b/doop.c index 868be22..f6dbe67 100644 --- a/doop.c +++ b/doop.c @@ -833,15 +833,15 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv) char *send = s + len; char *start = s; s = send - 1; - while ((*s & 0xc0) == 0x80) - --s; - if (UTF8SKIP(s) != send - s && ckWARN_d(WARN_UTF8)) - Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character"); - sv_setpvn(astr, s, send - s); - *s = '\0'; - SvCUR_set(sv, s - start); - SvNIOK_off(sv); - SvUTF8_on(astr); + while (s > start && UTF8_IS_CONTINUATION(*s)) + s--; + if (utf8_to_uv_simple((U8*)s, 0)) { + sv_setpvn(astr, s, send - s); + *s = '\0'; + SvCUR_set(sv, s - start); + SvNIOK_off(sv); + SvUTF8_on(astr); + } } else sv_setpvn(astr, "", 0); diff --git a/pp.c b/pp.c index 950d85a..1ea2a07 100644 --- a/pp.c +++ b/pp.c @@ -3978,20 +3978,17 @@ PP(pp_reverse) U8* s = (U8*)SvPVX(TARG); U8* send = (U8*)(s + len); while (s < send) { - if (*s < 0x80) { + if (UTF8_IS_ASCII(*s)) { s++; continue; } else { + if (!utf8_to_uv_simple(s, 0)) + break; up = (char*)s; s += UTF8SKIP(s); down = (char*)(s - 1); - if (s > send || !((*down & 0xc0) == 0x80)) { - if (ckWARN_d(WARN_UTF8)) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character"); - break; - } + /* reverse this character */ while (down > up) { tmp = *up; *up++ = *down; diff --git a/sv.c b/sv.c index b999e6c..139d98a 100644 --- a/sv.c +++ b/sv.c @@ -4606,17 +4606,18 @@ Perl_sv_pos_b2u(pTHX_ register SV *sv, I32* offsetp) s = (U8*)SvPV(sv, len); if (len < *offsetp) - Perl_croak(aTHX_ "panic: bad byte offset"); + Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset"); send = s + *offsetp; len = 0; while (s < send) { - s += UTF8SKIP(s); - ++len; - } - if (s != send) { - if (ckWARN_d(WARN_UTF8)) - Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character"); - --len; + STRLEN n; + + if (utf8_to_uv(s, UTF8SKIP(s), &n, 0)) { + s += n; + len++; + } + else + break; } *offsetp = len; return; diff --git a/toke.c b/toke.c index a085c70..018e235 100644 --- a/toke.c +++ b/toke.c @@ -1551,7 +1551,7 @@ S_scan_const(pTHX_ char *start) STRLEN len = (STRLEN) -1; UV uv; if (this_utf8) { - uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY); + uv = utf8_to_uv((U8*)s, send - s, &len, 0); } if (len == (STRLEN)-1) { /* Illegal UTF8 (a high-bit byte), make it valid. */ diff --git a/utf8.c b/utf8.c index d1f1d66..83e91fc 100644 --- a/utf8.c +++ b/utf8.c @@ -213,11 +213,24 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) bool dowarn = ckWARN_d(WARN_UTF8); #endif STRLEN expectlen = 0; - - if (curlen == 0) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (an empty string)"); + U32 warning = 0; + +/* This list is a superset of the UTF8_ALLOW_XXX. */ + +#define UTF8_WARN_EMPTY 1 +#define UTF8_WARN_CONTINUATION 2 +#define UTF8_WARN_NON_CONTINUATION 3 +#define UTF8_WARN_FE_FF 4 +#define UTF8_WARN_SHORT 5 +#define UTF8_WARN_OVERFLOW 6 +#define UTF8_WARN_SURROGATE 7 +#define UTF8_WARN_BOM 8 +#define UTF8_WARN_LONG 9 +#define UTF8_WARN_FFFF 10 + + if (curlen == 0 && + !(flags & UTF8_ALLOW_EMPTY)) { + warning = UTF8_WARN_EMPTY; goto malformed; } @@ -229,28 +242,19 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) if (UTF8_IS_CONTINUATION(uv) && !(flags & UTF8_ALLOW_CONTINUATION)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected continuation byte 0x%02"UVxf")", - uv); + warning = UTF8_WARN_CONTINUATION; goto malformed; } if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) && !(flags & UTF8_ALLOW_NON_CONTINUATION)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")", - (UV)s[1], uv); + warning = UTF8_WARN_NON_CONTINUATION; goto malformed; } if ((uv == 0xfe || uv == 0xff) && !(flags & UTF8_ALLOW_FE_FF)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (byte 0x%02"UVxf")", - uv); + warning = UTF8_WARN_FE_FF; goto malformed; } @@ -269,10 +273,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) if ((curlen < expectlen) && !(flags & UTF8_ALLOW_SHORT)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (%d byte%s, need %d)", - curlen, curlen == 1 ? "" : "s", expectlen); + warning = UTF8_WARN_SHORT; goto malformed; } @@ -283,21 +284,25 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) while (len--) { if (!UTF8_IS_CONTINUATION(*s) && !(flags & UTF8_ALLOW_NON_CONTINUATION)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)", - *s); + s--; + warning = UTF8_WARN_NON_CONTINUATION; goto malformed; } else uv = UTF8_ACCUMULATE(uv, *s); - if (uv < ouv) { - /* This cannot be allowed. */ - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)", - ouv, *s); - goto malformed; + if (!(uv > ouv)) { + /* These cannot be allowed. */ + if (uv == ouv) { + if (!(flags & UTF8_ALLOW_LONG)) { + warning = UTF8_WARN_LONG; + goto malformed; + } + } + else { /* uv < ouv */ + /* This cannot be allowed. */ + warning = UTF8_WARN_OVERFLOW; + goto malformed; + } } s++; ouv = uv; @@ -305,31 +310,19 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) if (UNICODE_IS_SURROGATE(uv) && !(flags & UTF8_ALLOW_SURROGATE)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")", - uv); + warning = UTF8_WARN_SURROGATE; goto malformed; } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) && !(flags & UTF8_ALLOW_BOM)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (byte order mark 0x%04"UVxf")", - uv); + warning = UTF8_WARN_BOM; goto malformed; } else if ((expectlen > UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (%d byte%s, need %d)", - expectlen, expectlen == 1 ? "": "s", UNISKIP(uv)); + warning = UTF8_WARN_LONG; goto malformed; } else if (UNICODE_IS_ILLEGAL(uv) && !(flags & UTF8_ALLOW_FFFF)) { - if (dowarn) - Perl_warner(aTHX_ WARN_UTF8, - "Malformed UTF-8 character (character 0x%04"UVxf")", - uv); + warning = UTF8_WARN_FFFF; goto malformed; } @@ -343,6 +336,61 @@ malformed: return 0; } + if (dowarn) { + SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0)); + + switch (warning) { + case 0: /* Intentionally empty. */ break; + case UTF8_WARN_EMPTY: + Perl_sv_catpvf(aTHX_ sv, "(empty string)"); + break; + case UTF8_WARN_CONTINUATION: + Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv); + break; + case UTF8_WARN_NON_CONTINUATION: + Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")", + (UV)s[1], uv); + break; + case UTF8_WARN_FE_FF: + Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv); + break; + case UTF8_WARN_SHORT: + Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)", + curlen, curlen == 1 ? "" : "s", expectlen); + break; + case UTF8_WARN_OVERFLOW: + Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)", + ouv, *s); + break; + case UTF8_WARN_SURROGATE: + Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); + break; + case UTF8_WARN_BOM: + Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv); + break; + case UTF8_WARN_LONG: + Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)", + expectlen, expectlen == 1 ? "": "s", UNISKIP(uv)); + break; + case UTF8_WARN_FFFF: + Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv); + break; + default: + Perl_sv_catpvf(aTHX_ sv, "(unknown reason)"); + break; + } + + if (warning) { + char *s = SvPVX(sv); + + if (PL_op) + Perl_warner(aTHX_ WARN_UTF8, + "%s in %s", s, PL_op_desc[PL_op->op_type]); + else + Perl_warner(aTHX_ WARN_UTF8, "%s", s); + } + } + if (retlen) *retlen = expectlen ? expectlen : len; diff --git a/utf8.h b/utf8.h index 9061ad5..28aa057 100644 --- a/utf8.h +++ b/utf8.h @@ -33,16 +33,18 @@ END_EXTERN_C #define IN_BYTE (PL_curcop->op_private & HINT_BYTE) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE) -#define UTF8_ALLOW_CONTINUATION 0x0001 -#define UTF8_ALLOW_NON_CONTINUATION 0x0002 -#define UTF8_ALLOW_FE_FF 0x0004 -#define UTF8_ALLOW_SHORT 0x0008 -#define UTF8_ALLOW_SURROGATE 0x0010 -#define UTF8_ALLOW_BOM 0x0020 -#define UTF8_ALLOW_FFFF 0x0040 -#define UTF8_ALLOW_LONG 0x0080 -#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF \ - |UTF8_ALLOW_BOM|UTF8_ALLOW_SURROGATE) +#define UTF8_ALLOW_EMPTY 0x0001 +#define UTF8_ALLOW_CONTINUATION 0x0002 +#define UTF8_ALLOW_NON_CONTINUATION 0x0004 +#define UTF8_ALLOW_FE_FF 0x0008 +#define UTF8_ALLOW_SHORT 0x0010 +#define UTF8_ALLOW_SURROGATE 0x0020 +#define UTF8_ALLOW_BOM 0x0040 +#define UTF8_ALLOW_FFFF 0x0080 +#define UTF8_ALLOW_LONG 0x0100 +#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\ + UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\ + UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG) #define UTF8_ALLOW_ANY 0x00ff #define UTF8_CHECK_ONLY 0x0100