From: Jarkko Hietaniemi Date: Tue, 24 Oct 2000 02:55:33 +0000 (+0000) Subject: Make the UTF-8 decoding stricter and more verbose when X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=ba210ebec161cde003bc967e8e460c72f71fb70c;p=p5sagit%2Fp5-mst-13.2.git Make the UTF-8 decoding stricter and more verbose when malformation happens. This involved adding an argument to utf8_to_uv_chk(), which involved changing its prototype, and prefer STRLEN over I32 for the UTF-8 length, which as a domino effect necessitated changing the prototypes of scan_bin(), scan_oct(), scan_hex(), and reg_uni(). The stricter UTF-8 decoding checking uses Markus Kuhn's UTF-8 Decode Stress Tester from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt p4raw-id: //depot/perl@7416 --- diff --git a/doop.c b/doop.c index b75ffaa..3cd8f07 100644 --- a/doop.c +++ b/doop.c @@ -72,12 +72,12 @@ S_do_trans_simple(pTHX_ SV *sv) Newz(0, d, len*2+1, U8); dstart = d; while (s < send) { - I32 ulen; + STRLEN ulen; short c; ulen = 1; /* Need to check this, otherwise 128..255 won't match */ - c = utf8_to_uv_chk(s, &ulen, 0); + c = utf8_to_uv_chk(s, send - s, &ulen, 0); if (c < 0x100 && (ch = tbl[(short)c]) >= 0) { matches++; if (ch < 0x80) @@ -122,10 +122,10 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ s += UTF8SKIP(s); else { UV c; - I32 ulen; + STRLEN ulen; ulen = 1; if (hasutf) - c = utf8_to_uv_chk(s,&ulen, 0); + c = utf8_to_uv_chk(s, send - s, &ulen, 0); else c = *s; if (c < 0x100 && tbl[c] >= 0) @@ -363,8 +363,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ continue; } else if (uv == none) { /* "none" is unmapped character */ - I32 ulen; - *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0); + STRLEN ulen; + *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0); s += ulen; puv = 0xfeedface; continue; @@ -404,8 +404,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ continue; } else if (uv == none) { /* "none" is unmapped character */ - I32 ulen; - *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0); + STRLEN ulen; + *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0); s += ulen; continue; } @@ -964,15 +964,15 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) char *dcsave = dc; STRLEN lulen = leftlen; STRLEN rulen = rightlen; - I32 ulen; + STRLEN ulen; switch (optype) { case OP_BIT_AND: while (lulen && rulen) { - luc = utf8_to_uv_chk((U8*)lc, &ulen, 0); + luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0); + ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc & ruc; @@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) break; case OP_BIT_XOR: while (lulen && rulen) { - luc = utf8_to_uv_chk((U8*)lc, &ulen, 0); + luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0); + ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc ^ ruc; @@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) goto mop_up_utf; case OP_BIT_OR: while (lulen && rulen) { - luc = utf8_to_uv_chk((U8*)lc, &ulen, 0); + luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0); lc += ulen; lulen -= ulen; - ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0); + ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0); rc += ulen; rulen -= ulen; duc = luc | ruc; diff --git a/embed.h b/embed.h index b4c8f6a..eab037f 100644 --- a/embed.h +++ b/embed.h @@ -2190,7 +2190,7 @@ #define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) #define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) #define utf8_to_uv(a,b) Perl_utf8_to_uv(aTHX_ a,b) -#define utf8_to_uv_chk(a,b,c) Perl_utf8_to_uv_chk(aTHX_ a,b,c) +#define utf8_to_uv_chk(a,b,c,d) Perl_utf8_to_uv_chk(aTHX_ a,b,c,d) #define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b) #define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a) #define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b) diff --git a/embed.pl b/embed.pl index f685042..6adb275 100755 --- a/embed.pl +++ b/embed.pl @@ -1941,10 +1941,10 @@ p |OP* |scalar |OP* o p |OP* |scalarkids |OP* o p |OP* |scalarseq |OP* o p |OP* |scalarvoid |OP* o -Ap |NV |scan_bin |char* start|I32 len|I32* retlen -Ap |NV |scan_hex |char* start|I32 len|I32* retlen +Ap |NV |scan_bin |char* start|STRLEN len|STRLEN* retlen +Ap |NV |scan_hex |char* start|STRLEN len|STRLEN* retlen Ap |char* |scan_num |char* s|YYSTYPE *lvalp -Ap |NV |scan_oct |char* start|I32 len|I32* retlen +Ap |NV |scan_oct |char* start|STRLEN len|STRLEN* retlen p |OP* |scope |OP* o Ap |char* |screaminstr |SV* bigsv|SV* littlesv|I32 start_shift \ |I32 end_shift|I32 *state|I32 last @@ -2074,8 +2074,8 @@ Ap |I32 |utf8_distance |U8 *a|U8 *b Ap |U8* |utf8_hop |U8 *s|I32 off ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len -Ap |UV |utf8_to_uv |U8 *s|I32* retlen -Ap |UV |utf8_to_uv_chk |U8 *s|I32* retlen|bool checking +Ap |UV |utf8_to_uv |U8 *s|STRLEN* retlen +Ap |UV |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|bool checking Ap |U8* |uv_to_utf8 |U8 *d|UV uv p |void |vivify_defelem |SV* sv p |void |vivify_ref |SV* sv|U32 to_what @@ -2358,7 +2358,7 @@ s |regnode*|reg |I32|I32 * s |regnode*|reganode |U8|U32 s |regnode*|regatom |I32 * s |regnode*|regbranch |I32 *|I32 -s |void |reguni |UV|char *|I32* +s |void |reguni |UV|char *|STRLEN* s |regnode*|regclass s |regnode*|regclassutf8 s |I32 |regcurly |char * diff --git a/handy.h b/handy.h index f0e39af..7341012 100644 --- a/handy.h +++ b/handy.h @@ -448,21 +448,23 @@ Converts the specified character to lowercase. #define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f') #define isBLANK_utf8(c) isBLANK(c) /* could be wrong */ -#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv_chk(p, 0, 0)) -#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0)) +#define STRLEN_MAX ((STRLEN)-1) + +#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) +#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0)) #define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f') #define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */ diff --git a/op.c b/op.c index 6ef4bfe..9e256a3 100644 --- a/op.c +++ b/op.c @@ -2621,7 +2621,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) SV* transv = 0; U8* tend = t + tlen; U8* rend = r + rlen; - I32 ulen; + STRLEN ulen; U32 tfirst = 1; U32 tlast = 0; I32 tdiff; @@ -2641,6 +2641,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) if (complement) { U8 tmpbuf[UTF8_MAXLEN]; U8** cp; + I32* cl; UV nextmin = 0; New(1109, cp, tlen, U8*); i = 0; @@ -2656,7 +2657,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) qsort(cp, i, sizeof(U8*), utf8compare); for (j = 0; j < i; j++) { U8 *s = cp[j]; - UV val = utf8_to_uv_chk(s, &ulen, 0); + I32 cur = j < i ? cp[j+1] - s : tend - s; + UV val = utf8_to_uv_chk(s, cur, &ulen, 0); s += ulen; diff = val - nextmin; if (diff > 0) { @@ -2669,7 +2671,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) } } if (*s == 0xff) - val = utf8_to_uv_chk(s+1, &ulen, 0); + val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0); if (val >= nextmin) nextmin = val + 1; } @@ -2696,10 +2698,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) while (t < tend || tfirst <= tlast) { /* see if we need more "t" chars */ if (tfirst > tlast) { - tfirst = (I32)utf8_to_uv_chk(t, &ulen, 0); + tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0); t += ulen; if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */ - tlast = (I32)utf8_to_uv_chk(++t, &ulen, 0); + t++; + tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0); t += ulen; } else @@ -2709,10 +2712,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) /* now see if we need more "r" chars */ if (rfirst > rlast) { if (r < rend) { - rfirst = (I32)utf8_to_uv_chk(r, &ulen, 0); + rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0); r += ulen; if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */ - rlast = (I32)utf8_to_uv_chk(++r, &ulen, 0); + r++; + rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0); r += ulen; } else diff --git a/perl.c b/perl.c index cb2cb14..3d874ca 100644 --- a/perl.c +++ b/perl.c @@ -2025,7 +2025,7 @@ NULL char * Perl_moreswitches(pTHX_ char *s) { - I32 numlen; + STRLEN numlen; U32 rschar; switch (*s) { diff --git a/perlapi.c b/perlapi.c index 3cfe4e0..1f1343d 100644 --- a/perlapi.c +++ b/perlapi.c @@ -2638,14 +2638,14 @@ Perl_save_threadsv(pTHXo_ PADOFFSET i) #undef Perl_scan_bin NV -Perl_scan_bin(pTHXo_ char* start, I32 len, I32* retlen) +Perl_scan_bin(pTHXo_ char* start, STRLEN len, STRLEN* retlen) { return ((CPerlObj*)pPerl)->Perl_scan_bin(start, len, retlen); } #undef Perl_scan_hex NV -Perl_scan_hex(pTHXo_ char* start, I32 len, I32* retlen) +Perl_scan_hex(pTHXo_ char* start, STRLEN len, STRLEN* retlen) { return ((CPerlObj*)pPerl)->Perl_scan_hex(start, len, retlen); } @@ -2659,7 +2659,7 @@ Perl_scan_num(pTHXo_ char* s, YYSTYPE *lvalp) #undef Perl_scan_oct NV -Perl_scan_oct(pTHXo_ char* start, I32 len, I32* retlen) +Perl_scan_oct(pTHXo_ char* start, STRLEN len, STRLEN* retlen) { return ((CPerlObj*)pPerl)->Perl_scan_oct(start, len, retlen); } @@ -3380,16 +3380,16 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len) #undef Perl_utf8_to_uv UV -Perl_utf8_to_uv(pTHXo_ U8 *s, I32* retlen) +Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen) { return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen); } #undef Perl_utf8_to_uv_chk UV -Perl_utf8_to_uv_chk(pTHXo_ U8 *s, I32* retlen, bool checking) +Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking) { - return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, retlen, checking); + return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking); } #undef Perl_uv_to_utf8 diff --git a/pod/perlapi.pod b/pod/perlapi.pod index a5178e8..730d89f 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -3225,7 +3225,7 @@ advanced to the end of the character. If C does not point to a well-formed UTF8 character, an optional UTF8 warning is produced. - U8* s utf8_to_uv(I32 *retlen) + U8* s utf8_to_uv(STRLEN *retlen) =for hackers Found in file utf8.c @@ -3233,9 +3233,9 @@ Found in file utf8.c =item utf8_to_uv_chk Returns the character value of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the -length, in bytes, of that character, and the pointer C will be -advanced to the end of the character. +which is assumed to be in UTF8 encoding and no longer than C; +C will be set to the length, in bytes, of that character, +and the pointer C will be advanced to the end of the character. If C does not point to a well-formed UTF8 character, the behaviour is dependent on the value of C: if this is true, it is @@ -3243,7 +3243,7 @@ assumed that the caller will raise a warning, and this function will set C to C<-1> and return. If C is not true, an optional UTF8 warning is produced. - U8* s utf8_to_uv_chk(I32 *retlen, I32 checking) + U8* s utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking) =for hackers Found in file utf8.c diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 480ab84..139bab9 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1789,6 +1789,10 @@ a builtin library search path, prefix2 is substituted. The error may appear if components are not found, or are too long. See "PERLLIB_PREFIX" in L. +=item Malformed UTF-8 character (%s) + +Perl detected something that didn't comply with UTF-8 encoding rules. + =item Malformed UTF-16 surrogate Perl thought it was reading UTF-16 encoded character data but while diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index c9954d8..145c953 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -71,6 +71,11 @@ on Windows. Regardless of the above, the C pragma can always be used to force byte semantics in a particular lexical scope. See L. +One effect of the C pragma is that the internal UTF-8 decoding +becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF +0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they +appear in the data. + The C pragma is primarily a compatibility device that enables recognition of UTF-8 in literals encountered by the parser. It may also be used for enabling some of the more experimental Unicode support features. diff --git a/pp.c b/pp.c index 98d31cb..35f5956 100644 --- a/pp.c +++ b/pp.c @@ -1480,7 +1480,7 @@ PP(pp_complement) STRLEN targlen = 0; U8 *result; U8 *send; - I32 l; + STRLEN l; send = tmps + len; while (tmps < send) { @@ -1944,7 +1944,7 @@ PP(pp_hex) { djSP; dTARGET; char *tmps; - I32 argtype; + STRLEN argtype; STRLEN n_a; tmps = POPpx; @@ -1957,7 +1957,7 @@ PP(pp_oct) { djSP; dTARGET; NV value; - I32 argtype; + STRLEN argtype; char *tmps; STRLEN n_a; @@ -2234,13 +2234,13 @@ PP(pp_ord) { djSP; dTARGET; UV value; - STRLEN n_a; SV *tmpsv = POPs; - U8 *tmps = (U8*)SvPVx(tmpsv,n_a); - I32 retlen; + STRLEN len; + U8 *tmps = (U8*)SvPVx(tmpsv, len); + STRLEN retlen; if ((*tmps & 0x80) && DO_UTF8(tmpsv)) - value = utf8_to_uv_chk(tmps, &retlen, 0); + value = utf8_to_uv_chk(tmps, len, &retlen, 0); else value = (UV)(*tmps & 255); XPUSHu(value); @@ -2304,10 +2304,10 @@ PP(pp_ucfirst) STRLEN slen; if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) { - I32 ulen; + STRLEN ulen; U8 tmpbuf[UTF8_MAXLEN]; U8 *tend; - UV uv = utf8_to_uv_chk(s, &ulen, 0); + UV uv = utf8_to_uv_chk(s, slen, &ulen, 0); if (PL_op->op_private & OPpLOCALE) { TAINT; @@ -2363,10 +2363,10 @@ PP(pp_lcfirst) STRLEN slen; if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) { - I32 ulen; + STRLEN ulen; U8 tmpbuf[UTF8_MAXLEN]; U8 *tend; - UV uv = utf8_to_uv_chk(s, &ulen, 0); + UV uv = utf8_to_uv_chk(s, slen, &ulen, 0); if (PL_op->op_private & OPpLOCALE) { TAINT; @@ -2423,7 +2423,7 @@ PP(pp_uc) if (DO_UTF8(sv)) { dTARGET; - I32 ulen; + STRLEN ulen; register U8 *d; U8 *send; @@ -2443,7 +2443,7 @@ PP(pp_uc) TAINT; SvTAINTED_on(TARG); while (s < send) { - d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, &ulen, 0))); + d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0))); s += ulen; } } @@ -2497,7 +2497,7 @@ PP(pp_lc) if (DO_UTF8(sv)) { dTARGET; - I32 ulen; + STRLEN ulen; register U8 *d; U8 *send; @@ -2517,7 +2517,7 @@ PP(pp_lc) TAINT; SvTAINTED_on(TARG); while (s < send) { - d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, &ulen, 0))); + d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0))); s += ulen; } } @@ -3363,7 +3363,7 @@ PP(pp_unpack) /* These must not be in registers: */ I16 ashort; int aint; - I32 along; + STRLEN along; #ifdef HAS_QUAD Quad_t aquad; #endif @@ -3659,7 +3659,7 @@ PP(pp_unpack) len = strend - s; if (checksum) { while (len-- > 0 && s < strend) { - auint = utf8_to_uv_chk((U8*)s, &along, 0); + auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0); s += along; if (checksum > 32) cdouble += (NV)auint; @@ -3671,7 +3671,7 @@ PP(pp_unpack) EXTEND(SP, len); EXTEND_MORTAL(len); while (len-- > 0 && s < strend) { - auint = utf8_to_uv_chk((U8*)s, &along, 0); + auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0); s += along; sv = NEWSV(37, 0); sv_setuv(sv, (UV)auint); diff --git a/pp_ctl.c b/pp_ctl.c index cf2000e..33f91ee 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -2971,17 +2971,17 @@ PP(pp_require) if (SvNIOKp(sv)) { if (SvPOK(sv) && SvNOK(sv)) { /* require v5.6.1 */ UV rev = 0, ver = 0, sver = 0; - I32 len; + STRLEN len; U8 *s = (U8*)SvPVX(sv); U8 *end = (U8*)SvPVX(sv) + SvCUR(sv); if (s < end) { - rev = utf8_to_uv_chk(s, &len, 0); + rev = utf8_to_uv_chk(s, end - s, &len, 0); s += len; if (s < end) { - ver = utf8_to_uv_chk(s, &len, 0); + ver = utf8_to_uv_chk(s, end - s, &len, 0); s += len; if (s < end) - sver = utf8_to_uv_chk(s, &len, 0); + sver = utf8_to_uv_chk(s, end - s, &len, 0); } } if (PERL_REVISION < rev diff --git a/proto.h b/proto.h index 2713916..7624255 100644 --- a/proto.h +++ b/proto.h @@ -677,10 +677,10 @@ PERL_CALLCONV OP* Perl_scalar(pTHX_ OP* o); PERL_CALLCONV OP* Perl_scalarkids(pTHX_ OP* o); PERL_CALLCONV OP* Perl_scalarseq(pTHX_ OP* o); PERL_CALLCONV OP* Perl_scalarvoid(pTHX_ OP* o); -PERL_CALLCONV NV Perl_scan_bin(pTHX_ char* start, I32 len, I32* retlen); -PERL_CALLCONV NV Perl_scan_hex(pTHX_ char* start, I32 len, I32* retlen); +PERL_CALLCONV NV Perl_scan_bin(pTHX_ char* start, STRLEN len, STRLEN* retlen); +PERL_CALLCONV NV Perl_scan_hex(pTHX_ char* start, STRLEN len, STRLEN* retlen); PERL_CALLCONV char* Perl_scan_num(pTHX_ char* s, YYSTYPE *lvalp); -PERL_CALLCONV NV Perl_scan_oct(pTHX_ char* start, I32 len, I32* retlen); +PERL_CALLCONV NV Perl_scan_oct(pTHX_ char* start, STRLEN len, STRLEN* retlen); PERL_CALLCONV OP* Perl_scope(pTHX_ OP* o); PERL_CALLCONV char* Perl_screaminstr(pTHX_ SV* bigsv, SV* littlesv, I32 start_shift, I32 end_shift, I32 *state, I32 last); #if !defined(VMS) @@ -809,8 +809,8 @@ PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b); PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off); PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len); PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len); -PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen); -PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, I32* retlen, bool checking); +PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen); +PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking); PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv); PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv); PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what); @@ -1103,7 +1103,7 @@ STATIC regnode* S_reg(pTHX_ I32, I32 *); STATIC regnode* S_reganode(pTHX_ U8, U32); STATIC regnode* S_regatom(pTHX_ I32 *); STATIC regnode* S_regbranch(pTHX_ I32 *, I32); -STATIC void S_reguni(pTHX_ UV, char *, I32*); +STATIC void S_reguni(pTHX_ UV, char *, STRLEN*); STATIC regnode* S_regclass(pTHX); STATIC regnode* S_regclassutf8(pTHX); STATIC I32 S_regcurly(pTHX_ char *); diff --git a/regcomp.c b/regcomp.c index e7042ea..3f2b10c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2742,11 +2742,11 @@ tryagain: /* FALL THROUGH */ default: { - register I32 len; + register STRLEN len; register UV ender; register char *p; char *oldp, *s; - I32 numlen; + STRLEN numlen; PL_regcomp_parse++; @@ -2884,7 +2884,8 @@ tryagain: default: normal_default: if ((*p & 0xc0) == 0xc0 && UTF) { - ender = utf8_to_uv_chk((U8*)p, &numlen, 0); + ender = utf8_to_uv_chk((U8*)p, PL_regxend - p, + &numlen, 0); p += numlen; } else @@ -3128,7 +3129,7 @@ S_regclass(pTHX) register I32 lastvalue = OOB_CHAR8; register I32 range = 0; register regnode *ret; - I32 numlen; + STRLEN numlen; I32 namedclass; char *rangebegin; bool need_class = 0; @@ -3606,7 +3607,7 @@ S_regclassutf8(pTHX) register U32 lastvalue = OOB_UTF8; register I32 range = 0; register regnode *ret; - I32 numlen; + STRLEN numlen; I32 n; SV *listsv; U8 flags = 0; @@ -3638,12 +3639,16 @@ S_regclassutf8(pTHX) namedclass = OOB_NAMEDCLASS; if (!range) rangebegin = PL_regcomp_parse; - value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0); + value = utf8_to_uv_chk((U8*)PL_regcomp_parse, + PL_regxend - PL_regcomp_parse, + &numlen, 0); PL_regcomp_parse += numlen; if (value == '[') namedclass = regpposixcc(value); else if (value == '\\') { - value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0); + value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, + PL_regxend - PL_regcomp_parse, + &numlen, 0); PL_regcomp_parse += numlen; /* Some compilers cannot handle switching on 64-bit integer * values, therefore value cannot be an UV. Yes, this will @@ -3937,7 +3942,7 @@ S_reganode(pTHX_ U8 op, U32 arg) - reguni - emit (if appropriate) a Unicode character */ STATIC void -S_reguni(pTHX_ UV uv, char* s, I32* lenp) +S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp) { dTHR; if (SIZE_ONLY) { diff --git a/regexec.c b/regexec.c index 6e046f3..350f432 100644 --- a/regexec.c +++ b/regexec.c @@ -917,7 +917,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUNDUTF8: - tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n'; + tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), + strend - s, + 0, 0) : '\n'; tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == BOUNDUTF8 ? @@ -953,7 +955,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUNDUTF8: - tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n'; + tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), + strend - s, + 0, 0) : '\n'; tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); while (s < strend) { if (tmp == !(OP(c) == NBOUNDUTF8 ? @@ -1998,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog) while (s < e) { if (l >= PL_regeol) sayNO; - if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ? + if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l))) { @@ -2136,7 +2140,8 @@ S_regmatch(pTHX_ regnode *prog) case NBOUNDUTF8: /* was last char in word? */ ln = (locinput != PL_regbol) - ? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev; + ? utf8_to_uv_chk(reghop((U8*)locinput, -1), + PL_regeol - locinput, 0, 0) : PL_regprev; if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { ln = isALNUM_uni(ln); n = swash_fetch(PL_utf8_alnum, (U8*)locinput); diff --git a/sv.c b/sv.c index 1fac162..2790cfd 100644 --- a/sv.c +++ b/sv.c @@ -6358,13 +6358,13 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV case 'd': case 'i': if (vectorize) { - I32 ulen; + STRLEN ulen; if (!veclen) { vectorize = FALSE; break; } if (utf) - iv = (IV)utf8_to_uv_chk(vecstr, &ulen, 0); + iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0); else { iv = *vecstr; ulen = 1; @@ -6440,14 +6440,14 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV uns_integer: if (vectorize) { - I32 ulen; + STRLEN ulen; vector: if (!veclen) { vectorize = FALSE; break; } if (utf) - uv = utf8_to_uv_chk(vecstr, &ulen, 0); + uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0); else { uv = *vecstr; ulen = 1; diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index 7224a74..e61baad 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -10,7 +10,7 @@ BEGIN { } } -print "1..103\n"; +print "1..181\n"; my $test = 1; @@ -559,3 +559,170 @@ sub nok_bytes { print "ok $test\n"; $test++; } + +# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester, +# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt, +# version dated 2000-09-02. + +my @MK = split(/\n/, <<__EOMK__); +1 Correct UTF-8 +1.1.1 y "κόσμε" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5 +2 Boundary conditions +2.1 First possible sequence of certain length +2.1.1 y "" 0 1 00 1 +2.1.2 y "€" 80 2 c2:80 1 +2.1.3 y "ࠀ" 800 3 e0:a0:80 1 +2.1.4 y "𐀀" 10000 4 f0:90:80:80 1 +2.1.5 y "øˆ€€€" 200000 5 f8:88:80:80:80 1 +2.1.6 y "ü„€€€€" 4000000 6 fc:84:80:80:80:80 1 +2.2 Last possible sequence of certain length +2.2.1 y "" 7f 1 7f 1 +2.2.2 y "ß¿" 7ff 2 df:bf 1 +# The ffff is legal unless under use utf8 +2.2.3 y "ï¿¿" ffff 3 ef:bf:bf 1 +2.2.4 y "÷¿¿¿" 1fffff 4 f7:bf:bf:bf 1 +2.2.5 y "û¿¿¿¿" 3ffffff 5 fb:bf:bf:bf:bf 1 +2.2.6 y "ý¿¿¿¿¿" 7fffffff 6 fd:bf:bf:bf:bf:bf 1 +2.3 Other boundary conditions +2.3.1 y "퟿" d7ff 3 ed:9f:bf 1 +2.3.2 y "" e000 3 ee:80:80 1 +2.3.3 y "�" fffd 3 ef:bf:bd 1 +2.3.4 y "ô¿¿" 10ffff 4 f4:8f:bf:bf 1 +2.3.5 y "ô€€" 110000 4 f4:90:80:80 1 +3 Malformed sequences +3.1 Unexpected continuation bytes +3.1.1 n "€" - 1 80 +3.1.2 n "¿" - 1 bf +3.1.3 n "€¿" - 2 80:bf +3.1.4 n "€¿€" - 3 80:bf:80 +3.1.5 n "€¿€¿" - 4 80:bf:80:bf +3.1.6 n "€¿€¿€" - 5 80:bf:80:bf:80 +3.1.7 n "€¿€¿€¿" - 6 80:bf:80:bf:80:bf +3.1.8 n "€¿€¿€¿€" - 7 80:bf:80:bf:80:bf:80 +3.1.9 n "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf +3.2 Lonely start characters +3.2.1 n "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 +3.2.2 n "à á â ã ä å æ ç è é ê ë ì í î ï " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 +3.2.3 n "ð ñ ò ó ô õ ö ÷ " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 +3.2.4 n "ø ù ú û " - 8 f8:20:f9:20:fa:20:fb:20 +3.2.5 n "ü ý " - 4 fc:20:fd:20 +3.3 Sequences with last continuation byte missing +3.3.1 n "À" - 1 c0 +3.3.2 n "à€" - 2 e0:80 +3.3.3 n "ð€€" - 3 f0:80:80 +3.3.4 n "ø€€€" - 4 f8:80:80:80 +3.3.5 n "ü€€€€" - 5 fc:80:80:80:80 +3.3.6 n "ß" - 1 df +3.3.7 n "ï¿" - 2 ef:bf +3.3.8 n "÷¿¿" - 3 f7:bf:bf +3.3.9 n "û¿¿¿" - 4 fb:bf:bf:bf +3.3.10 n "ý¿¿¿¿" - 5 fd:bf:bf:bf:bf +3.4 Concatenation of incomplete sequences +3.4.1 n "Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf +3.5 Impossible bytes +3.5.1 n "þ" - 1 fe +3.5.2 n "ÿ" - 1 ff +3.5.3 n "þþÿÿ" - 4 fe:fe:ff:ff +4 Overlong sequences +4.1 Examples of an overlong ASCII character +4.1.1 n "À¯" - 2 c0:af +4.1.2 n "à€¯" - 3 e0:80:af +4.1.3 n "ð€€¯" - 4 f0:80:80:af +4.1.4 n "ø€€€¯" - 5 f8:80:80:80:af +4.1.5 n "ü€€€€¯" - 6 fc:80:80:80:80:af +4.2 Maximum overlong sequences +4.2.1 n "Á¿" - 2 c1:bf +4.2.2 n "àŸ¿" - 3 e0:9f:bf +4.2.3 n "ð¿¿" - 4 f0:8f:bf:bf +4.2.4 n "ø‡¿¿¿" - 5 f8:87:bf:bf:bf +4.2.5 n "üƒ¿¿¿¿" - 6 fc:83:bf:bf:bf:bf +4.3 Overlong representation of the NUL character +4.3.1 n "À€" - 2 c0:80 +4.3.2 n "à€€" - 3 e0:80:80 +4.3.3 n "ð€€€" - 4 f0:80:80:80 +4.3.4 n "ø€€€€" - 5 f8:80:80:80:80 +4.3.5 n "ü€€€€€" - 6 fc:80:80:80:80:80 +5 Illegal code positions +5.1 Single UTF-16 surrogates +5.1.1 n "í €" - 3 ed:a0:80 +5.1.2 n "í­¿" - 3 ed:ad:bf +5.1.3 n "í®€" - 3 ed:ae:80 +5.1.4 n "í¯¿" - 3 ed:af:bf +5.1.5 n "í°€" - 3 ed:b0:80 +5.1.6 n "í¾€" - 3 ed:be:80 +5.1.7 n "í¿¿" - 3 ed:bf:bf +5.2 Paired UTF-16 surrogates +5.2.1 n "𐀀" - 6 ed:a0:80:ed:b0:80 +5.2.2 n "𐏿" - 6 ed:a0:80:ed:bf:bf +5.2.3 n "󯰀" - 6 ed:ad:bf:ed:b0:80 +5.2.4 n "í­¿í¿¿" - 6 ed:ad:bf:ed:bf:bf +5.2.5 n "󰀀" - 6 ed:ae:80:ed:b0:80 +5.2.6 n "󰏿" - 6 ed:ae:80:ed:bf:bf +5.2.7 n "􏰀" - 6 ed:af:bf:ed:b0:80 +5.2.8 n "􏿿" - 6 ed:af:bf:ed:bf:bf +5.3 Other illegal code positions +5.3.1 n "￾" - 3 ef:bf:be +# The ffff is legal unless under use utf8 +5.3.2 y "ï¿¿" - 3 ef:bf:bf +__EOMK__ + +# 104..181 +{ + my $WARN; + my $id; + + local $SIG{__WARN__} = + sub { + # print "# $id: @_"; + $WARN++; + }; + + sub moan { + print "$id: @_"; + } + + sub test_unpack_U { + $WARN = 0; + unpack('U*', $_[0]); + } + + for (@MK) { + if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) { + # print "# $_\n"; + } elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+(\d+))?$/) { + $id = $1; + my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen) = + ($2, $3, $4, $5, $6, $7); + my @hex = split(/:/, $hex); + unless (@hex == $byteslen) { + my $nhex = @hex; + moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n"; + } + { + use bytes; + my $bytesbyteslen = length($bytes); + unless ($bytesbyteslen == $byteslen) { + moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n"; + } + } + if ($okay eq 'y') { + test_unpack_U($bytes); + unless ($WARN == 0) { + moan "unpack('U*') false negative\n"; + print "not "; + } + } elsif ($okay eq 'n') { + test_unpack_U($bytes); + unless ($WARN) { + moan "unpack('U*') false positive\n"; + print "not "; + } + } + print "ok $test\n"; + $test++; + } else { + moan "unknown format\n"; + } + } +} + diff --git a/t/pragma/warn/utf8 b/t/pragma/warn/utf8 index 6a2fe54..012c655 100644 --- a/t/pragma/warn/utf8 +++ b/t/pragma/warn/utf8 @@ -24,6 +24,6 @@ my $a = "sn my $a = "snøstorm"; } EXPECT -Malformed UTF-8 character at - line 3. -Malformed UTF-8 character at - line 8. +Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 3. +Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 8. ######## diff --git a/toke.c b/toke.c index 2ec1f8c..32073a5 100644 --- a/toke.c +++ b/toke.c @@ -813,10 +813,10 @@ Perl_str_to_version(pTHX_ SV *sv) bool utf = SvUTF8(sv) ? TRUE : FALSE; char *end = start + len; while (start < end) { - I32 skip; + STRLEN skip; UV n; if (utf) - n = utf8_to_uv_chk((U8*)start, &skip, 0); + n = utf8_to_uv_chk((U8*)start, len, &skip, 0); else { n = *(U8*)start; skip = 1; @@ -1188,7 +1188,6 @@ S_scan_const(pTHX_ char *start) bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ bool has_utf = FALSE; /* embedded \x{} */ - I32 len; /* ? */ UV uv; I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) @@ -1329,20 +1328,23 @@ S_scan_const(pTHX_ char *start) /* (now in tr/// code again) */ if (*s & 0x80 && thisutf) { - (void)utf8_to_uv_chk((U8*)s, &len, 0); - if (len == 1) { - /* illegal UTF8, make it valid */ - char *old_pvx = SvPVX(sv); - /* need space for one extra char (NOTE: SvCUR() not set here) */ - d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); - d = (char*)uv_to_utf8((U8*)d, (U8)*s++); - } - else { - while (len--) - *d++ = *s++; - } - has_utf = TRUE; - continue; + STRLEN len; + UV uv; + + uv = utf8_to_uv_chk((U8*)s, send - s, &len, 1); + if (len == 1) { + /* illegal UTF8, make it valid */ + char *old_pvx = SvPVX(sv); + /* need space for one extra char (NOTE: SvCUR() not set here) */ + d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx); + d = (char*)uv_to_utf8((U8*)d, (U8)*s++); + } + else { + while (len--) + *d++ = *s++; + } + has_utf = TRUE; + continue; } /* backslashes */ @@ -1398,9 +1400,11 @@ S_scan_const(pTHX_ char *start) /* \132 indicates an octal constant */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': - len = 0; /* disallow underscores */ - uv = (UV)scan_oct(s, 3, &len); - s += len; + { + STRLEN len = 0; /* disallow underscores */ + uv = (UV)scan_oct(s, 3, &len); + s += len; + } goto NUM_ESCAPE_INSERT; /* \x24 indicates a hex constant */ @@ -1412,14 +1416,18 @@ S_scan_const(pTHX_ char *start) yyerror("Missing right brace on \\x{}"); e = s; } - len = 1; /* allow underscores */ - uv = (UV)scan_hex(s + 1, e - s - 1, &len); - s = e + 1; + { + STRLEN len = 1; /* allow underscores */ + uv = (UV)scan_hex(s + 1, e - s - 1, &len); + } + s = e + 1; } else { - len = 0; /* disallow underscores */ - uv = (UV)scan_hex(s, 2, &len); - s += len; + { + STRLEN len = 0; /* disallow underscores */ + uv = (UV)scan_hex(s, 2, &len); + s += len; + } } NUM_ESCAPE_INSERT: @@ -1528,8 +1536,10 @@ S_scan_const(pTHX_ char *start) *d = toCTRL(*d); d++; #else - len = *s++; - *d++ = toCTRL(len); + { + U8 c = *s++; + *d++ = toCTRL(c); + } #endif continue; diff --git a/utf8.c b/utf8.c index a713ea1..98236ed 100644 --- a/utf8.c +++ b/utf8.c @@ -153,12 +153,12 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) } /* -=for apidoc Am|U8* s|utf8_to_uv_chk|I32 *retlen|I32 checking +=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|I32 checking Returns the character value of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the -length, in bytes, of that character, and the pointer C will be -advanced to the end of the character. +which is assumed to be in UTF8 encoding and no longer than C; +C will be set to the length, in bytes, of that character, +and the pointer C will be advanced to the end of the character. If C does not point to a well-formed UTF8 character, the behaviour is dependent on the value of C: if this is true, it is @@ -170,79 +170,150 @@ warning is produced. */ UV -Perl_utf8_to_uv_chk(pTHX_ U8* s, I32* retlen, bool checking) +Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking) { - UV uv = *s; - int len; - if (!(uv & 0x80)) { + dTHR; + UV uv = *s, ouv; + STRLEN len = 1; + bool dowarn = ckWARN_d(WARN_UTF8); + STRLEN expectlen = 0; + + if (uv <= 0x7f) { /* Pure ASCII. */ if (retlen) *retlen = 1; return *s; } - if (!(uv & 0x40)) { - dTHR; - if (checking && retlen) { - *retlen = -1; - return 0; - } - if (ckWARN_d(WARN_UTF8)) - Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character"); - if (retlen) - *retlen = 1; - return *s; + if (uv >= 0x80 && uv <= 0xbf) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (unexpected continuation byte 0x%02x)", + uv); + goto malformed; + } + + if (uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x after byte 0x%02x)", + s[1], uv); + goto malformed; + } + + if ((uv == 0xfe || uv == 0xff) && IN_UTF8){ + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (impossible byte 0x%02x)", + uv); + goto malformed; } - if (!(uv & 0x20)) { len = 2; uv &= 0x1f; } - else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; } - else if (!(uv & 0x08)) { len = 4; uv &= 0x07; } - else if (!(uv & 0x04)) { len = 5; uv &= 0x03; } - else if (!(uv & 0x02)) { len = 6; uv &= 0x01; } - else if (!(uv & 0x01)) { len = 7; uv = 0; } + if (!(uv & 0x20)) { len = 2; uv &= 0x1f; } + else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; } + else if (!(uv & 0x08)) { len = 4; uv &= 0x07; } + else if (!(uv & 0x04)) { len = 5; uv &= 0x03; } + else if (!(uv & 0x02)) { len = 6; uv &= 0x01; } + else if (!(uv & 0x01)) { len = 7; uv = 0; } else { len = 13; uv = 0; } /* whoa! */ if (retlen) *retlen = len; - --len; + + expectlen = len; + + if (curlen < expectlen) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (%d byte%s, need %d)", + curlen, curlen > 1 ? "s" : "", expectlen); + goto malformed; + } + + len--; s++; + ouv = uv; + while (len--) { if ((*s & 0xc0) != 0x80) { - dTHR; - if (checking && retlen) { - *retlen = -1; - return 0; - } - - if (ckWARN_d(WARN_UTF8)) - Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character"); - if (retlen) - *retlen -= len + 1; - return 0xfffd; + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (unexpected continuation byte 0x%02x)", + *s); + goto malformed; } else - uv = (uv << 6) | (*s++ & 0x3f); + uv = (uv << 6) | (*s & 0x3f); + if (uv < ouv) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)", + ouv, *s); + goto malformed; + } + s++; + ouv = uv; + } + + if (uv >= 0xd800 && uv <= 0xdfff) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")", + uv); + goto malformed; + } else if (uv == 0xfffe) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (byte order mark 0x%04"UVxf")", + uv); + goto malformed; + } else if (uv == 0xffff && IN_UTF8) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (impossible character 0x%04"UVxf")", + uv); + goto malformed; + } else if (expectlen > UTF8LEN(uv)) { + if (dowarn) + Perl_warner(aTHX_ WARN_UTF8, + "Malformed UTF-8 character (%d byte%s, need %d)", + expectlen, expectlen > 1 ? "s": "", UTF8LEN(uv)); + goto malformed; } + return uv; + +malformed: + + if (checking) { + if (retlen) + *retlen = len; + return 0; + } + + if (retlen) + *retlen = -1; + + return UNICODE_REPLACEMENT_CHARACTER; } /* -=for apidoc Am|U8* s|utf8_to_uv|I32 *retlen +=for apidoc Am|U8* s|utf8_to_uv|STRLEN *retlen Returns the character value of the first character in the string C which is assumed to be in UTF8 encoding; C will be set to the length, in bytes, of that character, and the pointer C will be advanced to the end of the character. -If C does not point to a well-formed UTF8 character, an optional UTF8 -warning is produced. +If C does not point to a well-formed UTF8 character, zero is +returned and retlen is set, if possible, to -1. =cut */ UV -Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen) +Perl_utf8_to_uv(pTHX_ U8* s, STRLEN* retlen) { - return Perl_utf8_to_uv_chk(aTHX_ s, retlen, 0); + return Perl_utf8_to_uv_chk(aTHX_ s, (STRLEN)-1, retlen, 0); } /* utf8_distance(a,b) returns the number of UTF8 characters between @@ -324,7 +395,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) if (*s < 0x80) *d++ = *s++; else { - I32 ulen; + STRLEN ulen; *d++ = (U8)utf8_to_uv(s, &ulen); s += ulen; } @@ -853,7 +924,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p) if (!PL_utf8_toupper) PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_toupper, p); - return uv ? uv : utf8_to_uv_chk(p,0,0); + return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0); } UV @@ -864,7 +935,7 @@ Perl_to_utf8_title(pTHX_ U8 *p) if (!PL_utf8_totitle) PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_totitle, p); - return uv ? uv : utf8_to_uv_chk(p,0,0); + return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0); } UV @@ -875,7 +946,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p) if (!PL_utf8_tolower) PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0); uv = swash_fetch(PL_utf8_tolower, p); - return uv ? uv : utf8_to_uv_chk(p,0,0); + return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0); } /* a "swash" is a swatch hash */ @@ -965,7 +1036,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); - PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, 0, 0) & ~(needents - 1)))); + PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1)))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR)) diff --git a/utf8.h b/utf8.h index 7407335..bb494ab 100644 --- a/utf8.h +++ b/utf8.h @@ -29,7 +29,7 @@ END_EXTERN_C #define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */ -/*#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)*/ +#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8) #define IN_BYTE (PL_curcop->op_private & HINT_BYTE) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE) @@ -53,6 +53,8 @@ END_EXTERN_C (uv) < 0x80000000 ? 6 : 7 ) #endif +#define UNICODE_REPLACEMENT_CHARACTER 0xfffd + /* * Note: we try to be careful never to call the isXXX_utf8() functions * unless we're pretty sure we've seen the beginning of a UTF-8 character diff --git a/util.c b/util.c index 6c949c7..2122d4e 100644 --- a/util.c +++ b/util.c @@ -2933,7 +2933,7 @@ Perl_same_dirent(pTHX_ char *a, char *b) #endif /* !HAS_RENAME */ NV -Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen) +Perl_scan_bin(pTHX_ char *start, STRLEN len, STRLEN *retlen) { register char *s = start; register NV rnv = 0.0; @@ -3004,7 +3004,7 @@ Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen) } NV -Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen) +Perl_scan_oct(pTHX_ char *start, STRLEN len, STRLEN *retlen) { register char *s = start; register NV rnv = 0.0; @@ -3074,7 +3074,7 @@ Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen) } NV -Perl_scan_hex(pTHX_ char *start, I32 len, I32 *retlen) +Perl_scan_hex(pTHX_ char *start, STRLEN len, STRLEN *retlen) { register char *s = start; register NV rnv = 0.0;