From: Gurusamy Sarathy Date: Mon, 31 Jan 2000 04:57:42 +0000 (+0000) Subject: runtime now looks at the SVf_UTF8 bit on the SV to decide X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=7e2040f0b7c6fc88ec07b6e169aa2f75fc0130a4;p=p5sagit%2Fp5-mst-13.2.git runtime now looks at the SVf_UTF8 bit on the SV to decide whether to use widechar semantics; lexer and RE engine continue to need "use utf8" to enable unicode awareness in literals and patterns (TODO: this needs to be fixed); $1 et al are marked SvUTF8 if the pattern was compiled for utf8 (TODO: propagating it from the data is probably better) p4raw-id: //depot/perl@4930 --- diff --git a/doop.c b/doop.c index 990898d..26ac87d 100644 --- a/doop.c +++ b/doop.c @@ -933,7 +933,7 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv) s = SvPV(sv, len); if (len && !SvPOK(sv)) s = SvPV_force(sv, len); - if (IN_UTF8) { + if (DO_UTF8(sv)) { if (s && len) { char *send = s + len; char *start = s; @@ -946,12 +946,12 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv) *s = '\0'; SvCUR_set(sv, s - start); SvNIOK_off(sv); + SvUTF8_on(astr); } else sv_setpvn(astr, "", 0); } - else - if (s && len) { + else if (s && len) { s += --len; sv_setpvn(astr, s, 1); *s = '\0'; @@ -961,7 +961,7 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv) else sv_setpvn(astr, "", 0); SvSETMAGIC(sv); -} +} I32 Perl_do_chomp(pTHX_ register SV *sv) diff --git a/gv.c b/gv.c index 5a91c08..acd8501 100644 --- a/gv.c +++ b/gv.c @@ -581,9 +581,7 @@ Perl_gv_fetchpv(pTHX_ const char *nambeg, I32 add, I32 sv_type) /* No stash in name, so see how we can default */ if (!stash) { - if (isIDFIRST(*name) - || (IN_UTF8 && ((*name & 0xc0) == 0xc0) && isIDFIRST_utf8((U8*)name))) - { + if (isIDFIRST_lazy(name)) { bool global = FALSE; if (isUPPER(*name)) { diff --git a/mg.c b/mg.c index edabb11..3ba3d08 100644 --- a/mg.c +++ b/mg.c @@ -609,6 +609,10 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) PL_tainted = FALSE; } sv_setpvn(sv, s, i); + if ((PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) + SvUTF8_on(sv); + else + SvUTF8_off(sv); if (PL_tainting) PL_tainted = (was_tainted || RX_MATCH_TAINTED(rx)); break; @@ -1286,7 +1290,7 @@ Perl_magic_getpos(pTHX_ SV *sv, MAGIC *mg) if (mg && mg->mg_len >= 0) { dTHR; I32 i = mg->mg_len; - if (IN_UTF8) + if (DO_UTF8(lsv)) sv_pos_b2u(lsv, &i); sv_setiv(sv, i + PL_curcop->cop_arybase); return 0; @@ -1323,7 +1327,7 @@ Perl_magic_setpos(pTHX_ SV *sv, MAGIC *mg) pos = SvIV(sv) - PL_curcop->cop_arybase; - if (IN_UTF8) { + if (DO_UTF8(lsv)) { ulen = sv_len_utf8(lsv); if (ulen) len = ulen; diff --git a/op.c b/op.c index 02edc8d..fdfdf27 100644 --- a/op.c +++ b/op.c @@ -2867,6 +2867,8 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, OP *repl) p = SvPV(pat, plen); pm->op_pmflags |= PMf_SKIPWHITE; } + else if (DO_UTF8(pat)) + pm->op_pmdynflags |= PMdf_UTF8; pm->op_pmregexp = CALLREGCOMP(aTHX_ p, p + plen, pm); if (strEQ("\\s+", pm->op_pmregexp->precomp)) pm->op_pmflags |= PMf_WHITE; diff --git a/op.h b/op.h index fc38b9c..8bc8276 100644 --- a/op.h +++ b/op.h @@ -234,6 +234,7 @@ struct pmop { #define PMdf_USED 0x01 /* pm has been used once already */ #define PMdf_TAINTED 0x02 /* pm compiled from tainted pattern */ +#define PMdf_UTF8 0x04 /* pm compiled from utf8 data */ #define PMf_RETAINT 0x0001 /* taint $1 etc. if target tainted */ #define PMf_ONCE 0x0002 /* use pattern only once per reset */ diff --git a/pp.c b/pp.c index 13fbec8..45654a9 100644 --- a/pp.c +++ b/pp.c @@ -389,7 +389,7 @@ PP(pp_pos) mg = mg_find(sv, 'g'); if (mg && mg->mg_len >= 0) { I32 i = mg->mg_len; - if (IN_UTF8) + if (DO_UTF8(sv)) sv_pos_b2u(sv, &i); PUSHi(i + PL_curcop->cop_arybase); RETURN; @@ -1426,7 +1426,7 @@ PP(pp_negate) sv_setsv(TARG, sv); *SvPV_force(TARG, len) = *s == '-' ? '+' : '-'; } - else if (IN_UTF8 && *(U8*)s >= 0xc0 && isIDFIRST_utf8((U8*)s)) { + else if (DO_UTF8(sv) && *(U8*)s >= 0xc0 && isIDFIRST_utf8((U8*)s)) { sv_setpvn(TARG, "-", 1); sv_catsv(TARG, sv); } @@ -1930,13 +1930,12 @@ PP(pp_oct) PP(pp_length) { djSP; dTARGET; + SV *sv = TOPs; - if (IN_UTF8) { - SETi( sv_len_utf8(TOPs) ); - RETURN; - } - - SETi( sv_len(TOPs) ); + if (DO_UTF8(sv)) + SETi(sv_len_utf8(sv)); + else + SETi(sv_len(sv)); RETURN; } @@ -1957,6 +1956,7 @@ PP(pp_substr) STRLEN repl_len; SvTAINTED_off(TARG); /* decontaminate */ + SvUTF8_off(TARG); /* decontaminate */ if (MAXARG > 2) { if (MAXARG > 3) { sv = POPs; @@ -1968,7 +1968,7 @@ PP(pp_substr) sv = POPs; PUTBACK; tmps = SvPV(sv, curlen); - if (IN_UTF8) { + if (DO_UTF8(sv)) { utfcurlen = sv_len_utf8(sv); if (utfcurlen == curlen) utfcurlen = 0; @@ -2017,8 +2017,10 @@ PP(pp_substr) RETPUSHUNDEF; } else { - if (utfcurlen) + if (utfcurlen) { sv_pos_u2b(sv, &pos, &rem); + SvUTF8_on(TARG); + } tmps += pos; sv_setpvn(TARG, tmps, rem); if (repl) @@ -2106,7 +2108,7 @@ PP(pp_index) little = POPs; big = POPs; tmps = SvPV(big, biglen); - if (IN_UTF8 && offset > 0) + if (offset > 0 && DO_UTF8(big)) sv_pos_u2b(big, &offset, 0); if (offset < 0) offset = 0; @@ -2117,7 +2119,7 @@ PP(pp_index) retval = -1; else retval = tmps2 - tmps; - if (IN_UTF8 && retval > 0) + if (retval > 0 && DO_UTF8(big)) sv_pos_b2u(big, &retval); PUSHi(retval + arybase); RETURN; @@ -2145,7 +2147,7 @@ PP(pp_rindex) if (MAXARG < 3) offset = blen; else { - if (IN_UTF8 && offset > 0) + if (offset > 0 && DO_UTF8(big)) sv_pos_u2b(big, &offset, 0); offset = offset - arybase + llen; } @@ -2158,7 +2160,7 @@ PP(pp_rindex) retval = -1; else retval = tmps2 - tmps; - if (IN_UTF8 && retval > 0) + if (retval > 0 && DO_UTF8(big)) sv_pos_b2u(big, &retval); PUSHi(retval + arybase); RETURN; @@ -2179,10 +2181,11 @@ PP(pp_ord) djSP; dTARGET; UV value; STRLEN n_a; - U8 *tmps = (U8*)POPpx; + SV *tmpsv = POPs; + U8 *tmps = (U8*)SvPVx(tmpsv,n_a); I32 retlen; - if (IN_UTF8 && (*tmps & 0x80)) + if ((*tmps & 0x80) && DO_UTF8(tmpsv)) value = utf8_to_uv(tmps, &retlen); else value = (UV)(*tmps & 255); @@ -2196,14 +2199,16 @@ PP(pp_chr) char *tmps; U32 value = POPu; + SvUTF8_off(TARG); /* decontaminate */ (void)SvUPGRADE(TARG,SVt_PV); - if (IN_UTF8 && value >= 128) { + if (value >= 128 && !IN_BYTE) { SvGROW(TARG,8); tmps = SvPVX(TARG); tmps = (char*)uv_to_utf8((U8*)tmps, (UV)value); SvCUR_set(TARG, tmps - SvPVX(TARG)); *tmps = '\0'; + SvUTF8_on(TARG); (void)SvPOK_only(TARG); XPUSHs(TARG); RETURN; @@ -2245,7 +2250,7 @@ PP(pp_ucfirst) register U8 *s; STRLEN slen; - if (IN_UTF8 && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) { + if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) { I32 ulen; U8 tmpbuf[10]; U8 *tend; @@ -2265,6 +2270,7 @@ PP(pp_ucfirst) dTARGET; sv_setpvn(TARG, (char*)tmpbuf, tend - tmpbuf); sv_catpvn(TARG, (char*)(s + ulen), slen - ulen); + SvUTF8_on(TARG); SETs(TARG); } else { @@ -2275,6 +2281,7 @@ PP(pp_ucfirst) else { if (!SvPADTMP(sv) || SvREADONLY(sv)) { dTARGET; + SvUTF8_off(TARG); /* decontaminate */ sv_setsv(TARG, sv); sv = TARG; SETs(sv); @@ -2302,7 +2309,7 @@ PP(pp_lcfirst) register U8 *s; STRLEN slen; - if (IN_UTF8 && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) { + if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) { I32 ulen; U8 tmpbuf[10]; U8 *tend; @@ -2322,6 +2329,7 @@ PP(pp_lcfirst) dTARGET; sv_setpvn(TARG, (char*)tmpbuf, tend - tmpbuf); sv_catpvn(TARG, (char*)(s + ulen), slen - ulen); + SvUTF8_on(TARG); SETs(TARG); } else { @@ -2332,6 +2340,7 @@ PP(pp_lcfirst) else { if (!SvPADTMP(sv) || SvREADONLY(sv)) { dTARGET; + SvUTF8_off(TARG); /* decontaminate */ sv_setsv(TARG, sv); sv = TARG; SETs(sv); @@ -2346,7 +2355,6 @@ PP(pp_lcfirst) else *s = toLOWER(*s); } - SETs(sv); } if (SvSMAGICAL(sv)) mg_set(sv); @@ -2360,7 +2368,7 @@ PP(pp_uc) register U8 *s; STRLEN len; - if (IN_UTF8) { + if (DO_UTF8(sv)) { dTARGET; I32 ulen; register U8 *d; @@ -2368,6 +2376,7 @@ PP(pp_uc) s = (U8*)SvPV(sv,len); if (!len) { + SvUTF8_off(TARG); /* decontaminate */ sv_setpvn(TARG, "", 0); SETs(TARG); } @@ -2392,6 +2401,7 @@ PP(pp_uc) } } *d = '\0'; + SvUTF8_on(TARG); SvCUR_set(TARG, d - (U8*)SvPVX(TARG)); SETs(TARG); } @@ -2399,6 +2409,7 @@ PP(pp_uc) else { if (!SvPADTMP(sv) || SvREADONLY(sv)) { dTARGET; + SvUTF8_off(TARG); /* decontaminate */ sv_setsv(TARG, sv); sv = TARG; SETs(sv); @@ -2431,7 +2442,7 @@ PP(pp_lc) register U8 *s; STRLEN len; - if (IN_UTF8) { + if (DO_UTF8(sv)) { dTARGET; I32 ulen; register U8 *d; @@ -2439,6 +2450,7 @@ PP(pp_lc) s = (U8*)SvPV(sv,len); if (!len) { + SvUTF8_off(TARG); /* decontaminate */ sv_setpvn(TARG, "", 0); SETs(TARG); } @@ -2463,6 +2475,7 @@ PP(pp_lc) } } *d = '\0'; + SvUTF8_on(TARG); SvCUR_set(TARG, d - (U8*)SvPVX(TARG)); SETs(TARG); } @@ -2470,6 +2483,7 @@ PP(pp_lc) else { if (!SvPADTMP(sv) || SvREADONLY(sv)) { dTARGET; + SvUTF8_off(TARG); /* decontaminate */ sv_setsv(TARG, sv); sv = TARG; SETs(sv); @@ -2504,11 +2518,12 @@ PP(pp_quotemeta) register char *s = SvPV(sv,len); register char *d; + SvUTF8_off(TARG); /* decontaminate */ if (len) { (void)SvUPGRADE(TARG, SVt_PV); SvGROW(TARG, (len * 2) + 1); d = SvPVX(TARG); - if (IN_UTF8) { + if (DO_UTF8(sv)) { while (len) { if (*s & 0x80) { STRLEN ulen = UTF8SKIP(s); @@ -2525,6 +2540,7 @@ PP(pp_quotemeta) len--; } } + SvUTF8_on(TARG); } else { while (len--) { @@ -3184,13 +3200,14 @@ PP(pp_reverse) dTARGET; STRLEN len; + SvUTF8_off(TARG); /* decontaminate */ if (SP - MARK > 1) do_join(TARG, &PL_sv_no, MARK, SP); else sv_setsv(TARG, (SP > MARK) ? *SP : DEFSV); up = SvPV_force(TARG, len); if (len > 1) { - if (IN_UTF8) { /* first reverse each character */ + if (DO_UTF8(TARG)) { /* first reverse each character */ U8* s = (U8*)SvPVX(TARG); U8* send = (U8*)(s + len); while (s < send) { diff --git a/pp_ctl.c b/pp_ctl.c index 8e41646..fd725a3 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -114,6 +114,10 @@ PP(pp_regcomp) PL_reginterp_cnt = I32_MAX; /* Mark as safe. */ pm->op_pmflags = pm->op_pmpermflags; /* reset case sensitivity */ + if (DO_UTF8(tmpstr)) + pm->op_pmdynflags |= PMdf_UTF8; + else + pm->op_pmdynflags &= ~PMdf_UTF8; pm->op_pmregexp = CALLREGCOMP(aTHX_ t, t + len, pm); PL_reginterp_cnt = 0; /* XXXX Be extra paranoid - needed inside tie/overload accessors. */ @@ -296,7 +300,8 @@ PP(pp_formline) NV value; bool gotsome; STRLEN len; - STRLEN fudge = SvCUR(tmpForm) * (IN_UTF8 ? 3 : 1) + 1; + STRLEN fudge = SvCUR(tmpForm) * (IN_BYTE ? 1 : 3) + 1; + bool item_is_utf = FALSE; if (!SvMAGICAL(tmpForm) || !SvCOMPILED(tmpForm)) { SvREADONLY_off(tmpForm); @@ -374,7 +379,7 @@ PP(pp_formline) case FF_CHECKNL: item = s = SvPV(sv, len); itemsize = len; - if (IN_UTF8) { + if (DO_UTF8(sv)) { itemsize = sv_len_utf8(sv); if (itemsize != len) { I32 itembytes; @@ -393,11 +398,13 @@ PP(pp_formline) break; s++; } + item_is_utf = TRUE; itemsize = s - item; sv_pos_b2u(sv, &itemsize); break; } } + item_is_utf = FALSE; if (itemsize > fieldsize) itemsize = fieldsize; send = chophere = s + itemsize; @@ -414,7 +421,7 @@ PP(pp_formline) case FF_CHECKCHOP: item = s = SvPV(sv, len); itemsize = len; - if (IN_UTF8) { + if (DO_UTF8(sv)) { itemsize = sv_len_utf8(sv); if (itemsize != len) { I32 itembytes; @@ -452,9 +459,11 @@ PP(pp_formline) itemsize = chophere - item; sv_pos_b2u(sv, &itemsize); } + item_is_utf = TRUE; break; } } + item_is_utf = FALSE; if (itemsize <= fieldsize) { send = chophere = s + itemsize; while (s < send) { @@ -510,7 +519,7 @@ PP(pp_formline) case FF_ITEM: arg = itemsize; s = item; - if (IN_UTF8) { + if (item_is_utf) { while (arg--) { if (*s & 0x80) { switch (UTF8SKIP(s)) { @@ -553,6 +562,7 @@ PP(pp_formline) case FF_LINEGLOB: item = s = SvPV(sv, len); itemsize = len; + item_is_utf = FALSE; /* XXX is this correct? */ if (itemsize) { gotsome = TRUE; send = s + itemsize; diff --git a/pp_hot.c b/pp_hot.c index 18d717b..6ebde09 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -2737,7 +2737,7 @@ S_method_common(pTHX_ SV* meth, U32* hashp) !(ob=(SV*)GvIO(iogv))) { if (!packname || - ((*(U8*)packname >= 0xc0 && IN_UTF8) + ((*(U8*)packname >= 0xc0 && DO_UTF8(sv)) ? !isIDFIRST_utf8((U8*)packname) : !isIDFIRST(*packname) )) diff --git a/regcomp.c b/regcomp.c index 77a4bfc..5972724 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1339,8 +1339,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - if (PL_curcop == &PL_compiling ? (PL_hints & HINT_UTF8) : IN_UTF8) + if (PL_curcop == &PL_compiling ? (PL_hints & HINT_UTF8) : IN_UTF8) { PL_reg_flags |= RF_utf8; + pm->op_pmdynflags |= PMdf_UTF8; + } else PL_reg_flags = 0; diff --git a/sv.c b/sv.c index 834dac3..d76752f 100644 --- a/sv.c +++ b/sv.c @@ -2769,7 +2769,7 @@ Perl_sv_setsv(pTHX_ SV *dstr, register SV *sstr) *SvEND(dstr) = '\0'; (void)SvPOK_only(dstr); } - if (SvUTF8(sstr)) + if (DO_UTF8(sstr)) SvUTF8_on(dstr); /*SUPPRESS 560*/ if (sflags & SVp_NOK) { @@ -5638,6 +5638,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV STRLEN origlen; I32 svix = 0; static char nullstr[] = "(null)"; + SV *argsv; /* no matter what, this is a string now */ (void)SvPV_force(sv, origlen); @@ -5652,12 +5653,18 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV char *s = va_arg(*args, char*); sv_catpv(sv, s ? s : nullstr); } - else if (svix < svmax) + else if (svix < svmax) { sv_catsv(sv, *svargs); + if (DO_UTF8(*svargs)) + SvUTF8_on(sv); + } return; case '_': if (args) { - sv_catsv(sv, va_arg(*args, SV*)); + argsv = va_arg(*args, SV*); + sv_catsv(sv, argsv); + if (DO_UTF8(argsv)) + SvUTF8_on(sv); return; } /* See comment on '_' below */ @@ -5676,6 +5683,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV STRLEN zeros = 0; bool has_precis = FALSE; STRLEN precis = 0; + bool is_utf = FALSE; char esignbuf[4]; U8 utf8buf[10]; @@ -5816,22 +5824,20 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV goto string; case 'c': - if (IN_UTF8) { - if (args) - uv = va_arg(*args, int); - else - uv = (svix < svmax) ? SvIVx(svargs[svix++]) : 0; - + if (args) + uv = va_arg(*args, int); + else + uv = (svix < svmax) ? SvIVx(svargs[svix++]) : 0; + if (uv >= 128 && !IN_BYTE) { eptr = (char*)utf8buf; elen = uv_to_utf8((U8*)eptr, uv) - utf8buf; - goto string; + is_utf = TRUE; + } + else { + c = (char)uv; + eptr = &c; + elen = 1; } - if (args) - c = va_arg(*args, int); - else - c = (svix < svmax) ? SvIVx(svargs[svix++]) : 0; - eptr = &c; - elen = 1; goto string; case 's': @@ -5851,16 +5857,18 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV } } else if (svix < svmax) { - eptr = SvPVx(svargs[svix++], elen); - if (IN_UTF8) { + argsv = svargs[svix++]; + eptr = SvPVx(argsv, elen); + if (DO_UTF8(argsv)) { if (has_precis && precis < elen) { I32 p = precis; - sv_pos_u2b(svargs[svix - 1], &p, 0); /* sticks at end */ + sv_pos_u2b(argsv, &p, 0); /* sticks at end */ precis = p; } if (width) { /* fudge width (can't fudge elen) */ - width += elen - sv_len_utf8(svargs[svix - 1]); + width += elen - sv_len_utf8(argsv); } + is_utf = TRUE; } } goto string; @@ -5873,7 +5881,10 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV */ if (!args) goto unknown; - eptr = SvPVx(va_arg(*args, SV*), elen); + argsv = va_arg(*args,SV*); + eptr = SvPVx(argsv, elen); + if (DO_UTF8(argsv)) + is_utf = TRUE; string: if (has_precis && elen > precis) @@ -6216,6 +6227,8 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV memset(p, ' ', gap); p += gap; } + if (is_utf) + SvUTF8_on(sv); *p = '\0'; SvCUR(sv) = p - SvPVX(sv); } diff --git a/t/pragma/warn/doop b/t/pragma/warn/doop index cce6bdc..c16e24f 100644 --- a/t/pragma/warn/doop +++ b/t/pragma/warn/doop @@ -1,6 +1,6 @@ doop.c AOK - Malformed UTF-8 character + \x%s will produce malformed UTF-8 character; use \x{%s} for that __END__ @@ -9,7 +9,6 @@ use utf8 ; $_ = "\x80 \xff" ; chop ; EXPECT -Malformed UTF-8 character at - line 4. ######## # doop.c BEGIN { @@ -28,4 +27,3 @@ chop ; EXPECT \x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10. \xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10. -Malformed UTF-8 character at - line 11. diff --git a/t/pragma/warn/pp b/t/pragma/warn/pp index eb09e05..4c70fd5 100644 --- a/t/pragma/warn/pp +++ b/t/pragma/warn/pp @@ -30,7 +30,8 @@ Mandatory Warnings ------------------ - Malformed UTF-8 character + Malformed UTF-8 character (not tested: difficult to produce with + perl now) __END__ # pp.c @@ -109,7 +110,6 @@ use utf8 ; $_ = "\x80 \xff" ; reverse ; EXPECT -Malformed UTF-8 character at - line 4. ######## # pp.c BEGIN { @@ -128,4 +128,3 @@ reverse ; EXPECT \x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10. \xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10. -Malformed UTF-8 character at - line 11. diff --git a/t/pragma/warn/sv b/t/pragma/warn/sv index 97d61bc..cdec48e 100644 --- a/t/pragma/warn/sv +++ b/t/pragma/warn/sv @@ -38,8 +38,8 @@ Mandatory Warnings ------------------ - Malformed UTF-8 character [sv_pos_b2u] - my $a = rindex "a\xff bc ", "bc" ; + Malformed UTF-8 character [sv_pos_b2u] (not tested: difficult to produce + with perl now) Mandatory Warnings TODO ------------------ @@ -286,8 +286,6 @@ $^W =0 ; my $a = rindex "a\xff bc ", "bc" ; EXPECT \xff will produce malformed UTF-8 character; use \x{ff} for that at - line 12. -Malformed UTF-8 character at - line 12. -Malformed UTF-8 character at - line 16. ######## # sv.c use warnings 'misc'; diff --git a/t/pragma/warn/utf8 b/t/pragma/warn/utf8 index 19b8d1d..cb1f202 100644 --- a/t/pragma/warn/utf8 +++ b/t/pragma/warn/utf8 @@ -1,14 +1,13 @@ utf8.c AOK - All Mandatory warnings - [utf8_to_uv] Malformed UTF-8 character my $a = ord "\x80" ; Malformed UTF-8 character my $a = ord "\xf080" ; + <<<<<< this warning can't be easily triggered from perl anymore [utf16_to_utf8] Malformed UTF-16 surrogate @@ -19,7 +18,6 @@ __END__ use utf8 ; my $a = ord "\x80" ; EXPECT -Malformed UTF-8 character at - line 3. ######## # utf8.c [utf8_to_uv] BEGIN { @@ -37,15 +35,12 @@ my $a = ord "\x80" ; my $a = ord "\x80" ; } EXPECT -Malformed UTF-8 character at - line 9. \x80 will produce malformed UTF-8 character; use \x{80} for that at - line 12. -Malformed UTF-8 character at - line 12. ######## # utf8.c [utf8_to_uv] use utf8 ; my $a = ord "\xf080" ; EXPECT -Malformed UTF-8 character at - line 3. ######## # utf8.c [utf8_to_uv] BEGIN { @@ -63,6 +58,4 @@ my $a = ord "\xf080" ; my $a = ord "\xf080" ; } EXPECT -Malformed UTF-8 character at - line 9. \xf0 will produce malformed UTF-8 character; use \x{f0} for that at - line 12. -Malformed UTF-8 character at - line 12. diff --git a/toke.c b/toke.c index e7e2174..cc370bc 100644 --- a/toke.c +++ b/toke.c @@ -32,19 +32,8 @@ static void restore_rsfp(pTHXo_ void *f); #define XFAKEBRACK 128 #define XENUMMASK 127 +/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/ #define UTF (PL_hints & HINT_UTF8) -/* - * Note: we try to be careful never to call the isXXX_utf8() functions - * unless we're pretty sure we've seen the beginning of a UTF-8 character - * (that is, the two high bits are set). Otherwise we risk loading in the - * heavy-duty SWASHINIT and SWASHGET routines unnecessarily. - */ -#define isIDFIRST_lazy(p) ((!UTF || (*((U8*)p) < 0xc0)) \ - ? isIDFIRST(*(p)) \ - : isIDFIRST_utf8((U8*)p)) -#define isALNUM_lazy(p) ((!UTF || (*((U8*)p) < 0xc0)) \ - ? isALNUM(*(p)) \ - : isALNUM_utf8((U8*)p)) /* In variables name $^X, these are the legal values for X. * 1999-02-27 mjd-perl-patch@plover.com */ @@ -223,9 +212,9 @@ S_no_op(pTHX_ char *what, char *s) yywarn(Perl_form(aTHX_ "%s found where operator expected", what)); if (is_first) Perl_warn(aTHX_ "\t(Missing semicolon on previous line?)\n"); - else if (PL_oldoldbufptr && isIDFIRST_lazy(PL_oldoldbufptr)) { + else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) { char *t; - for (t = PL_oldoldbufptr; *t && (isALNUM_lazy(t) || *t == ':'); t++) ; + for (t = PL_oldoldbufptr; *t && (isALNUM_lazy_if(t,UTF) || *t == ':'); t++) ; if (t < PL_bufptr && isSPACE(*t)) Perl_warn(aTHX_ "\t(Do you need to predeclare %.*s?)\n", t - PL_oldoldbufptr, PL_oldoldbufptr); @@ -633,7 +622,7 @@ S_check_uni(pTHX) return; while (isSPACE(*PL_last_uni)) PL_last_uni++; - for (s = PL_last_uni; isALNUM_lazy(s) || *s == '-'; s++) ; + for (s = PL_last_uni; isALNUM_lazy_if(s,UTF) || *s == '-'; s++) ; if ((t = strchr(s, '(')) && t < PL_bufptr) return; if (ckWARN_d(WARN_AMBIGUOUS)){ @@ -756,7 +745,7 @@ S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow start = skipspace(start); s = start; - if (isIDFIRST_lazy(s) || + if (isIDFIRST_lazy_if(s,UTF) || (allow_pack && *s == ':') || (allow_initial_tick && *s == '\'') ) { @@ -1159,6 +1148,7 @@ S_scan_const(pTHX_ char *start) register char *s = start; /* start of the constant */ register char *d = SvPVX(sv); /* destination for copies */ bool dorange = FALSE; /* are we in a translit range? */ + bool has_utf = FALSE; /* embedded \x{} */ I32 len; /* ? */ I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) @@ -1264,7 +1254,8 @@ S_scan_const(pTHX_ char *start) } /* check for embedded arrays (@foo, @:foo, @'foo, @{foo}, @$foo) */ - else if (*s == '@' && s[1] && (isALNUM_lazy(s+1) || strchr(":'{$", s[1]))) + else if (*s == '@' && s[1] + && (isALNUM_lazy_if(s+1,UTF) || strchr(":'{$", s[1]))) break; /* check for embedded scalars. only stop if we're sure it's a @@ -1368,6 +1359,7 @@ S_scan_const(pTHX_ char *start) d = (char*)uv_to_utf8((U8*)d, (UV)scan_hex(s + 1, e - s - 1, &len)); s = e + 1; + has_utf = TRUE; } else { UV uv = (UV)scan_hex(s, 2, &len); @@ -1375,6 +1367,7 @@ S_scan_const(pTHX_ char *start) utf != (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) { d = (char*)uv_to_utf8((U8*)d, uv); /* doing a CU or UC */ + has_utf = TRUE; } else { if (uv >= 127 && UTF) { @@ -1485,6 +1478,8 @@ S_scan_const(pTHX_ char *start) *d = '\0'; SvCUR_set(sv, d - SvPVX(sv)); SvPOK_on(sv); + if (has_utf) + SvUTF8_on(sv); /* shrink the sv if we allocated more than we used */ if (SvCUR(sv) + 5 < SvLEN(sv)) { @@ -1593,7 +1588,7 @@ S_intuit_more(pTHX_ register char *s) case '&': case '$': weight -= seen[un_char] * 10; - if (isALNUM_lazy(s+1)) { + if (isALNUM_lazy_if(s+1,UTF)) { scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE); if ((int)strlen(tmpbuf) > 1 && gv_fetchpv(tmpbuf,FALSE, SVt_PV)) weight -= 100; @@ -2285,7 +2280,7 @@ Perl_yylex(pTHX) retry: switch (*s) { default: - if (isIDFIRST_lazy(s)) + if (isIDFIRST_lazy_if(s,UTF)) goto keylookup; Perl_croak(aTHX_ "Unrecognized character \\x%02X", *s & 255); case 4: @@ -2653,7 +2648,7 @@ Perl_yylex(pTHX) else if (*s == '>') { s++; s = skipspace(s); - if (isIDFIRST_lazy(s)) { + if (isIDFIRST_lazy_if(s,UTF)) { s = force_word(s,METHOD,FALSE,TRUE,FALSE); TOKEN(ARROW); } @@ -2749,7 +2744,7 @@ Perl_yylex(pTHX) grabattrs: s = skipspace(s); attrs = Nullop; - while (isIDFIRST_lazy(s)) { + while (isIDFIRST_lazy_if(s,UTF)) { d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len); if (isLOWER(*s) && (tmp = keyword(PL_tokenbuf, len))) { if (tmp < 0) tmp = -tmp; @@ -2894,7 +2889,7 @@ Perl_yylex(pTHX) while (d < PL_bufend && (*d == ' ' || *d == '\t')) d++; } - if (d < PL_bufend && isIDFIRST_lazy(d)) { + if (d < PL_bufend && isIDFIRST_lazy_if(d,UTF)) { d = scan_word(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE, &len); while (d < PL_bufend && (*d == ' ' || *d == '\t')) @@ -2985,9 +2980,9 @@ Perl_yylex(pTHX) } t++; } - else if (isALNUM_lazy(t)) { + else if (isALNUM_lazy_if(t,UTF)) { t += UTF8SKIP(t); - while (t < PL_bufend && isALNUM_lazy(t)) + while (t < PL_bufend && isALNUM_lazy_if(t,UTF)) t += UTF8SKIP(t); } while (t < PL_bufend && isSPACE(*t)) @@ -3047,7 +3042,9 @@ Perl_yylex(pTHX) AOPERATOR(ANDAND); s--; if (PL_expect == XOPERATOR) { - if (ckWARN(WARN_SEMICOLON) && isIDFIRST_lazy(s) && PL_bufptr == PL_linestart) { + if (ckWARN(WARN_SEMICOLON) + && isIDFIRST_lazy_if(s,UTF) && PL_bufptr == PL_linestart) + { CopLINE_dec(PL_curcop); Perl_warner(aTHX_ WARN_SEMICOLON, PL_warn_nosemi); CopLINE_inc(PL_curcop); @@ -3177,7 +3174,7 @@ Perl_yylex(pTHX) } } - if (s[1] == '#' && (isIDFIRST_lazy(s+2) || strchr("{$:+-", s[2]))) { + if (s[1] == '#' && (isIDFIRST_lazy_if(s+2,UTF) || strchr("{$:+-", s[2]))) { PL_tokenbuf[0] = '@'; s = scan_ident(s + 1, PL_bufend, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE); @@ -3220,7 +3217,7 @@ Perl_yylex(pTHX) PL_tokenbuf[0] = '@'; if (ckWARN(WARN_SYNTAX)) { for(t = s + 1; - isSPACE(*t) || isALNUM_lazy(t) || *t == '$'; + isSPACE(*t) || isALNUM_lazy_if(t,UTF) || *t == '$'; t++) ; if (*t++ == ',') { PL_bufptr = skipspace(PL_bufptr); @@ -3240,7 +3237,7 @@ Perl_yylex(pTHX) char tmpbuf[sizeof PL_tokenbuf]; STRLEN len; for (t++; isSPACE(*t); t++) ; - if (isIDFIRST_lazy(t)) { + if (isIDFIRST_lazy_if(t,UTF)) { t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE, &len); for (; isSPACE(*t); t++) ; if (*t == ';' && get_cv(tmpbuf, FALSE)) @@ -3258,9 +3255,9 @@ Perl_yylex(pTHX) PL_expect = XOPERATOR; else if (strchr("$@\"'`q", *s)) PL_expect = XTERM; /* e.g. print $fh "foo" */ - else if (strchr("&*<%", *s) && isIDFIRST_lazy(s+1)) + else if (strchr("&*<%", *s) && isIDFIRST_lazy_if(s+1,UTF)) PL_expect = XTERM; /* e.g. print $fh &sub */ - else if (isIDFIRST_lazy(s)) { + else if (isIDFIRST_lazy_if(s,UTF)) { char tmpbuf[sizeof PL_tokenbuf]; scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len); if (tmp = keyword(tmpbuf, len)) { @@ -3318,7 +3315,7 @@ Perl_yylex(pTHX) if (ckWARN(WARN_SYNTAX)) { if (*s == '[' || *s == '{') { char *t = s + 1; - while (*t && (isALNUM_lazy(t) || strchr(" \t$#+-'\"", *t))) + while (*t && (isALNUM_lazy_if(t,UTF) || strchr(" \t$#+-'\"", *t))) t++; if (*t == '}' || *t == ']') { t++; @@ -3339,7 +3336,8 @@ Perl_yylex(pTHX) /* Disable warning on "study /blah/" */ if (PL_oldoldbufptr == PL_last_uni && (*PL_last_uni != 's' || s - PL_last_uni < 5 - || memNE(PL_last_uni, "study", 5) || isALNUM_lazy(PL_last_uni+5))) + || memNE(PL_last_uni, "study", 5) + || isALNUM_lazy_if(PL_last_uni+5,UTF))) check_uni(); s = scan_pat(s,OP_MATCH); TERM(sublex_start()); @@ -3665,7 +3663,7 @@ Perl_yylex(pTHX) /* Two barewords in a row may indicate method call. */ - if ((isIDFIRST_lazy(s) || *s == '$') && (tmp=intuit_method(s,gv))) + if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp=intuit_method(s,gv))) return tmp; /* If not a declared subroutine, it's an indirect object. */ @@ -3711,7 +3709,7 @@ Perl_yylex(pTHX) /* If followed by a bareword, see if it looks like indir obj. */ - if ((isIDFIRST_lazy(s) || *s == '$') && (tmp = intuit_method(s,gv))) + if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp = intuit_method(s,gv))) return tmp; /* Not a method, so call it a subroutine (if defined) */ @@ -4045,7 +4043,7 @@ Perl_yylex(pTHX) case KEY_foreach: yylval.ival = CopLINE(PL_curcop); s = skipspace(s); - if (PL_expect == XSTATE && isIDFIRST_lazy(s)) { + if (PL_expect == XSTATE && isIDFIRST_lazy_if(s,UTF)) { char *p = s; if ((PL_bufend - p) >= 3 && strnEQ(p, "my", 2) && isSPACE(*(p + 2))) @@ -4054,7 +4052,7 @@ Perl_yylex(pTHX) strnEQ(p, "our", 3) && isSPACE(*(p + 3))) p += 3; p = skipspace(p); - if (isIDFIRST_lazy(p)) { + if (isIDFIRST_lazy_if(p,UTF)) { p = scan_ident(p, PL_bufend, PL_tokenbuf, sizeof PL_tokenbuf, TRUE); p = skipspace(p); @@ -4269,7 +4267,7 @@ Perl_yylex(pTHX) case KEY_my: PL_in_my = tmp; s = skipspace(s); - if (isIDFIRST_lazy(s)) { + if (isIDFIRST_lazy_if(s,UTF)) { s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len); if (len == 3 && strnEQ(PL_tokenbuf, "sub", 3)) goto really_sub; @@ -4307,9 +4305,9 @@ Perl_yylex(pTHX) case KEY_open: s = skipspace(s); - if (isIDFIRST_lazy(s)) { + if (isIDFIRST_lazy_if(s,UTF)) { char *t; - for (d = s; isALNUM_lazy(d); d++) ; + for (d = s; isALNUM_lazy_if(d,UTF); d++) ; t = skipspace(d); if (strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_AMBIGUOUS)) Perl_warner(aTHX_ WARN_AMBIGUOUS, @@ -4448,7 +4446,7 @@ Perl_yylex(pTHX) else { *PL_tokenbuf = '\0'; s = force_word(s,WORD,TRUE,TRUE,FALSE); - if (isIDFIRST_lazy(PL_tokenbuf)) + if (isIDFIRST_lazy_if(PL_tokenbuf,UTF)) gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf), TRUE); else if (*s == '<') yyerror("<> should be quotes"); @@ -4639,7 +4637,7 @@ Perl_yylex(pTHX) s = skipspace(s); - if (isIDFIRST_lazy(s) || *s == '\'' || + if (isIDFIRST_lazy_if(s,UTF) || *s == '\'' || (*s == ':' && s[1] == ':')) { PL_expect = XBLOCK; @@ -5529,9 +5527,9 @@ S_checkcomma(pTHX_ register char *s, char *name, char *what) s++; while (s < PL_bufend && isSPACE(*s)) s++; - if (isIDFIRST_lazy(s)) { + if (isIDFIRST_lazy_if(s,UTF)) { w = s++; - while (isALNUM_lazy(s)) + while (isALNUM_lazy_if(s,UTF)) s++; while (s < PL_bufend && isSPACE(*s)) s++; @@ -5653,7 +5651,7 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag Perl_croak(aTHX_ ident_too_long); if (isALNUM(*s)) /* UTF handled below */ *d++ = *s++; - else if (*s == '\'' && allow_package && isIDFIRST_lazy(s+1)) { + else if (*s == '\'' && allow_package && isIDFIRST_lazy_if(s+1,UTF)) { *d++ = ':'; *d++ = ':'; s++; @@ -5705,7 +5703,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des Perl_croak(aTHX_ ident_too_long); if (isALNUM(*s)) /* UTF handled below */ *d++ = *s++; - else if (*s == '\'' && isIDFIRST_lazy(s+1)) { + else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) { *d++ = ':'; *d++ = ':'; s++; @@ -5736,7 +5734,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des return s; } if (*s == '$' && s[1] && - (isALNUM_lazy(s+1) || strchr("${", s[1]) || strnEQ(s+1,"::",2)) ) + (isALNUM_lazy_if(s+1,UTF) || strchr("${", s[1]) || strnEQ(s+1,"::",2)) ) { return s; } @@ -5763,11 +5761,11 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des } } } - if (isIDFIRST_lazy(d)) { + if (isIDFIRST_lazy_if(d,UTF)) { d++; if (UTF) { e = s; - while (e < send && isALNUM_lazy(e) || *e == ':') { + while (e < send && isALNUM_lazy_if(e,UTF) || *e == ':') { e += UTF8SKIP(e); while (e < send && *e & 0x80 && is_utf8_mark((U8*)e)) e += UTF8SKIP(e); @@ -6071,9 +6069,9 @@ S_scan_heredoc(pTHX_ register char *s) s++, term = '\''; else term = '"'; - if (!isALNUM_lazy(s)) + if (!isALNUM_lazy_if(s,UTF)) deprecate("bare << to mean <<\"\""); - for (; isALNUM_lazy(s); s++) { + for (; isALNUM_lazy_if(s,UTF); s++) { if (d < e) *d++ = *s; } @@ -6284,7 +6282,7 @@ S_scan_inputsymbol(pTHX_ char *start) if (*d == '$' && d[1]) d++; /* allow or */ - while (*d && (isALNUM_lazy(d) || *d == '\'' || *d == ':')) + while (*d && (isALNUM_lazy_if(d,UTF) || *d == '\'' || *d == ':')) d++; /* If we've tried to read what we allow filehandles to look like, and diff --git a/utf8.h b/utf8.h index e71264c..578abc7 100644 --- a/utf8.h +++ b/utf8.h @@ -28,5 +28,21 @@ END_EXTERN_C #define IN_UTF8 (PL_curcop->op_private & HINT_UTF8) #define IN_BYTE (PL_curcop->op_private & HINT_BYTE) +#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE) #define UTF8SKIP(s) PL_utf8skip[*(U8*)s] + +/* + * Note: we try to be careful never to call the isXXX_utf8() functions + * unless we're pretty sure we've seen the beginning of a UTF-8 character + * (that is, the two high bits are set). Otherwise we risk loading in the + * heavy-duty SWASHINIT and SWASHGET routines unnecessarily. + */ +#define isIDFIRST_lazy_if(p,c) ((!c || (*((U8*)p) < 0xc0)) \ + ? isIDFIRST(*(p)) \ + : isIDFIRST_utf8((U8*)p)) +#define isALNUM_lazy_if(p,c) ((!c || (*((U8*)p) < 0xc0)) \ + ? isALNUM(*(p)) \ + : isALNUM_utf8((U8*)p)) +#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1) +#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)