X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=6726ba104c3b2be90e38e049d3391f5fec6bca0d;hb=24b5d5ccc3bc7535f387e48d7e29656751ae98a5;hp=7e1e6bd6d50f43d0cb4ca8f96a040a22f7fa4656;hpb=9dc45d57a9216877b08d137c07effd664b49bda8;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 7e1e6bd..6726ba1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -385,15 +385,14 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN(loc,m) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END \ #define vWARNdep(loc,m) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \ - Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END \ @@ -401,7 +400,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN2(loc, m, a1) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\ a1, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -409,7 +408,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN3(loc, m, a1, a2) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -417,7 +416,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN4(loc, m, a1, a2, a3) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\ a1, a2, a3, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -426,7 +425,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN5(loc, m, a1, a2, a3, a4) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, a4, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -506,6 +505,8 @@ S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) data->offset_float_max = (l ? data->last_start_max : data->pos_min + data->pos_delta); + if ((U32)data->offset_float_max > (U32)I32_MAX) + data->offset_float_max = I32_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -932,6 +933,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg ? I32_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + if (UTF) + SvUTF8_on(data->last_found); data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; @@ -1964,17 +1967,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) goto remove_float; /* As in (a)+. */ - r->float_substr = data.longest_float; + if (SvUTF8(data.longest_float)) { + r->float_utf8 = data.longest_float; + r->float_substr = Nullsv; + } else { + r->float_substr = data.longest_float; + r->float_utf8 = Nullsv; + } r->float_min_offset = data.offset_float_min; r->float_max_offset = data.offset_float_max; t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FL_BEFORE_MEOL) || (RExC_flags16 & PMf_MULTILINE))); - fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0); + fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); } else { remove_float: - r->float_substr = Nullsv; + r->float_substr = r->float_utf8 = Nullsv; SvREFCNT_dec(data.longest_float); longest_float_length = 0; } @@ -1986,22 +1995,29 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) || (RExC_flags16 & PMf_MULTILINE)))) { int t; - r->anchored_substr = data.longest_fixed; + if (SvUTF8(data.longest_fixed)) { + r->anchored_utf8 = data.longest_fixed; + r->anchored_substr = Nullsv; + } else { + r->anchored_substr = data.longest_fixed; + r->anchored_utf8 = Nullsv; + } r->anchored_offset = data.offset_fixed; t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) || (RExC_flags16 & PMf_MULTILINE))); - fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0); + fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); } else { - r->anchored_substr = Nullsv; + r->anchored_substr = r->anchored_utf8 = Nullsv; SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } if (r->regstclass && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) r->regstclass = NULL; - if ((!r->anchored_substr || r->anchored_offset) && stclass_flag + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { I32 n = add_data(pRExC_state, 1, "f"); @@ -2024,20 +2040,22 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ if (longest_fixed_length > longest_float_length) { r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; if (r->reganch & ROPT_ANCH_SINGLE) r->reganch |= ROPT_NOSCAN; } else { r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; r->check_offset_min = data.offset_float_min; r->check_offset_max = data.offset_float_max; } /* XXXX Currently intuiting is not compatible with ANCH_GPOS. This should be changed ASAP! */ - if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) { r->reganch |= RE_USE_INTUIT; - if (SvTAIL(r->check_substr)) + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->reganch |= RE_INTUIT_TAIL; } } @@ -2053,7 +2071,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) data.start_class = &ch_class; data.last_closep = &last_close; minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); - r->check_substr = r->anchored_substr = r->float_substr = Nullsv; + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = Nullsv; if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { I32 n = add_data(pRExC_state, 1, "f"); @@ -2953,8 +2972,11 @@ tryagain: } RExC_end++; } - else + else { RExC_end = RExC_parse + 2; + if (RExC_end > oldregxend) + RExC_end = oldregxend; + } RExC_parse--; ret = regclass(pRExC_state); @@ -3037,6 +3059,7 @@ tryagain: RExC_parse++; defchar: + ender = 0; ret = reg_node(pRExC_state, FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT); @@ -3182,6 +3205,8 @@ tryagain: if (len) p = oldp; else if (UTF) { + STRLEN unilen; + if (FOLD) { /* Emit all the Unicode characters. */ for (foldbuf = tmpbuf; @@ -3189,20 +3214,24 @@ tryagain: foldlen -= numlen) { ender = utf8_to_uvchr(foldbuf, &numlen); if (numlen > 0) { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; + reguni(pRExC_state, ender, s, &unilen); + s += unilen; + len += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ foldbuf += numlen; + if (numlen >= foldlen) + break; } else break; /* "Can't happen." */ } } else { - reguni(pRExC_state, ender, s, &numlen); - if (numlen > 0) { - s += numlen; - len += numlen; + reguni(pRExC_state, ender, s, &unilen); + if (unilen > 0) { + s += unilen; + len += unilen; } } } @@ -3213,6 +3242,8 @@ tryagain: break; } if (UTF) { + STRLEN unilen; + if (FOLD) { /* Emit all the Unicode characters. */ for (foldbuf = tmpbuf; @@ -3220,20 +3251,24 @@ tryagain: foldlen -= numlen) { ender = utf8_to_uvchr(foldbuf, &numlen); if (numlen > 0) { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; + reguni(pRExC_state, ender, s, &unilen); + len += unilen; + s += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ foldbuf += numlen; + if (numlen >= foldlen) + break; } else break; } } else { - reguni(pRExC_state, ender, s, &numlen); - if (numlen > 0) { - s += numlen; - len += numlen; + reguni(pRExC_state, ender, s, &unilen); + if (unilen > 0) { + s += unilen; + len += unilen; } } len--; @@ -3274,7 +3309,7 @@ tryagain: if (RExC_utf8) SvUTF8_on(sv); if (sv_utf8_downgrade(sv, TRUE)) { - char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); + char *s = sv_recode_to_utf8(sv, PL_encoding); STRLEN newlen = SvCUR(sv); if (!SIZE_ONLY) { @@ -3446,15 +3481,17 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && - POSIXCC(UCHARAT(RExC_parse))) { + if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) { char *s = RExC_parse; char c = *s++; while(*s && isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c); + if (ckWARN(WARN_REGEXP)) + vWARN3(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); /* [[=foo=]] and [[.foo.]] are still future. */ if (POSIXCC_NOTYET(c)) { @@ -3512,7 +3549,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue)) + if (!SIZE_ONLY && POSIXCC(nextvalue)) checkposixcc(pRExC_state); /* allow 1st char to be ] (allowing it to be - is dealt with later) */ @@ -3561,6 +3598,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) case 'D': namedclass = ANYOF_NDIGIT; break; case 'p': case 'P': + if (RExC_parse >= RExC_end) + vFAIL2("Empty \\%c{}", (U8)value); if (*RExC_parse == '{') { U8 c = (U8)value; e = strchr(RExC_parse++, '}'); @@ -4077,14 +4116,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) else if (prevnatvalue == natvalue) { Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); if (FOLD) { - U8 tmpbuf [UTF8_MAXLEN+1]; U8 foldbuf[UTF8_MAXLEN_FOLD+1]; STRLEN foldlen; - UV f; - - uvchr_to_utf8(tmpbuf, natvalue); - to_utf8_fold(tmpbuf, foldbuf, &foldlen); - f = UNI_TO_NATIVE(utf8_to_uvchr(foldbuf, 0)); + UV f = to_uni_fold(natvalue, foldbuf, &foldlen); /* If folding and foldable and a single * character, insert also the folded version @@ -4523,6 +4557,15 @@ Perl_regdump(pTHX_ regexp *r) PL_colors[1], SvTAIL(r->anchored_substr) ? "$" : "", (IV)r->anchored_offset); + else if (r->anchored_utf8) + PerlIO_printf(Perl_debug_log, + "anchored utf8 `%s%.*s%s'%s at %"IVdf" ", + PL_colors[0], + (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)), + SvPVX(r->anchored_utf8), + PL_colors[1], + SvTAIL(r->anchored_utf8) ? "$" : "", + (IV)r->anchored_offset); if (r->float_substr) PerlIO_printf(Perl_debug_log, "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", @@ -4532,15 +4575,25 @@ Perl_regdump(pTHX_ regexp *r) PL_colors[1], SvTAIL(r->float_substr) ? "$" : "", (IV)r->float_min_offset, (UV)r->float_max_offset); - if (r->check_substr) + else if (r->float_utf8) + PerlIO_printf(Perl_debug_log, + "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", + PL_colors[0], + (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)), + SvPVX(r->float_utf8), + PL_colors[1], + SvTAIL(r->float_utf8) ? "$" : "", + (IV)r->float_min_offset, (UV)r->float_max_offset); + if (r->check_substr || r->check_utf8) PerlIO_printf(Perl_debug_log, r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored"); if (r->reganch & ROPT_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->reganch & ROPT_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); - if (r->check_substr) + if (r->check_substr || r->check_utf8) PerlIO_printf(Perl_debug_log, ") "); if (r->regstclass) { @@ -4789,18 +4842,21 @@ Perl_re_intuit_string(pTHX_ regexp *prog) { /* Assume that RE_INTUIT is set */ DEBUG_r( { STRLEN n_a; - char *s = SvPV(prog->check_substr,n_a); + char *s = SvPV(prog->check_substr + ? prog->check_substr : prog->check_utf8, n_a); if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, - "%sUsing REx substr:%s `%s%.60s%s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], + "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], s, PL_colors[1], (strlen(s) > 60 ? "..." : "")); } ); - return prog->check_substr; + return prog->check_substr ? prog->check_substr : prog->check_utf8; } void @@ -4835,8 +4891,12 @@ Perl_pregfree(pTHX_ struct regexp *r) if (r->substrs) { if (r->anchored_substr) SvREFCNT_dec(r->anchored_substr); + if (r->anchored_utf8) + SvREFCNT_dec(r->anchored_utf8); if (r->float_substr) SvREFCNT_dec(r->float_substr); + if (r->float_utf8) + SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } if (r->data) {