X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=6726ba104c3b2be90e38e049d3391f5fec6bca0d;hb=24b5d5ccc3bc7535f387e48d7e29656751ae98a5;hp=3459e0a7ee91ff077c3ee1354b03d83efa6a055b;hpb=108481f44f73c82d1f291486a8da7654b4d58fe6;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 3459e0a..6726ba1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -69,7 +69,7 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-2001, Larry Wall + **** Copyright (c) 1991-2002, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -385,15 +385,14 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN(loc,m) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END \ #define vWARNdep(loc,m) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \ - Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END \ @@ -401,7 +400,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN2(loc, m, a1) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\ a1, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -409,7 +408,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN3(loc, m, a1, a2) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -417,7 +416,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN4(loc, m, a1, a2, a3) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\ a1, a2, a3, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -426,7 +425,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN5(loc, m, a1, a2, a3, a4) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, a4, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -506,6 +505,8 @@ S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) data->offset_float_max = (l ? data->last_start_max : data->pos_min + data->pos_delta); + if ((U32)data->offset_float_max > (U32)I32_MAX) + data->offset_float_max = I32_MAX; if (data->flags & SF_BEFORE_EOL) data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); @@ -736,6 +737,50 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg n = nnext; } } + + if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) { +/* + Two problematic code points in Unicode casefolding of EXACT nodes: + + U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + + which casefold to + + Unicode UTF-8 + + U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + + This means that in case-insensitive matching (or "loose matching", + as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte + length of the above casefolded versions) can match a target string + of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). + This would rather mess up the minimum length computation. + + What we'll do is to look for the tail four bytes, and then peek + at the preceding two bytes to see whether we need to decrease + the minimum length by four (six minus two). + + Thanks to the design of UTF-8, there cannot be false matches: + A sequence of valid UTF-8 bytes cannot be a subsequence of + another valid sequence of UTF-8 bytes. + +*/ + char *s0 = STRING(scan), *s, *t; + char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4; + char *t0 = "\xcc\x88\xcc\x81"; + char *t1 = t0 + 3; + + for (s = s0 + 2; + s < s2 && (t = ninstr(s, s1, t0, t1)); + s = t + 4) { + if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || + ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) + min -= 4; + } + } + #ifdef DEBUGGING /* Allow dumping */ n = scan + NODE_SZ_STR(scan); @@ -888,6 +933,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg ? I32_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + if (UTF) + SvUTF8_on(data->last_found); data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; @@ -1848,7 +1895,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) first = NEXTOPER(first); goto again; } - else if ((OP(first) == STAR && + else if (!sawopen && (OP(first) == STAR && PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) && !(r->reganch & ROPT_ANCH) ) { @@ -1920,17 +1967,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) goto remove_float; /* As in (a)+. */ - r->float_substr = data.longest_float; + if (SvUTF8(data.longest_float)) { + r->float_utf8 = data.longest_float; + r->float_substr = Nullsv; + } else { + r->float_substr = data.longest_float; + r->float_utf8 = Nullsv; + } r->float_min_offset = data.offset_float_min; r->float_max_offset = data.offset_float_max; t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FL_BEFORE_MEOL) || (RExC_flags16 & PMf_MULTILINE))); - fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0); + fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); } else { remove_float: - r->float_substr = Nullsv; + r->float_substr = r->float_utf8 = Nullsv; SvREFCNT_dec(data.longest_float); longest_float_length = 0; } @@ -1942,22 +1995,29 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) || (RExC_flags16 & PMf_MULTILINE)))) { int t; - r->anchored_substr = data.longest_fixed; + if (SvUTF8(data.longest_fixed)) { + r->anchored_utf8 = data.longest_fixed; + r->anchored_substr = Nullsv; + } else { + r->anchored_substr = data.longest_fixed; + r->anchored_utf8 = Nullsv; + } r->anchored_offset = data.offset_fixed; t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) || (RExC_flags16 & PMf_MULTILINE))); - fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0); + fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); } else { - r->anchored_substr = Nullsv; + r->anchored_substr = r->anchored_utf8 = Nullsv; SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } if (r->regstclass && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) r->regstclass = NULL; - if ((!r->anchored_substr || r->anchored_offset) && stclass_flag + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { I32 n = add_data(pRExC_state, 1, "f"); @@ -1980,20 +2040,22 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ if (longest_fixed_length > longest_float_length) { r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; if (r->reganch & ROPT_ANCH_SINGLE) r->reganch |= ROPT_NOSCAN; } else { r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; r->check_offset_min = data.offset_float_min; r->check_offset_max = data.offset_float_max; } /* XXXX Currently intuiting is not compatible with ANCH_GPOS. This should be changed ASAP! */ - if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) { r->reganch |= RE_USE_INTUIT; - if (SvTAIL(r->check_substr)) + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->reganch |= RE_INTUIT_TAIL; } } @@ -2009,7 +2071,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) data.start_class = &ch_class; data.last_closep = &last_close; minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); - r->check_substr = r->anchored_substr = r->float_substr = Nullsv; + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = Nullsv; if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { I32 n = add_data(pRExC_state, 1, "f"); @@ -2124,6 +2187,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* FALL THROUGH*/ case '?': /* (??...) */ logical = 1; + if (*RExC_parse != '{') + goto unknown; paren = *RExC_parse++; /* FALL THROUGH */ case '{': /* (?{...}) */ @@ -2361,9 +2426,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) } else if (paren != '?') /* Not Conditional */ ret = br; - if (flags&HASWIDTH) - *flagp |= HASWIDTH; - *flagp |= flags&SPSTART; + *flagp |= flags & (SPSTART | HASWIDTH); lastbr = br; while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -2909,8 +2972,11 @@ tryagain: } RExC_end++; } - else + else { RExC_end = RExC_parse + 2; + if (RExC_end > oldregxend) + RExC_end = oldregxend; + } RExC_parse--; ret = regclass(pRExC_state); @@ -2993,6 +3059,7 @@ tryagain: RExC_parse++; defchar: + ender = 0; ret = reg_node(pRExC_state, FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT); @@ -3138,22 +3205,34 @@ tryagain: if (len) p = oldp; else if (UTF) { + STRLEN unilen; + if (FOLD) { /* Emit all the Unicode characters. */ for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { ender = utf8_to_uvchr(foldbuf, &numlen); - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; - foldbuf += numlen; + if (numlen > 0) { + reguni(pRExC_state, ender, s, &unilen); + s += unilen; + len += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; /* "Can't happen." */ } } else { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; + reguni(pRExC_state, ender, s, &unilen); + if (unilen > 0) { + s += unilen; + len += unilen; + } } } else { @@ -3163,22 +3242,34 @@ tryagain: break; } if (UTF) { + STRLEN unilen; + if (FOLD) { /* Emit all the Unicode characters. */ for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { ender = utf8_to_uvchr(foldbuf, &numlen); - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; - foldbuf += numlen; + if (numlen > 0) { + reguni(pRExC_state, ender, s, &unilen); + len += unilen; + s += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; } } else { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; + reguni(pRExC_state, ender, s, &unilen); + if (unilen > 0) { + s += unilen; + len += unilen; + } } len--; } @@ -3218,7 +3309,7 @@ tryagain: if (RExC_utf8) SvUTF8_on(sv); if (sv_utf8_downgrade(sv, TRUE)) { - char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); + char *s = sv_recode_to_utf8(sv, PL_encoding); STRLEN newlen = SvCUR(sv); if (!SIZE_ONLY) { @@ -3390,15 +3481,17 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && - POSIXCC(UCHARAT(RExC_parse))) { + if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) { char *s = RExC_parse; char c = *s++; while(*s && isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c); + if (ckWARN(WARN_REGEXP)) + vWARN3(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); /* [[=foo=]] and [[.foo.]] are still future. */ if (POSIXCC_NOTYET(c)) { @@ -3456,11 +3549,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue)) + if (!SIZE_ONLY && POSIXCC(nextvalue)) checkposixcc(pRExC_state); - if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-') - goto charclassloop; /* allow 1st char to be ] or - */ + /* allow 1st char to be ] (allowing it to be - is dealt with later) */ + if (UCHARAT(RExC_parse) == ']') + goto charclassloop; while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { @@ -3504,6 +3598,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) case 'D': namedclass = ANYOF_NDIGIT; break; case 'p': case 'P': + if (RExC_parse >= RExC_end) + vFAIL2("Empty \\%c{}", (U8)value); if (*RExC_parse == '{') { U8 c = (U8)value; e = strchr(RExC_parse++, '}'); @@ -4009,22 +4105,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, i); } if (value > 255 || UTF) { + UV prevnatvalue = NATIVE_TO_UNI(prevvalue); + UV natvalue = NATIVE_TO_UNI(value); + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; - if (prevvalue < value) + if (prevnatvalue < natvalue) { /* what about > ? */ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)prevvalue, (UV)value); - else if (prevvalue == value) { - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)value); + prevnatvalue, natvalue); + } + else if (prevnatvalue == natvalue) { + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); if (FOLD) { - U8 tmpbuf [UTF8_MAXLEN+1]; U8 foldbuf[UTF8_MAXLEN_FOLD+1]; STRLEN foldlen; - UV f; - - uvchr_to_utf8(tmpbuf, value); - to_utf8_fold(tmpbuf, foldbuf, &foldlen); - f = utf8_to_uvchr(foldbuf, 0); + UV f = to_uni_fold(natvalue, foldbuf, &foldlen); /* If folding and foldable and a single * character, insert also the folded version @@ -4463,6 +4557,15 @@ Perl_regdump(pTHX_ regexp *r) PL_colors[1], SvTAIL(r->anchored_substr) ? "$" : "", (IV)r->anchored_offset); + else if (r->anchored_utf8) + PerlIO_printf(Perl_debug_log, + "anchored utf8 `%s%.*s%s'%s at %"IVdf" ", + PL_colors[0], + (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)), + SvPVX(r->anchored_utf8), + PL_colors[1], + SvTAIL(r->anchored_utf8) ? "$" : "", + (IV)r->anchored_offset); if (r->float_substr) PerlIO_printf(Perl_debug_log, "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", @@ -4472,15 +4575,25 @@ Perl_regdump(pTHX_ regexp *r) PL_colors[1], SvTAIL(r->float_substr) ? "$" : "", (IV)r->float_min_offset, (UV)r->float_max_offset); - if (r->check_substr) + else if (r->float_utf8) + PerlIO_printf(Perl_debug_log, + "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", + PL_colors[0], + (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)), + SvPVX(r->float_utf8), + PL_colors[1], + SvTAIL(r->float_utf8) ? "$" : "", + (IV)r->float_min_offset, (UV)r->float_max_offset); + if (r->check_substr || r->check_utf8) PerlIO_printf(Perl_debug_log, r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored"); if (r->reganch & ROPT_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->reganch & ROPT_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); - if (r->check_substr) + if (r->check_substr || r->check_utf8) PerlIO_printf(Perl_debug_log, ") "); if (r->regstclass) { @@ -4557,9 +4670,13 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) if (k == EXACT) { SV *dsv = sv_2mortal(newSVpvn("", 0)); - bool do_utf8 = DO_UTF8(sv); + /* Using is_utf8_string() is a crude hack but it may + * be the best for now since we have no flag "this EXACTish + * node was UTF-8" --jhi */ + bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o)); char *s = do_utf8 ? - pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) : + pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, + UNI_DISPLAY_REGEX) : STRING(o); int len = do_utf8 ? strlen(s) : @@ -4725,18 +4842,21 @@ Perl_re_intuit_string(pTHX_ regexp *prog) { /* Assume that RE_INTUIT is set */ DEBUG_r( { STRLEN n_a; - char *s = SvPV(prog->check_substr,n_a); + char *s = SvPV(prog->check_substr + ? prog->check_substr : prog->check_utf8, n_a); if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, - "%sUsing REx substr:%s `%s%.60s%s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], + "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], s, PL_colors[1], (strlen(s) > 60 ? "..." : "")); } ); - return prog->check_substr; + return prog->check_substr ? prog->check_substr : prog->check_utf8; } void @@ -4750,7 +4870,7 @@ Perl_pregfree(pTHX_ struct regexp *r) return; DEBUG_r({ char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, - UNI_DISPLAY_ISPRINT); + UNI_DISPLAY_REGEX); int len = SvCUR(dsv); if (!PL_colorset) reginitcolors(); @@ -4771,8 +4891,12 @@ Perl_pregfree(pTHX_ struct regexp *r) if (r->substrs) { if (r->anchored_substr) SvREFCNT_dec(r->anchored_substr); + if (r->anchored_utf8) + SvREFCNT_dec(r->anchored_utf8); if (r->float_substr) SvREFCNT_dec(r->float_substr); + if (r->float_utf8) + SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } if (r->data) {