X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=639f1405824a1e3c19eb19c092e25be5cec3be3a;hb=7ddb28a3f19d8cce47422d3e3553bf3197f83187;hp=c73e8157500415614113b688492576865227528a;hpb=f8bef55053427841654eec6e37b8f0f4224b2976;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index c73e815..639f140 100644 --- a/regcomp.c +++ b/regcomp.c @@ -69,7 +69,7 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-2001, Larry Wall + **** Copyright (c) 1991-2002, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -385,15 +385,14 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN(loc,m) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END \ #define vWARNdep(loc,m) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \ - Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END \ @@ -401,7 +400,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN2(loc, m, a1) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\ a1, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -409,7 +408,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN3(loc, m, a1, a2) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -417,7 +416,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN4(loc, m, a1, a2, a3) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\ a1, a2, a3, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -426,7 +425,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN5(loc, m, a1, a2, a3, a4) \ STMT_START { \ IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, a4, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -658,6 +657,17 @@ S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, str } } +/* + * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. + * These need to be revisited when a newer toolchain becomes available. + */ +#if defined(__sparc64__) && defined(__GNUC__) +# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) +# undef SPARC64_GCC_WORKAROUND +# define SPARC64_GCC_WORKAROUND 1 +# endif +#endif + /* REx optimizer. Converts nodes into quickier variants "in place". Finds fixed substrings. */ @@ -725,6 +735,50 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg n = nnext; } } + + if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) { +/* + Two problematic code points in Unicode casefolding of EXACT nodes: + + U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + + which casefold to + + Unicode UTF-8 + + U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + + This means that in case-insensitive matching (or "loose matching", + as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte + length of the above casefolded versions) can match a target string + of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). + This would rather mess up the minimum length computation. + + What we'll do is to look for the tail four bytes, and then peek + at the preceding two bytes to see whether we need to decrease + the minimum length by four (six minus two). + + Thanks to the design of UTF-8, there cannot be false matches: + A sequence of valid UTF-8 bytes cannot be a subsequence of + another valid sequence of UTF-8 bytes. + +*/ + char *s0 = STRING(scan), *s, *t; + char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4; + char *t0 = "\xcc\x88\xcc\x81"; + char *t1 = t0 + 3; + + for (s = s0 + 2; + s < s2 && (t = ninstr(s, s1, t0, t1)); + s = t + 4) { + if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || + ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) + min -= 4; + } + } + #ifdef DEBUGGING /* Allow dumping */ n = scan + NODE_SZ_STR(scan); @@ -877,6 +931,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg ? I32_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); + if (UTF) + SvUTF8_on(data->last_found); data->last_end = data->pos_min + l; data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; @@ -1203,11 +1259,28 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg int counted = mincount != 0; if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ +#if defined(SPARC64_GCC_WORKAROUND) + I32 b = 0; + STRLEN l = 0; + char *s = NULL; + I32 old = 0; + + if (pos_before >= data->last_start_min) + b = pos_before; + else + b = data->last_start_min; + + l = 0; + s = SvPV(data->last_found, l); + old = b - data->last_start_min; + +#else I32 b = pos_before >= data->last_start_min ? pos_before : data->last_start_min; STRLEN l; char *s = SvPV(data->last_found, l); I32 old = b - data->last_start_min; +#endif if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; @@ -1662,17 +1735,15 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - /* XXXX This looks very suspicious... */ - if (pm->op_pmdynflags & PMdf_CMP_UTF8) - RExC_utf8 = 1; - else - RExC_utf8 = 0; + RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; RExC_precomp = exp; - DEBUG_r(if (!PL_colorset) reginitcolors()); - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], - (int)(xend - exp), RExC_precomp, PL_colors[1])); + DEBUG_r({ + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], + (int)(xend - exp), RExC_precomp, PL_colors[1]); + }); RExC_flags16 = pm->op_pmflags; RExC_sawback = 0; @@ -1822,7 +1893,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) first = NEXTOPER(first); goto again; } - else if ((OP(first) == STAR && + else if (!sawopen && (OP(first) == STAR && PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) && !(r->reganch & ROPT_ANCH) ) { @@ -1894,17 +1965,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) goto remove_float; /* As in (a)+. */ - r->float_substr = data.longest_float; + if (SvUTF8(data.longest_float)) { + r->float_utf8 = data.longest_float; + r->float_substr = Nullsv; + } else { + r->float_substr = data.longest_float; + r->float_utf8 = Nullsv; + } r->float_min_offset = data.offset_float_min; r->float_max_offset = data.offset_float_max; t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FL_BEFORE_MEOL) || (RExC_flags16 & PMf_MULTILINE))); - fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0); + fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); } else { remove_float: - r->float_substr = Nullsv; + r->float_substr = r->float_utf8 = Nullsv; SvREFCNT_dec(data.longest_float); longest_float_length = 0; } @@ -1916,22 +1993,29 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) || (RExC_flags16 & PMf_MULTILINE)))) { int t; - r->anchored_substr = data.longest_fixed; + if (SvUTF8(data.longest_fixed)) { + r->anchored_utf8 = data.longest_fixed; + r->anchored_substr = Nullsv; + } else { + r->anchored_substr = data.longest_fixed; + r->anchored_utf8 = Nullsv; + } r->anchored_offset = data.offset_fixed; t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) || (RExC_flags16 & PMf_MULTILINE))); - fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0); + fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); } else { - r->anchored_substr = Nullsv; + r->anchored_substr = r->anchored_utf8 = Nullsv; SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } if (r->regstclass && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) r->regstclass = NULL; - if ((!r->anchored_substr || r->anchored_offset) && stclass_flag + if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) + && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { I32 n = add_data(pRExC_state, 1, "f"); @@ -1954,20 +2038,22 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ if (longest_fixed_length > longest_float_length) { r->check_substr = r->anchored_substr; + r->check_utf8 = r->anchored_utf8; r->check_offset_min = r->check_offset_max = r->anchored_offset; if (r->reganch & ROPT_ANCH_SINGLE) r->reganch |= ROPT_NOSCAN; } else { r->check_substr = r->float_substr; + r->check_utf8 = r->float_utf8; r->check_offset_min = data.offset_float_min; r->check_offset_max = data.offset_float_max; } /* XXXX Currently intuiting is not compatible with ANCH_GPOS. This should be changed ASAP! */ - if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) { + if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) { r->reganch |= RE_USE_INTUIT; - if (SvTAIL(r->check_substr)) + if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8)) r->reganch |= RE_INTUIT_TAIL; } } @@ -1983,7 +2069,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) data.start_class = &ch_class; data.last_closep = &last_close; minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); - r->check_substr = r->anchored_substr = r->float_substr = Nullsv; + r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 + = r->float_substr = r->float_utf8 = Nullsv; if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { I32 n = add_data(pRExC_state, 1, "f"); @@ -2098,6 +2185,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* FALL THROUGH*/ case '?': /* (??...) */ logical = 1; + if (*RExC_parse != '{') + goto unknown; paren = *RExC_parse++; /* FALL THROUGH */ case '{': /* (?{...}) */ @@ -2335,9 +2424,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) } else if (paren != '?') /* Not Conditional */ ret = br; - if (flags&HASWIDTH) - *flagp |= HASWIDTH; - *flagp |= flags&SPSTART; + *flagp |= flags & (SPSTART | HASWIDTH); lastbr = br; while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -2959,14 +3046,15 @@ tryagain: register char *p; char *oldp, *s; STRLEN numlen; - STRLEN ulen; - U8 tmpbuf[UTF8_MAXLEN*2+1]; + STRLEN foldlen; + U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf; parse_start = RExC_parse - 1; RExC_parse++; defchar: + ender = 0; ret = reg_node(pRExC_state, FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT); @@ -2991,6 +3079,8 @@ tryagain: case '\\': switch (*++p) { case 'A': + case 'C': + case 'X': case 'G': case 'Z': case 'z': @@ -3103,16 +3193,42 @@ tryagain: if (RExC_flags16 & PMf_EXTENDED) p = regwhite(p, RExC_end); if (UTF && FOLD) { - toLOWER_uni(ender, tmpbuf, &ulen); - ender = utf8_to_uvchr(tmpbuf, 0); + /* Prime the casefolded buffer. */ + ender = toFOLD_uni(ender, tmpbuf, &foldlen); } if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; + else if (UTF) { + STRLEN unilen; + + if (FOLD) { + /* Emit all the Unicode characters. */ + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) { + ender = utf8_to_uvchr(foldbuf, &numlen); + if (numlen > 0) { + reguni(pRExC_state, ender, s, &unilen); + s += unilen; + len += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; /* "Can't happen." */ + } + } + else { + reguni(pRExC_state, ender, s, &unilen); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } } else { len++; @@ -3120,10 +3236,37 @@ tryagain: } break; } - if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen - 1; + if (UTF) { + STRLEN unilen; + + if (FOLD) { + /* Emit all the Unicode characters. */ + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) { + ender = utf8_to_uvchr(foldbuf, &numlen); + if (numlen > 0) { + reguni(pRExC_state, ender, s, &unilen); + len += unilen; + s += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; + } + } + else { + reguni(pRExC_state, ender, s, &unilen); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } + len--; } else REGC(ender, s++); @@ -3152,20 +3295,28 @@ tryagain: break; } - if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) { + /* If the encoding pragma is in effect recode the text of + * any EXACT-kind nodes. */ + if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) { STRLEN oldlen = STR_LEN(ret); SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); - char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); - STRLEN newlen = SvCUR(sv); - if (!SIZE_ONLY) { - DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", - oldlen, STRING(ret), newlen, s)); - Copy(s, STRING(ret), newlen, char); - STR_LEN(ret) += newlen - oldlen; - RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); - } else - RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); - RExC_utf8 = 1; + + if (RExC_utf8) + SvUTF8_on(sv); + if (sv_utf8_downgrade(sv, TRUE)) { + char *s = sv_recode_to_utf8(sv, PL_encoding); + STRLEN newlen = SvCUR(sv); + + if (!SIZE_ONLY) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", + (int)oldlen, STRING(ret), + (int)newlen, s)); + Copy(s, STRING(ret), newlen, char); + STR_LEN(ret) += newlen - oldlen; + RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); + } else + RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); + } } return(ret); @@ -3325,8 +3476,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && - POSIXCC(UCHARAT(RExC_parse))) { + if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) { char *s = RExC_parse; char c = *s++; @@ -3362,7 +3512,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) SV *listsv = Nullsv; register char *e; UV n; - bool optimize_invert = TRUE; + bool optimize_invert = TRUE; + AV* unicode_alternate = 0; ret = reganode(pRExC_state, ANYOF, 0); @@ -3390,11 +3541,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue)) + if (!SIZE_ONLY && POSIXCC(nextvalue)) checkposixcc(pRExC_state); - if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-') - goto charclassloop; /* allow 1st char to be ] or - */ + /* allow 1st char to be ] (allowing it to be - is dealt with later) */ + if (UCHARAT(RExC_parse) == ']') + goto charclassloop; while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { @@ -3939,17 +4091,71 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } else #endif - for (i = prevvalue; i <= ceilvalue; i++) - ANYOF_BITMAP_SET(ret, i); + for (i = prevvalue; i <= ceilvalue; i++) + ANYOF_BITMAP_SET(ret, i); } - if (value > 255) { + if (value > 255 || UTF) { + UV prevnatvalue = NATIVE_TO_UNI(prevvalue); + UV natvalue = NATIVE_TO_UNI(value); + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; - if (prevvalue < value) + if (prevnatvalue < natvalue) { /* what about > ? */ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)prevvalue, (UV)value); - else if (prevvalue == value) - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)value); + prevnatvalue, natvalue); + } + else if (prevnatvalue == natvalue) { + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); + if (FOLD) { + U8 foldbuf[UTF8_MAXLEN_FOLD+1]; + STRLEN foldlen; + UV f = to_uni_fold(natvalue, foldbuf, &foldlen); + + /* If folding and foldable and a single + * character, insert also the folded version + * to the charclass. */ + if (f != value) { + if (foldlen == UNISKIP(f)) + Perl_sv_catpvf(aTHX_ listsv, + "%04"UVxf"\n", f); + else { + /* Any multicharacter foldings + * require the following transform: + * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) + * where E folds into "pq" and F folds + * into "rst", all other characters + * fold to single characters. We save + * away these multicharacter foldings, + * to be later saved as part of the + * additional "s" data. */ + SV *sv; + + if (!unicode_alternate) + unicode_alternate = newAV(); + sv = newSVpvn((char*)foldbuf, foldlen); + SvUTF8_on(sv); + av_push(unicode_alternate, sv); + } + } + + /* If folding and the value is one of the Greek + * sigmas insert a few more sigmas to make the + * folding rules of the sigmas to work right. + * Note that not all the possible combinations + * are handled here: some of them are handled + * by the standard folding rules, and some of + * them (literal or EXACTF cases) are handled + * during runtime in regexec.c:S_find_byclass(). */ + if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", + (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", + (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); + } + else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", + (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); + } + } } } @@ -3993,8 +4199,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) AV *av = newAV(); SV *rv; + /* The 0th element stores the character class description + * in its textual form: used later (regexec.c:Perl_regclass_swatch()) + * to initialize the appropriate swash (which gets stored in + * the 1st element), and also useful for dumping the regnode. + * The 2nd element stores the multicharacter foldings, + * used later (regexec.c:s_reginclasslen()). */ av_store(av, 0, listsv); av_store(av, 1, NULL); + av_store(av, 2, (SV*)unicode_alternate); rv = newRV_noinc((SV*)av); n = add_data(pRExC_state, 1, "s"); RExC_rx->data->data[n] = (void*)rv; @@ -4334,6 +4547,15 @@ Perl_regdump(pTHX_ regexp *r) PL_colors[1], SvTAIL(r->anchored_substr) ? "$" : "", (IV)r->anchored_offset); + else if (r->anchored_utf8) + PerlIO_printf(Perl_debug_log, + "anchored utf8 `%s%.*s%s'%s at %"IVdf" ", + PL_colors[0], + (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)), + SvPVX(r->anchored_utf8), + PL_colors[1], + SvTAIL(r->anchored_utf8) ? "$" : "", + (IV)r->anchored_offset); if (r->float_substr) PerlIO_printf(Perl_debug_log, "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", @@ -4343,15 +4565,25 @@ Perl_regdump(pTHX_ regexp *r) PL_colors[1], SvTAIL(r->float_substr) ? "$" : "", (IV)r->float_min_offset, (UV)r->float_max_offset); - if (r->check_substr) + else if (r->float_utf8) + PerlIO_printf(Perl_debug_log, + "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", + PL_colors[0], + (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)), + SvPVX(r->float_utf8), + PL_colors[1], + SvTAIL(r->float_utf8) ? "$" : "", + (IV)r->float_min_offset, (UV)r->float_max_offset); + if (r->check_substr || r->check_utf8) PerlIO_printf(Perl_debug_log, r->check_substr == r->float_substr + && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored"); if (r->reganch & ROPT_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); if (r->reganch & ROPT_CHECK_ALL) PerlIO_printf(Perl_debug_log, " isall"); - if (r->check_substr) + if (r->check_substr || r->check_utf8) PerlIO_printf(Perl_debug_log, ") "); if (r->regstclass) { @@ -4428,9 +4660,13 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) if (k == EXACT) { SV *dsv = sv_2mortal(newSVpvn("", 0)); - bool do_utf8 = DO_UTF8(sv); + /* Using is_utf8_string() is a crude hack but it may + * be the best for now since we have no flag "this EXACTish + * node was UTF-8" --jhi */ + bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o)); char *s = do_utf8 ? - pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) : + pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, + UNI_DISPLAY_REGEX) : STRING(o); int len = do_utf8 ? strlen(s) : @@ -4524,7 +4760,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) { SV *lv; - SV *sw = regclass_swash(o, FALSE, &lv); + SV *sw = regclass_swash(o, FALSE, &lv, 0); if (lv) { if (sw) { @@ -4596,33 +4832,45 @@ Perl_re_intuit_string(pTHX_ regexp *prog) { /* Assume that RE_INTUIT is set */ DEBUG_r( { STRLEN n_a; - char *s = SvPV(prog->check_substr,n_a); + char *s = SvPV(prog->check_substr + ? prog->check_substr : prog->check_utf8, n_a); if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, - "%sUsing REx substr:%s `%s%.60s%s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], + "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n", + PL_colors[4], + prog->check_substr ? "" : "utf8 ", + PL_colors[5],PL_colors[0], s, PL_colors[1], (strlen(s) > 60 ? "..." : "")); } ); - return prog->check_substr; + return prog->check_substr ? prog->check_substr : prog->check_utf8; } void Perl_pregfree(pTHX_ struct regexp *r) { - DEBUG_r(if (!PL_colorset) reginitcolors()); +#ifdef DEBUGGING + SV *dsv = PERL_DEBUG_PAD_ZERO(0); +#endif if (!r || (--r->refcnt > 0)) return; - DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sFreeing REx:%s `%s%.60s%s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], - r->precomp, - PL_colors[1], - (strlen(r->precomp) > 60 ? "..." : ""))); + DEBUG_r({ + char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, + UNI_DISPLAY_REGEX); + int len = SvCUR(dsv); + if (!PL_colorset) + reginitcolors(); + PerlIO_printf(Perl_debug_log, + "%sFreeing REx:%s `%s%*.*s%s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], + len, len, s, + PL_colors[1], + len > 60 ? "..." : ""); + }); if (r->precomp) Safefree(r->precomp); @@ -4633,8 +4881,12 @@ Perl_pregfree(pTHX_ struct regexp *r) if (r->substrs) { if (r->anchored_substr) SvREFCNT_dec(r->anchored_substr); + if (r->anchored_utf8) + SvREFCNT_dec(r->anchored_utf8); if (r->float_substr) SvREFCNT_dec(r->float_substr); + if (r->float_utf8) + SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } if (r->data) { @@ -4678,7 +4930,7 @@ Perl_pregfree(pTHX_ struct regexp *r) new_comppad = NULL; break; case 'n': - break; + break; default: Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]); }