X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=89b3e53babf8e82478dc2c5743a720dbb8f477d1;hb=be26652545762cccb4c0118f022cf9d0ec20cf93;hp=83438e90a0cbd151621f1874d66a5b85a527432c;hpb=1d7c184104c076988718a01b77c8706aae05b092;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 83438e9..89b3e53 100644 --- a/regcomp.c +++ b/regcomp.c @@ -69,7 +69,7 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-1999, Larry Wall + **** Copyright (c) 1991-2000, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -277,6 +277,7 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) STATIC void S_cl_init(pTHX_ struct regnode_charclass_class *cl) { + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(cl); } @@ -284,10 +285,9 @@ S_cl_init(pTHX_ struct regnode_charclass_class *cl) STATIC void S_cl_init_zero(pTHX_ struct regnode_charclass_class *cl) { + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(cl); - ANYOF_CLASS_ZERO(cl); - ANYOF_BITMAP_ZERO(cl); if (LOC) cl->flags |= ANYOF_LOCALE; } @@ -298,8 +298,6 @@ STATIC void S_cl_and(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class *and_with) { - int value; - if (!(and_with->flags & ANYOF_CLASS) && !(cl->flags & ANYOF_CLASS) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) @@ -323,8 +321,6 @@ S_cl_and(pTHX_ struct regnode_charclass_class *cl, STATIC void S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with) { - int value; - if (or_with->flags & ANYOF_INVERT) { /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) @@ -545,9 +541,21 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } } else if (flags & SCF_DO_STCLASS_AND) { - cl_and(data->start_class, &accum); - if (min1) + if (min1) { + cl_and(data->start_class, &accum); flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + StructCopy(data->start_class, &and_with, + struct regnode_charclass_class); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, + struct regnode_charclass_class); + flags |= SCF_DO_STCLASS_OR; + data->start_class->flags |= ANYOF_EOS; + } } } else if (code == BRANCHJ) /* single branch is optimized. */ @@ -590,7 +598,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) && (!(data->start_class->flags & ANYOF_FOLD) || !ANYOF_BITMAP_TEST(data->start_class, - PL_fold[*STRING(scan)]))) + PL_fold[*(U8*)STRING(scan)]))) compat = 0; ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); @@ -632,7 +640,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) && !ANYOF_BITMAP_TEST(data->start_class, - PL_fold[*STRING(scan)])) + PL_fold[*(U8*)STRING(scan)])) compat = 0; ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); @@ -753,15 +761,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } if (!scan) /* It was not CURLYX, but CURLY. */ scan = next; - if (ckWARN(WARN_UNSAFE) && (minnext + deltanext == 0) + if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0) && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ - Perl_warner(aTHX_ WARN_UNSAFE, + Perl_warner(aTHX_ WARN_REGEXP, "Strange *+?{} on zero-length expression"); min += minnext * mincount; - is_inf_internal |= (maxcount == REG_INFTY - && (minnext + deltanext) > 0 - || deltanext == I32_MAX); + is_inf_internal |= ((maxcount == REG_INFTY + && (minnext + deltanext) > 0) + || deltanext == I32_MAX); is_inf |= is_inf_internal; delta += (minnext + deltanext) * maxcount - minnext * mincount; @@ -1214,7 +1222,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (data) data->flags |= SF_HAS_EVAL; } - else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded */ + else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ if (flags & SCF_DO_SUBSTR) { scan_commit(data); data->longest = &(data->longest_float); @@ -1222,6 +1230,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ cl_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; } /* Else: zero-length, ignore. */ scan = regnext(scan); @@ -1314,9 +1323,6 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) dTHR; register regexp *r; regnode *scan; - SV **longest; - SV *longest_fixed; - SV *longest_float; regnode *first; I32 flags; I32 minlen = 0; @@ -1327,8 +1333,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - if (PL_curcop == &PL_compiling ? (PL_hints & HINT_UTF8) : IN_UTF8) + if (pm->op_pmdynflags & PMdf_UTF8) { PL_reg_flags |= RF_utf8; + } else PL_reg_flags = 0; @@ -1353,7 +1360,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PL_regsize = 0L; PL_regcode = &PL_regdummy; PL_reg_whilem_seen = 0; +#if 0 /* REGC() is (currently) a NOP at the first pass. + * Clever compilers notice this and complain. --jhi */ REGC((U8)REG_MAGIC, (char*)PL_regcode); +#endif if (reg(0, &flags) == NULL) { Safefree(PL_regprecomp); PL_regprecomp = Nullch; @@ -1375,6 +1385,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) char, regexp); if (r == NULL) FAIL("regexp out of space"); +#ifdef DEBUGGING + /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ + Zero(r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), char); +#endif r->refcnt = 1; r->prelen = xend - exp; r->precomp = PL_regprecomp; @@ -1590,7 +1604,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ DEBUG_r((sv = sv_newmortal(), regprop(sv, (regnode*)data.start_class), - PerlIO_printf(Perl_debug_log, "synthetic stclass.\n", + PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", SvPVX(sv)))); } @@ -1639,7 +1653,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ DEBUG_r((sv = sv_newmortal(), regprop(sv, (regnode*)data.start_class), - PerlIO_printf(Perl_debug_log, "synthetic stclass.\n", + PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", SvPVX(sv)))); } } @@ -1718,6 +1732,11 @@ S_reg(pTHX_ I32 paren, I32 *flagp) *flagp = TRYAGAIN; return NULL; case 'p': + if (SIZE_ONLY) + Perl_warner(aTHX_ WARN_REGEXP, + "(?p{}) is deprecated - use (??{})"); + /* FALL THROUGH*/ + case '?': logical = 1; paren = *PL_regcomp_parse++; /* FALL THROUGH */ @@ -2189,8 +2208,8 @@ S_regpiece(pTHX_ I32 *flagp) goto do_curly; } nest_check: - if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { - Perl_warner(aTHX_ WARN_UNSAFE, "%.*s matches null string many times", + if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { + Perl_warner(aTHX_ WARN_REGEXP, "%.*s matches null string many times", PL_regcomp_parse - origparse, origparse); } @@ -2277,8 +2296,14 @@ tryagain: nextchar(); ret = reg(1, &flags); if (ret == NULL) { - if (flags & TRYAGAIN) + if (flags & TRYAGAIN) { + if (PL_regcomp_parse == PL_regxend) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } goto tryagain; + } return(NULL); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); @@ -2558,11 +2583,19 @@ tryagain: p++; break; case 'e': - ender = '\033'; +#ifdef ASCIIish + ender = '\033'; +#else + ender = '\047'; +#endif p++; break; case 'a': - ender = '\007'; +#ifdef ASCIIish + ender = '\007'; +#else + ender = '\057'; +#endif p++; break; case 'x': @@ -2572,8 +2605,10 @@ tryagain: if (!e) FAIL("Missing right brace on \\x{}"); else if (UTF) { - ender = (UV)scan_hex(p + 1, e - p, &numlen); - if (numlen + len >= 127) { /* numlen is generous */ + numlen = 1; /* allow underscores */ + ender = (UV)scan_hex(p + 1, e - p - 1, &numlen); + /* numlen is generous */ + if (numlen + len >= 127) { p--; goto loopdone; } @@ -2583,6 +2618,7 @@ tryagain: FAIL("Can't use \\x{} without 'use utf8' declaration"); } else { + numlen = 0; /* disallow underscores */ ender = (UV)scan_hex(p, 2, &numlen); p += numlen; } @@ -2596,6 +2632,7 @@ tryagain: case '5': case '6': case '7': case '8':case '9': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) { + numlen = 0; /* disallow underscores */ ender = (UV)scan_oct(p, 3, &numlen); p += numlen; } @@ -2609,8 +2646,8 @@ tryagain: FAIL("trailing \\ in regexp"); /* FALL THROUGH */ default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(*p)) - Perl_warner(aTHX_ WARN_UNSAFE, + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) + Perl_warner(aTHX_ WARN_REGEXP, "/%.127s/: Unrecognized escape \\%c passed through", PL_regprecomp, *p); @@ -2795,15 +2832,15 @@ S_regpposixcc(pTHX_ I32 value) } break; } - if ((namedclass == OOB_NAMEDCLASS || - !(posixcc + skip + 2 < PL_regxend && - (posixcc[skip] == ':' && - posixcc[skip + 1] == ']')))) - Perl_croak(aTHX_ "Character class [:%.*s:] unknown", - t - s - 1, s + 1); - } else if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY) + if (namedclass == OOB_NAMEDCLASS || + posixcc[skip] != ':' || + posixcc[skip+1] != ']') + Perl_croak(aTHX_ + "Character class [:%.*s:] unknown", + t - s - 1, s + 1); + } else if (ckWARN(WARN_REGEXP) && !SIZE_ONLY) /* [[=foo=]] and [[.foo.]] are still future. */ - Perl_warner(aTHX_ WARN_UNSAFE, + Perl_warner(aTHX_ WARN_REGEXP, "Character class syntax [%c %c] is reserved for future extensions", c, c); } else { /* Maternal grandfather: @@ -2819,7 +2856,7 @@ S_regpposixcc(pTHX_ I32 value) STATIC void S_checkposixcc(pTHX) { - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && (*PL_regcomp_parse == ':' || *PL_regcomp_parse == '=' || *PL_regcomp_parse == '.')) { @@ -2829,10 +2866,10 @@ S_checkposixcc(pTHX) while(*s && isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - Perl_warner(aTHX_ WARN_UNSAFE, + Perl_warner(aTHX_ WARN_REGEXP, "Character class syntax [%c %c] belongs inside character classes", c, c); if (c == '=' || c == '.') - Perl_warner(aTHX_ WARN_UNSAFE, + Perl_warner(aTHX_ WARN_REGEXP, "Character class syntax [%c %c] is reserved for future extensions", c, c); } } @@ -2842,11 +2879,10 @@ STATIC regnode * S_regclass(pTHX) { dTHR; - register UV value; + register U32 value; register I32 lastvalue = OOB_CHAR8; register I32 range = 0; register regnode *ret; - register I32 def; I32 numlen; I32 namedclass; char *rangebegin; @@ -2871,7 +2907,7 @@ S_regclass(pTHX) ANYOF_FLAGS(ret) |= ANYOF_INVERT; } - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE)) + if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) checkposixcc(); if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') @@ -2886,6 +2922,8 @@ S_regclass(pTHX) namedclass = regpposixcc(value); else if (value == '\\') { value = UCHARAT(PL_regcomp_parse++); + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. --jhi */ switch (value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; @@ -2898,9 +2936,15 @@ S_regclass(pTHX) case 't': value = '\t'; break; case 'f': value = '\f'; break; case 'b': value = '\b'; break; +#ifdef ASCIIish case 'e': value = '\033'; break; case 'a': value = '\007'; break; +#else + case 'e': value = '\047'; break; + case 'a': value = '\057'; break; +#endif case 'x': + numlen = 0; /* disallow underscores */ value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); PL_regcomp_parse += numlen; break; @@ -2910,15 +2954,16 @@ S_regclass(pTHX) break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + numlen = 0; /* disallow underscores */ value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); PL_regcomp_parse += numlen; break; default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(value)) - Perl_warner(aTHX_ WARN_UNSAFE, + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) + Perl_warner(aTHX_ WARN_REGEXP, "/%.127s/: Unrecognized escape \\%c in character class passed through", PL_regprecomp, - value); + (int)value); break; } } @@ -2928,8 +2973,8 @@ S_regclass(pTHX) need_class = 1; if (range) { /* a-\d, a-[:digit:] */ if (!SIZE_ONLY) { - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, + if (ckWARN(WARN_REGEXP)) + Perl_warner(aTHX_ WARN_REGEXP, "/%.127s/: false [] range \"%*.*s\" in regexp", PL_regprecomp, PL_regcomp_parse - rangebegin, @@ -3213,8 +3258,8 @@ S_regclass(pTHX) PL_regcomp_parse[1] != ']') { PL_regcomp_parse++; if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, + if (ckWARN(WARN_REGEXP)) + Perl_warner(aTHX_ WARN_REGEXP, "/%.127s/: false [] range \"%*.*s\" in regexp", PL_regprecomp, PL_regcomp_parse - rangebegin, @@ -3282,7 +3327,7 @@ S_regclassutf8(pTHX) { dTHR; register char *e; - register UV value; + register U32 value; register U32 lastvalue = OOB_UTF8; register I32 range = 0; register regnode *ret; @@ -3307,7 +3352,7 @@ S_regclassutf8(pTHX) listsv = newSVpvn("# comment\n",10); } - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE)) + if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) checkposixcc(); if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') @@ -3323,8 +3368,11 @@ S_regclassutf8(pTHX) if (value == '[') namedclass = regpposixcc(value); else if (value == '\\') { - value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); + value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen); PL_regcomp_parse += numlen; + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. --jhi */ switch (value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; @@ -3347,10 +3395,10 @@ S_regclassutf8(pTHX) if (!SIZE_ONLY) { if (value == 'p') Perl_sv_catpvf(aTHX_ listsv, - "+utf8::%.*s\n", n, PL_regcomp_parse); + "+utf8::%.*s\n", (int)n, PL_regcomp_parse); else Perl_sv_catpvf(aTHX_ listsv, - "!utf8::%.*s\n", n, PL_regcomp_parse); + "!utf8::%.*s\n", (int)n, PL_regcomp_parse); } PL_regcomp_parse = e + 1; lastvalue = OOB_UTF8; @@ -3360,19 +3408,26 @@ S_regclassutf8(pTHX) case 't': value = '\t'; break; case 'f': value = '\f'; break; case 'b': value = '\b'; break; +#ifdef ASCIIish case 'e': value = '\033'; break; case 'a': value = '\007'; break; +#else + case 'e': value = '\047'; break; + case 'a': value = '\057'; break; +#endif case 'x': if (*PL_regcomp_parse == '{') { e = strchr(PL_regcomp_parse++, '}'); if (!e) FAIL("Missing right brace on \\x{}"); + numlen = 1; /* allow underscores */ value = (UV)scan_hex(PL_regcomp_parse, e - PL_regcomp_parse, &numlen); PL_regcomp_parse = e + 1; } else { + numlen = 0; /* disallow underscores */ value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); PL_regcomp_parse += numlen; } @@ -3383,23 +3438,24 @@ S_regclassutf8(pTHX) break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + numlen = 0; /* disallow underscores */ value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); PL_regcomp_parse += numlen; break; default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(value)) - Perl_warner(aTHX_ WARN_UNSAFE, + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) + Perl_warner(aTHX_ WARN_REGEXP, "/%.127s/: Unrecognized escape \\%c in character class passed through", PL_regprecomp, - value); + (int)value); break; } } if (namedclass > OOB_NAMEDCLASS) { if (range) { /* a-\d, a-[:digit:] */ if (!SIZE_ONLY) { - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, + if (ckWARN(WARN_REGEXP)) + Perl_warner(aTHX_ WARN_REGEXP, "/%.127s/: false [] range \"%*.*s\" in regexp", PL_regprecomp, PL_regcomp_parse - rangebegin, @@ -3486,8 +3542,8 @@ S_regclassutf8(pTHX) PL_regcomp_parse[1] != ']') { PL_regcomp_parse++; if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, + if (ckWARN(WARN_REGEXP)) + Perl_warner(aTHX_ WARN_REGEXP, "/%.127s/: false [] range \"%*.*s\" in regexp", PL_regprecomp, PL_regcomp_parse - rangebegin, @@ -3611,7 +3667,7 @@ S_reguni(pTHX_ UV uv, char* s, I32* lenp) { dTHR; if (SIZE_ONLY) { - U8 tmpbuf[10]; + U8 tmpbuf[UTF8_MAXLEN]; *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf; } else @@ -3661,7 +3717,6 @@ S_regtail(pTHX_ regnode *p, regnode *val) dTHR; register regnode *scan; register regnode *temp; - register I32 offset; if (SIZE_ONLY) return; @@ -3730,7 +3785,7 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) { #ifdef DEBUGGING register U8 op = EXACT; /* Arbitrary non-END op. */ - register regnode *next, *onode; + register regnode *next; while (op != END && (!last || node < last)) { /* While that wasn't END last time... */ @@ -3906,7 +3961,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP ) - Perl_sv_catpvf(aTHX_ sv, "%d", ARG(o)); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) {