X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=e7042eaa4e251fe61a057095b75c2a643ddb0525;hb=8fc173423e29547f0d1de6373cac1b08dfb0c024;hp=2d69750d8eb09af60873ff96f0eab40d003a2523;hpb=bca6c107bab5aabfb35be60303a5b31e9c9ec4a4;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 2d69750..e7042ea 100644 --- a/regcomp.c +++ b/regcomp.c @@ -69,7 +69,7 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-1999, Larry Wall + **** Copyright (c) 1991-2000, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -151,6 +151,7 @@ typedef struct scan_data_t { I32 offset_float_max; I32 flags; I32 whilem_c; + struct regnode_charclass_class *start_class; } scan_data_t; /* @@ -158,7 +159,7 @@ typedef struct scan_data_t { */ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0 }; + 0, 0, 0, 0, 0 }; #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) #define SF_BEFORE_SEOL 0x1 @@ -184,6 +185,9 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define SF_IN_PAR 0x100 #define SF_HAS_EVAL 0x200 #define SCF_DO_SUBSTR 0x400 +#define SCF_DO_STCLASS_AND 0x0800 +#define SCF_DO_STCLASS_OR 0x1000 +#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) #define RF_utf8 8 #define UTF (PL_reg_flags & RF_utf8) @@ -197,11 +201,194 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "HERE" /* marker as it appears in the description */ +#define MARKER2 " << HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/" + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define FAIL(msg) \ + STMT_START { \ + char *ellipses = ""; \ + unsigned len = strlen(PL_regprecomp); \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ + msg, (int)len, PL_regprecomp, ellipses); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * args. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define FAIL2(pat,msg) \ + STMT_START { \ + char *ellipses = ""; \ + unsigned len = strlen(PL_regprecomp); \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \ + msg, (int)len, PL_regprecomp, ellipses); \ + } STMT_END + + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + Simple_vFAIL(m); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + Simple_vFAIL2(m, a1); \ + } STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + Simple_vFAIL3(m, a1, a2); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts five arguments. + */ +#define Simple_vFAIL5(m, a1, a2, a3, a4) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + + +#define vWARN(loc,m) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ + m, (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END \ + + +#define vWARN2(loc, m, a1) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + a1, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +#define vWARN3(loc, m, a1, a2) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp) - (PL_regxend - (loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + a1, a2, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +#define vWARN4(loc, m, a1, a2, a3) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + a1, a2, a3, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + + + /* Allow for side effects in s */ #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END static void clear_re(pTHXo_ void *r); +/* Mark that we cannot extend a found fixed substring at this point. + Updata the longest found anchored substring and the longest found + floating substrings if needed. */ + STATIC void S_scan_commit(pTHX_ scan_data_t *data) { @@ -236,6 +423,131 @@ S_scan_commit(pTHX_ scan_data_t *data) data->flags &= ~SF_BEFORE_EOL; } +/* Can match anything (initialization) */ +STATIC void +S_cl_anything(pTHX_ struct regnode_charclass_class *cl) +{ + int value; + + ANYOF_CLASS_ZERO(cl); + for (value = 0; value < 256; ++value) + ANYOF_BITMAP_SET(cl, value); + cl->flags = ANYOF_EOS; + if (LOC) + cl->flags |= ANYOF_LOCALE; +} + +/* Can match anything (initialization) */ +STATIC int +S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) +{ + int value; + + for (value = 0; value <= ANYOF_MAX; value += 2) + if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) + return 1; + for (value = 0; value < 256; ++value) + if (!ANYOF_BITMAP_TEST(cl, value)) + return 0; + return 1; +} + +/* Can match anything (initialization) */ +STATIC void +S_cl_init(pTHX_ struct regnode_charclass_class *cl) +{ + Zero(cl, 1, struct regnode_charclass_class); + cl->type = ANYOF; + cl_anything(cl); +} + +STATIC void +S_cl_init_zero(pTHX_ struct regnode_charclass_class *cl) +{ + Zero(cl, 1, struct regnode_charclass_class); + cl->type = ANYOF; + cl_anything(cl); + if (LOC) + cl->flags |= ANYOF_LOCALE; +} + +/* 'And' a given class with another one. Can create false positives */ +/* We assume that cl is not inverted */ +STATIC void +S_cl_and(pTHX_ struct regnode_charclass_class *cl, + struct regnode_charclass_class *and_with) +{ + if (!(and_with->flags & ANYOF_CLASS) + && !(cl->flags & ANYOF_CLASS) + && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + && !(and_with->flags & ANYOF_FOLD) + && !(cl->flags & ANYOF_FOLD)) { + int i; + + if (and_with->flags & ANYOF_INVERT) + for (i = 0; i < ANYOF_BITMAP_SIZE; i++) + cl->bitmap[i] &= ~and_with->bitmap[i]; + else + for (i = 0; i < ANYOF_BITMAP_SIZE; i++) + cl->bitmap[i] &= and_with->bitmap[i]; + } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ + if (!(and_with->flags & ANYOF_EOS)) + cl->flags &= ~ANYOF_EOS; +} + +/* 'OR' a given class with another one. Can create false positives */ +/* We assume that cl is not inverted */ +STATIC void +S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with) +{ + if (or_with->flags & ANYOF_INVERT) { + /* We do not use + * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) + * <= (B1 | !B2) | (CL1 | !CL2) + * which is wasteful if CL2 is small, but we ignore CL2: + * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1 + * XXXX Can we handle case-fold? Unclear: + * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = + * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) + */ + if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + && !(or_with->flags & ANYOF_FOLD) + && !(cl->flags & ANYOF_FOLD) ) { + int i; + + for (i = 0; i < ANYOF_BITMAP_SIZE; i++) + cl->bitmap[i] |= ~or_with->bitmap[i]; + } /* XXXX: logic is complicated otherwise */ + else { + cl_anything(cl); + } + } else { + /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ + if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + && (!(or_with->flags & ANYOF_FOLD) + || (cl->flags & ANYOF_FOLD)) ) { + int i; + + /* OR char bitmap and class bitmap separately */ + for (i = 0; i < ANYOF_BITMAP_SIZE; i++) + cl->bitmap[i] |= or_with->bitmap[i]; + if (or_with->flags & ANYOF_CLASS) { + for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++) + cl->classflags[i] |= or_with->classflags[i]; + cl->flags |= ANYOF_CLASS; + } + } + else { /* XXXX: logic is complicated, leave it along for a moment. */ + cl_anything(cl); + } + } + if (or_with->flags & ANYOF_EOS) + cl->flags |= ANYOF_EOS; +} + +/* REx optimizer. Converts nodes into quickier variants "in place". + Finds fixed substrings. */ + /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set to the position after last scanned or to NULL. */ @@ -253,11 +565,13 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; + struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */ while (scan && OP(scan) != END && scan < last) { /* Peephole optimizer: */ if (PL_regkind[(U8)OP(scan)] == EXACT) { + /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; #ifdef DEBUGGING @@ -305,19 +619,16 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da /* Allow dumping */ n = scan + NODE_SZ_STR(scan); while (n <= stop) { - /* Purify reports a benign UMR here sometimes, because we - * don't initialize the OP() slot of a node when that node - * is occupied by just the trailing null of the string in - * an EXACT node */ if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) { OP(n) = OPTIMIZED; NEXT_OFF(n) = 0; } n++; } -#endif - +#endif } + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. */ if (OP(scan) != CURLYX) { int max = (reg_off_by_arg[OP(scan)] ? I32_MAX @@ -338,6 +649,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da else NEXT_OFF(scan) = off; } + /* The principal pseudo-switch. Cannot be a switch, since we + look into several different things. */ if (OP(scan) == BRANCH || OP(scan) == BRANCHJ || OP(scan) == IFTHEN || OP(scan) == SUSPEND) { next = regnext(scan); @@ -345,11 +658,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (OP(next) == code || code == IFTHEN || code == SUSPEND) { I32 max1 = 0, min1 = I32_MAX, num = 0; + struct regnode_charclass_class accum; - if (flags & SCF_DO_SUBSTR) - scan_commit(data); + if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ + scan_commit(data); /* Cannot merge strings after this. */ + if (flags & SCF_DO_STCLASS) + cl_init_zero(&accum); while (OP(scan) == code) { - I32 deltanext, minnext; + I32 deltanext, minnext, f = 0; + struct regnode_charclass_class this_class; num++; data_fake.flags = 0; @@ -359,9 +676,14 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da scan = NEXTOPER(scan); if (code != BRANCH) scan = NEXTOPER(scan); - /* We suppose the run is continuous, last=next...*/ + if (flags & SCF_DO_STCLASS) { + cl_init(&this_class); + data_fake.start_class = &this_class; + f = SCF_DO_STCLASS_AND; + } + /* we suppose the run is continuous, last=next...*/ minnext = study_chunk(&scan, &deltanext, next, - &data_fake, 0); + &data_fake, f); if (min1 > minnext) min1 = minnext; if (max1 < minnext + deltanext) @@ -375,6 +697,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->flags |= SF_HAS_EVAL; if (data) data->whilem_c = data_fake.whilem_c; + if (flags & SCF_DO_STCLASS) + cl_or(&accum, &this_class); if (code == SUSPEND) break; } @@ -388,6 +712,30 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } min += min1; delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + cl_or(data->start_class, &accum); + if (min1) { + cl_and(data->start_class, &and_with); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + cl_and(data->start_class, &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + StructCopy(data->start_class, &and_with, + struct regnode_charclass_class); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, + struct regnode_charclass_class); + flags |= SCF_DO_STCLASS_OR; + data->start_class->flags |= ANYOF_EOS; + } + } } else if (code == BRANCHJ) /* single branch is optimized. */ scan = NEXTOPER(NEXTOPER(scan)); @@ -421,9 +769,34 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->pos_min += l; /* As in the first entry. */ data->flags &= ~SF_BEFORE_EOL; } + if (flags & SCF_DO_STCLASS_AND) { + /* Check whether it is compatible with what we know already! */ + int compat = 1; + + if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) + && (!(data->start_class->flags & ANYOF_FOLD) + || !ANYOF_BITMAP_TEST(data->start_class, + PL_fold[*(U8*)STRING(scan)]))) + compat = 0; + ANYOF_CLASS_ZERO(data->start_class); + ANYOF_BITMAP_ZERO(data->start_class); + if (compat) + ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + data->start_class->flags &= ~ANYOF_EOS; + } + else if (flags & SCF_DO_STCLASS_OR) { + /* false positive possible if the class is case-folded */ + ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + data->start_class->flags &= ~ANYOF_EOS; + cl_and(data->start_class, &and_with); + } + flags &= ~SCF_DO_STCLASS; } - else if (PL_regkind[(U8)OP(scan)] == EXACT) { + else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */ I32 l = STR_LEN(scan); + + /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) scan_commit(data); if (UTF) { @@ -439,19 +812,51 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da min += l; if (data && (flags & SCF_DO_SUBSTR)) data->pos_min += l; + if (flags & SCF_DO_STCLASS_AND) { + /* Check whether it is compatible with what we know already! */ + int compat = 1; + + if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) + && !ANYOF_BITMAP_TEST(data->start_class, + PL_fold[*(U8*)STRING(scan)])) + compat = 0; + ANYOF_CLASS_ZERO(data->start_class); + ANYOF_BITMAP_ZERO(data->start_class); + if (compat) { + ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + data->start_class->flags &= ~ANYOF_EOS; + data->start_class->flags |= ANYOF_FOLD; + if (OP(scan) == EXACTFL) + data->start_class->flags |= ANYOF_LOCALE; + } + } + else if (flags & SCF_DO_STCLASS_OR) { + if (data->start_class->flags & ANYOF_FOLD) { + /* false positive possible if the class is case-folded. + Assume that the locale settings are the same... */ + ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + data->start_class->flags &= ~ANYOF_EOS; + } + cl_and(data->start_class, &and_with); + } + flags &= ~SCF_DO_STCLASS; } else if (strchr((char*)PL_varies,OP(scan))) { I32 mincount, maxcount, minnext, deltanext, pos_before, fl; + I32 f = flags; regnode *oscan = scan; - + struct regnode_charclass_class this_class; + struct regnode_charclass_class *oclass = NULL; + switch (PL_regkind[(U8)OP(scan)]) { - case WHILEM: + case WHILEM: /* End of (?:...)* . */ scan = NEXTOPER(scan); goto finish; case PLUS: - if (flags & SCF_DO_SUBSTR) { + if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { next = NEXTOPER(scan); - if (OP(next) == EXACT) { + if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { mincount = 1; maxcount = REG_INFTY; next = regnext(scan); @@ -464,10 +869,17 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da min++; /* Fall through. */ case STAR: + if (flags & SCF_DO_STCLASS) { + mincount = 0; + maxcount = REG_INFTY; + next = regnext(scan); + scan = NEXTOPER(scan); + goto do_curly; + } is_inf = is_inf_internal = 1; scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - scan_commit(data); + scan_commit(data); /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } goto optimize_curly_tail; @@ -478,7 +890,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) scan_commit(data); + if (mincount == 0) scan_commit(data); /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -487,21 +899,59 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (is_inf) data->flags |= SF_IS_INF; } + if (flags & SCF_DO_STCLASS) { + cl_init(&this_class); + oclass = data->start_class; + data->start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + f &= ~SCF_DO_STCLASS_OR; + } + /* This will finish on WHILEM, setting scan, or on NULL: */ minnext = study_chunk(&scan, &deltanext, last, data, mincount == 0 - ? (flags & ~SCF_DO_SUBSTR) : flags); + ? (f & ~SCF_DO_SUBSTR) : f); + + if (flags & SCF_DO_STCLASS) + data->start_class = oclass; + if (mincount == 0 || minnext == 0) { + if (flags & SCF_DO_STCLASS_OR) { + cl_or(data->start_class, &this_class); + } + else if (flags & SCF_DO_STCLASS_AND) { + /* Switch to OR mode: cache the old value of + * data->start_class */ + StructCopy(data->start_class, &and_with, + struct regnode_charclass_class); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&this_class, data->start_class, + struct regnode_charclass_class); + flags |= SCF_DO_STCLASS_OR; + data->start_class->flags |= ANYOF_EOS; + } + } else { /* Non-zero len */ + if (flags & SCF_DO_STCLASS_OR) { + cl_or(data->start_class, &this_class); + cl_and(data->start_class, &and_with); + } + else if (flags & SCF_DO_STCLASS_AND) + cl_and(data->start_class, &this_class); + flags &= ~SCF_DO_STCLASS; + } if (!scan) /* It was not CURLYX, but CURLY. */ scan = next; - if (ckWARN(WARN_UNSAFE) && (minnext + deltanext == 0) + if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0) && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ - Perl_warner(aTHX_ WARN_UNSAFE, - "Strange *+?{} on zero-length expression"); + { + vWARN(PL_regcomp_parse, + "Quantifier unexpected on zero-length expression"); + } + min += minnext * mincount; - is_inf_internal |= (maxcount == REG_INFTY - && (minnext + deltanext) > 0 - || deltanext == I32_MAX); + is_inf_internal |= ((maxcount == REG_INFTY + && (minnext + deltanext) > 0) + || deltanext == I32_MAX); is_inf |= is_inf_internal; delta += (minnext + deltanext) * maxcount - minnext * mincount; @@ -560,7 +1010,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ if (OP(nxt) != CLOSE) - FAIL("panic opt close"); + FAIL("Panic opt close"); oscan->flags = ARG(nxt); OP(nxt1) = OPTIMIZED; /* was OPEN. */ OP(nxt) = OPTIMIZED; /* was CLOSE. */ @@ -633,6 +1083,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da sv_catsv(data->last_found, last_str); data->last_end += l * (mincount - 1); } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max += is_inf ? 0 : (maxcount - 1) + * (minnext + data->pos_delta); } } /* It is counted once already... */ @@ -640,6 +1095,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->pos_delta += - counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount; if (mincount != maxcount) { + /* Cannot extend fixed substrings found inside + the group. */ scan_commit(data); if (mincount && last_str) { sv_setsv(data->last_found, last_str); @@ -664,45 +1121,264 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da NEXT_OFF(oscan) += NEXT_OFF(next); } continue; - default: /* REF only? */ + default: /* REF and CLUMP only? */ if (flags & SCF_DO_SUBSTR) { - scan_commit(data); + scan_commit(data); /* Cannot expect anything... */ data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) + cl_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; break; } } else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) { + int value; + if (flags & SCF_DO_SUBSTR) { scan_commit(data); data->pos_min++; } min++; + if (flags & SCF_DO_STCLASS) { + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + + /* Some of the logic below assumes that switching + locale on will only add false positives. */ + switch (PL_regkind[(U8)OP(scan)]) { + case ANYUTF8: + case SANY: + case SANYUTF8: + case ALNUMUTF8: + case ANYOFUTF8: + case ALNUMLUTF8: + case NALNUMUTF8: + case NALNUMLUTF8: + case SPACEUTF8: + case NSPACEUTF8: + case SPACELUTF8: + case NSPACELUTF8: + case DIGITUTF8: + case NDIGITUTF8: + default: + do_default: + /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + cl_anything(data->start_class); + break; + case REG_ANY: + if (OP(scan) == SANY) + goto do_default; + if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ + value = (ANYOF_BITMAP_TEST(data->start_class,'\n') + || (data->start_class->flags & ANYOF_CLASS)); + cl_anything(data->start_class); + } + if (flags & SCF_DO_STCLASS_AND || !value) + ANYOF_BITMAP_CLEAR(data->start_class,'\n'); + break; + case ANYOF: + if (flags & SCF_DO_STCLASS_AND) + cl_and(data->start_class, + (struct regnode_charclass_class*)scan); + else + cl_or(data->start_class, + (struct regnode_charclass_class*)scan); + break; + case ALNUM: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); + for (value = 0; value < 256; value++) + if (!isALNUM(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); + else { + for (value = 0; value < 256; value++) + if (isALNUM(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + } + break; + case ALNUML: + if (flags & SCF_DO_STCLASS_AND) { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); + } + else { + ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); + data->start_class->flags |= ANYOF_LOCALE; + } + break; + case NALNUM: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); + for (value = 0; value < 256; value++) + if (isALNUM(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); + else { + for (value = 0; value < 256; value++) + if (!isALNUM(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + } + break; + case NALNUML: + if (flags & SCF_DO_STCLASS_AND) { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); + } + else { + data->start_class->flags |= ANYOF_LOCALE; + ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); + } + break; + case SPACE: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); + for (value = 0; value < 256; value++) + if (!isSPACE(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); + else { + for (value = 0; value < 256; value++) + if (isSPACE(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + } + break; + case SPACEL: + if (flags & SCF_DO_STCLASS_AND) { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); + } + else { + data->start_class->flags |= ANYOF_LOCALE; + ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); + } + break; + case NSPACE: + if (flags & SCF_DO_STCLASS_AND) { + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); + for (value = 0; value < 256; value++) + if (isSPACE(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); + else { + for (value = 0; value < 256; value++) + if (!isSPACE(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + } + break; + case NSPACEL: + if (flags & SCF_DO_STCLASS_AND) { + if (data->start_class->flags & ANYOF_LOCALE) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); + for (value = 0; value < 256; value++) + if (!isSPACE(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + else { + data->start_class->flags |= ANYOF_LOCALE; + ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); + } + break; + case DIGIT: + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); + for (value = 0; value < 256; value++) + if (!isDIGIT(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); + else { + for (value = 0; value < 256; value++) + if (isDIGIT(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + } + break; + case NDIGIT: + if (flags & SCF_DO_STCLASS_AND) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); + for (value = 0; value < 256; value++) + if (isDIGIT(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + if (data->start_class->flags & ANYOF_LOCALE) + ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); + else { + for (value = 0; value < 256; value++) + if (!isDIGIT(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + } + break; + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, &and_with); + flags &= ~SCF_DO_STCLASS; + } } else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); } - else if (PL_regkind[(U8)OP(scan)] == BRANCHJ - && (scan->flags || data) + else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ + /* Lookbehind, or need to calculate parens/evals/stclass: */ + && (scan->flags || data || (flags & SCF_DO_STCLASS)) && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { + /* Lookahead/lookbehind */ I32 deltanext, minnext; regnode *nscan; + struct regnode_charclass_class intrnl; + int f = 0; data_fake.flags = 0; if (data) data_fake.whilem_c = data->whilem_c; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + cl_init(&intrnl); + data_fake.start_class = &intrnl; + f = SCF_DO_STCLASS_AND; + } next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - minnext = study_chunk(&nscan, &deltanext, last, &data_fake, 0); + minnext = study_chunk(&nscan, &deltanext, last, &data_fake, f); if (scan->flags) { if (deltanext) { - FAIL("variable length lookbehind not implemented"); + vFAIL("Variable length lookbehind not implemented"); } else if (minnext > U8_MAX) { - FAIL2("lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = minnext; } @@ -712,6 +1388,13 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->flags |= SF_HAS_EVAL; if (data) data->whilem_c = data_fake.whilem_c; + if (f) { + int was = (data->start_class->flags & ANYOF_EOS); + + cl_and(data->start_class, &intrnl); + if (was) + data->start_class->flags |= ANYOF_EOS; + } } else if (OP(scan) == OPEN) { pars++; @@ -726,12 +1409,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (data) data->flags |= SF_HAS_EVAL; } - else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded */ + else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ if (flags & SCF_DO_SUBSTR) { scan_commit(data); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + cl_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; } /* Else: zero-length, ignore. */ scan = regnext(scan); @@ -752,6 +1438,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->flags |= SF_HAS_PAR; data->flags &= ~SF_IN_PAR; } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, &and_with); return min; } @@ -801,6 +1489,7 @@ Perl_reginitcolors(pTHX) PL_colorset = 1; } + /* - pregcomp - compile a regular expression into internal code * @@ -822,9 +1511,6 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) dTHR; register regexp *r; regnode *scan; - SV **longest; - SV *longest_fixed; - SV *longest_float; regnode *first; I32 flags; I32 minlen = 0; @@ -835,8 +1521,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - if (PL_curcop == &PL_compiling ? (PL_hints & HINT_UTF8) : IN_UTF8) + if (pm->op_pmdynflags & PMdf_UTF8) { PL_reg_flags |= RF_utf8; + } else PL_reg_flags = 0; @@ -861,13 +1548,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PL_regsize = 0L; PL_regcode = &PL_regdummy; PL_reg_whilem_seen = 0; +#if 0 /* REGC() is (currently) a NOP at the first pass. + * Clever compilers notice this and complain. --jhi */ REGC((U8)REG_MAGIC, (char*)PL_regcode); +#endif if (reg(0, &flags) == NULL) { Safefree(PL_regprecomp); PL_regprecomp = Nullch; return(NULL); } - DEBUG_r(PerlIO_printf(Perl_debug_log, "size %d ", PL_regsize)); + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)PL_regsize)); /* Small enough for pointer-storage convention? If extralen==0, this means that we will not need long jumps. */ @@ -882,7 +1572,12 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), char, regexp); if (r == NULL) - FAIL("regexp out of space"); + FAIL("Regexp out of space"); + +#ifdef DEBUGGING + /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ + Zero(r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), char); +#endif r->refcnt = 1; r->prelen = xend - exp; r->precomp = PL_regprecomp; @@ -924,16 +1619,21 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) Newz(1004, r->substrs, 1, struct reg_substr_data); StructCopy(&zero_scan_data, &data, scan_data_t); + /* XXXX Should not we check for something else? Usually it is OPEN1... */ if (OP(scan) != BRANCH) { /* Only one top-level choice. */ I32 fake; STRLEN longest_float_length, longest_fixed_length; + struct regnode_charclass_class ch_class; + int stclass_flag; first = scan; /* Skip introductions and multiplicators >= 1. */ while ((OP(first) == OPEN && (sawopen = 1)) || + /* An OR of *one* alternative - should not happen now. */ (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || (OP(first) == PLUS) || (OP(first) == MINMOD) || + /* An {n,m} with n>0 */ (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) { if (OP(first) == PLUS) sawplus = 1; @@ -944,13 +1644,13 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* Starting-point info. */ again: - if (PL_regkind[(U8)OP(first) == EXACT]) { + if (PL_regkind[(U8)OP(first)] == EXACT) { if (OP(first) == EXACT); /* Empty, get anchored substr later. */ else if ((OP(first) == EXACTF || OP(first) == EXACTFL) && !UTF) r->regstclass = first; } - else if (strchr((char*)PL_simple+4,OP(first))) + else if (strchr((char*)PL_simple,OP(first))) r->regstclass = first; else if (PL_regkind[(U8)OP(first)] == BOUND || PL_regkind[(U8)OP(first)] == NBOUND) @@ -1011,9 +1711,15 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) data.last_found = newSVpvn("",0); data.longest = &(data.longest_fixed); first = scan; - + if (!r->regstclass) { + cl_init(&ch_class); + data.start_class = &ch_class; + stclass_flag = SCF_DO_STCLASS_AND; + } else /* XXXX Check for BOUND? */ + stclass_flag = 0; + minlen = study_chunk(&first, &fake, scan + PL_regsize, /* Up to end */ - &data, SCF_DO_SUBSTR); + &data, SCF_DO_SUBSTR | stclass_flag); if ( PL_regnpar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !PL_seen_zerolen @@ -1068,6 +1774,28 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } + if (r->regstclass + && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8 + || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY)) + r->regstclass = NULL; + if ((!r->anchored_substr || r->anchored_offset) && stclass_flag + && !(data.start_class->flags & ANYOF_EOS) + && !cl_is_anything(data.start_class)) { + SV *sv; + I32 n = add_data(1, "f"); + + New(1006, PL_regcomp_rx->data->data[n], 1, + struct regnode_charclass_class); + StructCopy(data.start_class, + (struct regnode_charclass_class*)PL_regcomp_rx->data->data[n], + struct regnode_charclass_class); + r->regstclass = (regnode*)PL_regcomp_rx->data->data[n]; + r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ + DEBUG_r((sv = sv_newmortal(), + regprop(sv, (regnode*)data.start_class), + PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", + SvPVX(sv)))); + } /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ if (longest_fixed_length > longest_float_length) { @@ -1092,11 +1820,31 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) else { /* Several toplevels. Best we can is to set minlen. */ I32 fake; + struct regnode_charclass_class ch_class; DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); scan = r->program + 1; - minlen = study_chunk(&scan, &fake, scan + PL_regsize, &data, 0); + cl_init(&ch_class); + data.start_class = &ch_class; + minlen = study_chunk(&scan, &fake, scan + PL_regsize, &data, SCF_DO_STCLASS_AND); r->check_substr = r->anchored_substr = r->float_substr = Nullsv; + if (!(data.start_class->flags & ANYOF_EOS) + && !cl_is_anything(data.start_class)) { + SV *sv; + I32 n = add_data(1, "f"); + + New(1006, PL_regcomp_rx->data->data[n], 1, + struct regnode_charclass_class); + StructCopy(data.start_class, + (struct regnode_charclass_class*)PL_regcomp_rx->data->data[n], + struct regnode_charclass_class); + r->regstclass = (regnode*)PL_regcomp_rx->data->data[n]; + r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ + DEBUG_r((sv = sv_newmortal(), + regprop(sv, (regnode*)data.start_class), + PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", + SvPVX(sv)))); + } } r->minlen = minlen; @@ -1132,6 +1880,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) register regnode *ender = 0; register I32 parno = 0; I32 flags, oregflags = PL_regflags, have_branch = 0, open = 0; + char *oregcomp_parse = PL_regcomp_parse; char c; *flagp = 0; /* Tentatively. */ @@ -1142,6 +1891,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) U16 posflags = 0, negflags = 0; U16 *flagsp = &posflags; int logical = 0; + char *seqstart = PL_regcomp_parse; PL_regcomp_parse++; paren = *PL_regcomp_parse++; @@ -1162,7 +1912,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) break; case '$': case '@': - FAIL2("Sequence (?%c...) not implemented", (int)paren); + vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; case '#': while (*PL_regcomp_parse && *PL_regcomp_parse != ')') @@ -1173,6 +1923,10 @@ S_reg(pTHX_ I32 paren, I32 *flagp) *flagp = TRYAGAIN; return NULL; case 'p': + if (SIZE_ONLY) + vWARN(PL_regcomp_parse, "(?p{}) is deprecated - use (??{})"); + /* FALL THROUGH*/ + case '?': logical = 1; paren = *PL_regcomp_parse++; /* FALL THROUGH */ @@ -1197,7 +1951,10 @@ S_reg(pTHX_ I32 paren, I32 *flagp) PL_regcomp_parse++; } if (*PL_regcomp_parse != ')') - FAIL("Sequence (?{...}) not terminated or not {}-balanced"); + { + PL_regcomp_parse = s; + vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); + } if (!SIZE_ONLY) { AV *av; @@ -1206,7 +1963,10 @@ S_reg(pTHX_ I32 paren, I32 *flagp) else sv = newSVpvn("", 0); + ENTER; + Perl_save_re_context(aTHX); rop = sv_compile_2op(sv, &sop, "re", &av); + LEAVE; n = add_data(3, "nop"); PL_regcomp_rx->data->data[n] = (void*)rop; @@ -1256,7 +2016,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) PL_regcomp_parse++; ret = reganode(GROUPP, parno); if ((c = *nextchar()) != ')') - FAIL2("Switch (?(number%c not recognized", c); + vFAIL("Switch condition not recognized"); insert_if: regtail(ret, reganode(IFTHEN, 0)); br = regbranch(&flags, 1); @@ -1278,7 +2038,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) else lastbr = NULL; if (c != ')') - FAIL("Switch (?(condition)... contains too many branches"); + vFAIL("Switch (?(condition)... contains too many branches"); ender = reg_node(TAIL); regtail(br, ender); if (lastbr) { @@ -1290,11 +2050,12 @@ S_reg(pTHX_ I32 paren, I32 *flagp) return ret; } else { - FAIL2("Unknown condition for (?(%.2s", PL_regcomp_parse); + vFAIL2("Unknown switch condition (?(%.2s", PL_regcomp_parse); } } case 0: - FAIL("Sequence (? incomplete"); + PL_regcomp_parse--; /* for vFAIL to print correctly */ + vFAIL("Sequence (? incomplete"); break; default: --PL_regcomp_parse; @@ -1317,8 +2078,10 @@ S_reg(pTHX_ I32 paren, I32 *flagp) break; } unknown: - if (*PL_regcomp_parse != ')') - FAIL2("Sequence (?%c...) not recognized", *PL_regcomp_parse); + if (*PL_regcomp_parse != ')') { + PL_regcomp_parse++; + vFAIL3("Sequence (%.*s...) not recognized", PL_regcomp_parse-seqstart, seqstart); + } nextchar(); *flagp = TRYAGAIN; return NULL; @@ -1430,15 +2193,17 @@ S_reg(pTHX_ I32 paren, I32 *flagp) if (paren) { PL_regflags = oregflags; if (PL_regcomp_parse >= PL_regxend || *nextchar() != ')') { - FAIL("unmatched () in regexp"); + PL_regcomp_parse = oregcomp_parse; + vFAIL("Unmatched ("); } } else if (!paren && PL_regcomp_parse < PL_regxend) { if (*PL_regcomp_parse == ')') { - FAIL("unmatched () in regexp"); + PL_regcomp_parse++; + vFAIL("Unmatched )"); } else - FAIL("junk on end of regexp"); /* "Can't happen". */ + FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } @@ -1563,7 +2328,7 @@ S_regpiece(pTHX_ I32 *flagp) if (!max && *maxpos != '0') max = REG_INFTY; /* meaning "infinity" */ else if (max >= REG_INFTY) - FAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); PL_regcomp_parse = next; nextchar(); @@ -1597,7 +2362,7 @@ S_regpiece(pTHX_ I32 *flagp) if (max > 0) *flagp |= HASWIDTH; if (max && max < min) - FAIL("Can't do {n,m} with n > m"); + vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, min); ARG2_SET(ret, max); @@ -1613,8 +2378,19 @@ S_regpiece(pTHX_ I32 *flagp) } #if 0 /* Now runtime fix should be reliable. */ + + /* if this is reinstated, don't forget to put this back into perldiag: + + =item Regexp *+ operand could be empty at {#} in regex m/%s/ + + (F) The part of the regexp subject to either the * or + quantifier + could match an empty string. The {#} shows in the regular + expression about where the problem was discovered. + + */ + if (!(flags&HASWIDTH) && op != '?') - FAIL("regexp *+ operand could be empty"); + vFAIL("Regexp *+ operand could be empty"); #endif nextchar(); @@ -1644,9 +2420,11 @@ S_regpiece(pTHX_ I32 *flagp) goto do_curly; } nest_check: - if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { - Perl_warner(aTHX_ WARN_UNSAFE, "%.*s matches null string many times", - PL_regcomp_parse - origparse, origparse); + if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { + vWARN3(PL_regcomp_parse, + "%.*s matches null string many times", + PL_regcomp_parse - origparse, + origparse); } if (*PL_regcomp_parse == '?') { @@ -1654,8 +2432,10 @@ S_regpiece(pTHX_ I32 *flagp) reginsert(MINMOD, ret); regtail(ret, ret + NODE_STEP_REGNODE); } - if (ISMULT2(PL_regcomp_parse)) - FAIL("nested *?+ in regexp"); + if (ISMULT2(PL_regcomp_parse)) { + PL_regcomp_parse++; + vFAIL("Nested quantifiers"); + } return(ret); } @@ -1668,8 +2448,7 @@ S_regpiece(pTHX_ I32 *flagp) * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. * - * [Yes, it is worth fixing, some scripts can run twice the speed.] - */ + * [Yes, it is worth fixing, some scripts can run twice the speed.] */ STATIC regnode * S_regatom(pTHX_ I32 *flagp) { @@ -1692,9 +2471,9 @@ tryagain: ret = reg_node(BOL); break; case '$': - if (PL_regcomp_parse[1]) - PL_seen_zerolen++; nextchar(); + if (*PL_regcomp_parse) + PL_seen_zerolen++; if (PL_regflags & PMf_MULTILINE) ret = reg_node(MEOL); else if (PL_regflags & PMf_SINGLELINE) @@ -1721,19 +2500,29 @@ tryagain: PL_regnaughty++; break; case '[': - PL_regcomp_parse++; + { + char *oregcomp_parse = ++PL_regcomp_parse; ret = (UTF ? regclassutf8() : regclass()); - if (*PL_regcomp_parse != ']') - FAIL("unmatched [] in regexp"); + if (*PL_regcomp_parse != ']') { + PL_regcomp_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } nextchar(); *flagp |= HASWIDTH|SIMPLE; break; + } case '(': nextchar(); ret = reg(1, &flags); if (ret == NULL) { - if (flags & TRYAGAIN) + if (flags & TRYAGAIN) { + if (PL_regcomp_parse == PL_regxend) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } goto tryagain; + } return(NULL); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); @@ -1744,7 +2533,7 @@ tryagain: *flagp |= TRYAGAIN; return NULL; } - FAIL2("internal urp in regexp at /%s/", PL_regcomp_parse); + vFAIL("Internal urp"); /* Supposed to be caught earlier. */ break; case '{': @@ -1756,7 +2545,8 @@ tryagain: case '?': case '+': case '*': - FAIL("?+*{} follows nothing in regexp"); + PL_regcomp_parse++; + vFAIL("Quantifier follows nothing"); break; case '\\': switch (*++PL_regcomp_parse) { @@ -1880,8 +2670,11 @@ tryagain: if (PL_regcomp_parse[1] == '{') { PL_regxend = strchr(PL_regcomp_parse, '}'); - if (!PL_regxend) - FAIL("Missing right brace on \\p{}"); + if (!PL_regxend) { + PL_regcomp_parse += 2; + PL_regxend = oldregxend; + vFAIL("Missing right brace on \\p{}"); + } PL_regxend++; } else @@ -1914,15 +2707,16 @@ tryagain: if (num > 9 && num >= PL_regnpar) goto defchar; else { + while (isDIGIT(*PL_regcomp_parse)) + PL_regcomp_parse++; + if (!SIZE_ONLY && num > PL_regcomp_rx->nparens) - FAIL("reference to nonexistent group"); + vFAIL("Reference to nonexistent group"); PL_regsawback = 1; ret = reganode(FOLD ? (LOC ? REFFL : REFF) : REF, num); *flagp |= HASWIDTH; - while (isDIGIT(*PL_regcomp_parse)) - PL_regcomp_parse++; PL_regcomp_parse--; nextchar(); } @@ -1930,7 +2724,7 @@ tryagain: break; case '\0': if (PL_regcomp_parse >= PL_regxend) - FAIL("trailing \\ in regexp"); + FAIL("Trailing \\"); /* FALL THROUGH */ default: /* Do not generate `unrecognized' warnings here, we fall @@ -2013,31 +2807,48 @@ tryagain: p++; break; case 'e': - ender = '\033'; +#ifdef ASCIIish + ender = '\033'; +#else + ender = '\047'; +#endif p++; break; case 'a': - ender = '\007'; +#ifdef ASCIIish + ender = '\007'; +#else + ender = '\057'; +#endif p++; break; case 'x': if (*++p == '{') { char* e = strchr(p, '}'); - if (!e) - FAIL("Missing right brace on \\x{}"); + if (!e) { + PL_regcomp_parse = p + 1; + vFAIL("Missing right brace on \\x{}"); + } else if (UTF) { - ender = (UV)scan_hex(p + 1, e - p, &numlen); - if (numlen + len >= 127) { /* numlen is generous */ + numlen = 1; /* allow underscores */ + ender = (UV)scan_hex(p + 1, e - p - 1, &numlen); + /* numlen is generous */ + if (numlen + len >= 127) { p--; goto loopdone; } p = e + 1; } else - FAIL("Can't use \\x{} without 'use utf8' declaration"); + { + PL_regcomp_parse = e + 1; + vFAIL("Can't use \\x{} without 'use utf8' declaration"); + } + } else { + numlen = 0; /* disallow underscores */ ender = (UV)scan_hex(p, 2, &numlen); p += numlen; } @@ -2051,6 +2862,7 @@ tryagain: case '5': case '6': case '7': case '8':case '9': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) { + numlen = 0; /* disallow underscores */ ender = (UV)scan_oct(p, 3, &numlen); p += numlen; } @@ -2061,21 +2873,18 @@ tryagain: break; case '\0': if (p >= PL_regxend) - FAIL("trailing \\ in regexp"); + FAIL("Trailing \\"); /* FALL THROUGH */ default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(*p)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: Unrecognized escape \\%c passed through", - PL_regprecomp, - *p); + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) + vWARN2(p +1, "Unrecognized escape \\%c passed through", *p); goto normal_default; } break; default: normal_default: if ((*p & 0xc0) == 0xc0 && UTF) { - ender = utf8_to_uv((U8*)p, &numlen); + ender = utf8_to_uv_chk((U8*)p, &numlen, 0); p += numlen; } else @@ -2116,7 +2925,7 @@ tryagain: PL_regcomp_parse = p - 1; nextchar(); if (len < 0) - FAIL("internal disaster in regexp"); + vFAIL("Internal disaster"); if (len > 0) *flagp |= HASWIDTH; if (len == 1) @@ -2198,6 +3007,11 @@ S_regpposixcc(pTHX_ I32 value) namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; break; + case 'b': + if (strnEQ(posixcc, "blank", 5)) + namedclass = + complement ? ANYOF_NBLANK : ANYOF_BLANK; + break; case 'c': if (strnEQ(posixcc, "cntrl", 5)) namedclass = @@ -2229,7 +3043,8 @@ S_regpposixcc(pTHX_ I32 value) case 's': if (strnEQ(posixcc, "space", 5)) namedclass = - complement ? ANYOF_NSPACE : ANYOF_SPACE; + complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; + break; case 'u': if (strnEQ(posixcc, "upper", 5)) namedclass = @@ -2250,16 +3065,22 @@ S_regpposixcc(pTHX_ I32 value) } break; } - if ((namedclass == OOB_NAMEDCLASS || - !(posixcc + skip + 2 < PL_regxend && - (posixcc[skip] == ':' && - posixcc[skip + 1] == ']')))) - Perl_croak(aTHX_ "Character class [:%.*s:] unknown", - t - s - 1, s + 1); - } else if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY) + if (namedclass == OOB_NAMEDCLASS || + posixcc[skip] != ':' || + posixcc[skip+1] != ']') + { + Simple_vFAIL3("POSIX class [:%.*s:] unknown", + t - s - 1, s + 1); + } + } else if (!SIZE_ONLY) { /* [[=foo=]] and [[.foo.]] are still future. */ - Perl_warner(aTHX_ WARN_UNSAFE, - "Character class syntax [%c %c] is reserved for future extensions", c, c); + + /* adjust PL_regcomp_parse so the warning shows after + the class closes */ + while (*PL_regcomp_parse && *PL_regcomp_parse != ']') + PL_regcomp_parse++; + Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } } else { /* Maternal grandfather: * "[:" ending in ":" but not in ":]" */ @@ -2274,7 +3095,7 @@ S_regpposixcc(pTHX_ I32 value) STATIC void S_checkposixcc(pTHX) { - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && (*PL_regcomp_parse == ':' || *PL_regcomp_parse == '=' || *PL_regcomp_parse == '.')) { @@ -2284,11 +3105,17 @@ S_checkposixcc(pTHX) while(*s && isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - Perl_warner(aTHX_ WARN_UNSAFE, - "Character class syntax [%c %c] belongs inside character classes", c, c); + vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c); + + /* [[=foo=]] and [[.foo.]] are still future. */ if (c == '=' || c == '.') - Perl_warner(aTHX_ WARN_UNSAFE, - "Character class syntax [%c %c] is reserved for future extensions", c, c); + { + /* adjust PL_regcomp_parse so the error shows after + the class closes */ + while (*PL_regcomp_parse && *PL_regcomp_parse++ != ']') + ; + Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } } } } @@ -2297,11 +3124,10 @@ STATIC regnode * S_regclass(pTHX) { dTHR; - register UV value; + register U32 value; register I32 lastvalue = OOB_CHAR8; register I32 range = 0; register regnode *ret; - register I32 def; I32 numlen; I32 namedclass; char *rangebegin; @@ -2326,7 +3152,7 @@ S_regclass(pTHX) ANYOF_FLAGS(ret) |= ANYOF_INVERT; } - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE)) + if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) checkposixcc(); if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') @@ -2341,6 +3167,8 @@ S_regclass(pTHX) namedclass = regpposixcc(value); else if (value == '\\') { value = UCHARAT(PL_regcomp_parse++); + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore the 'value' cannot be an UV. --jhi */ switch (value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; @@ -2353,9 +3181,15 @@ S_regclass(pTHX) case 't': value = '\t'; break; case 'f': value = '\f'; break; case 'b': value = '\b'; break; +#ifdef ASCIIish case 'e': value = '\033'; break; case 'a': value = '\007'; break; +#else + case 'e': value = '\047'; break; + case 'a': value = '\057'; break; +#endif case 'x': + numlen = 0; /* disallow underscores */ value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); PL_regcomp_parse += numlen; break; @@ -2365,15 +3199,14 @@ S_regclass(pTHX) break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + numlen = 0; /* disallow underscores */ value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); PL_regcomp_parse += numlen; break; default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(value)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: Unrecognized escape \\%c in character class passed through", - PL_regprecomp, - value); + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) + + vWARN2(PL_regcomp_parse, "Unrecognized escape \\%c in character class passed through", (int)value); break; } } @@ -2383,13 +3216,12 @@ S_regclass(pTHX) need_class = 1; if (range) { /* a-\d, a-[:digit:] */ if (!SIZE_ONLY) { - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + if (ckWARN(WARN_REGEXP)) + vWARN4(PL_regcomp_parse, + "False [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); ANYOF_BITMAP_SET(ret, lastvalue); ANYOF_BITMAP_SET(ret, '-'); } @@ -2491,16 +3323,46 @@ S_regclass(pTHX) if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ASCII); else { +#ifdef ASCIIish for (value = 0; value < 128; value++) ANYOF_BITMAP_SET(ret, value); +#else /* EBCDIC */ + for (value = 0; value < 256; value++) + if (isASCII(value)) + ANYOF_BITMAP_SET(ret, value); +#endif /* EBCDIC */ } break; case ANYOF_NASCII: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_NASCII); else { +#ifdef ASCIIish for (value = 128; value < 256; value++) ANYOF_BITMAP_SET(ret, value); +#else /* EBCDIC */ + for (value = 0; value < 256; value++) + if (!isASCII(value)) + ANYOF_BITMAP_SET(ret, value); +#endif /* EBCDIC */ + } + break; + case ANYOF_BLANK: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_BLANK); + else { + for (value = 0; value < 256; value++) + if (isBLANK(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; + case ANYOF_NBLANK: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NBLANK); + else { + for (value = 0; value < 256; value++) + if (!isBLANK(value)) + ANYOF_BITMAP_SET(ret, value); } break; case ANYOF_CNTRL: @@ -2576,6 +3438,24 @@ S_regclass(pTHX) ANYOF_BITMAP_SET(ret, value); } break; + case ANYOF_PSXSPC: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_PSXSPC); + else { + for (value = 0; value < 256; value++) + if (isPSXSPC(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; + case ANYOF_NPSXSPC: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC); + else { + for (value = 0; value < 256; value++) + if (!isPSXSPC(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; case ANYOF_PUNCT: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_PUNCT); @@ -2631,7 +3511,7 @@ S_regclass(pTHX) } break; default: - FAIL("invalid [::] class in regexp"); + vFAIL("Invalid [::] class"); break; } if (LOC) @@ -2641,12 +3521,10 @@ S_regclass(pTHX) } if (range) { if (lastvalue > value) /* b-a */ { - Perl_croak(aTHX_ - "/%.127s/: invalid [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + Simple_vFAIL4("Invalid [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); } range = 0; } @@ -2656,13 +3534,12 @@ S_regclass(pTHX) PL_regcomp_parse[1] != ']') { PL_regcomp_parse++; if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + if (ckWARN(WARN_REGEXP)) + vWARN4(PL_regcomp_parse, + "False [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); if (!SIZE_ONLY) ANYOF_BITMAP_SET(ret, '-'); } else @@ -2725,7 +3602,7 @@ S_regclassutf8(pTHX) { dTHR; register char *e; - register UV value; + register U32 value; register U32 lastvalue = OOB_UTF8; register I32 range = 0; register regnode *ret; @@ -2750,7 +3627,7 @@ S_regclassutf8(pTHX) listsv = newSVpvn("# comment\n",10); } - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE)) + if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) checkposixcc(); if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') @@ -2761,13 +3638,16 @@ S_regclassutf8(pTHX) namedclass = OOB_NAMEDCLASS; if (!range) rangebegin = PL_regcomp_parse; - value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); + value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0); PL_regcomp_parse += numlen; if (value == '[') namedclass = regpposixcc(value); else if (value == '\\') { - value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); + value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0); PL_regcomp_parse += numlen; + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. --jhi */ switch (value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; @@ -2780,7 +3660,7 @@ S_regclassutf8(pTHX) if (*PL_regcomp_parse == '{') { e = strchr(PL_regcomp_parse++, '}'); if (!e) - FAIL("Missing right brace on \\p{}"); + vFAIL("Missing right brace on \\p{}"); n = e - PL_regcomp_parse; } else { @@ -2790,10 +3670,10 @@ S_regclassutf8(pTHX) if (!SIZE_ONLY) { if (value == 'p') Perl_sv_catpvf(aTHX_ listsv, - "+utf8::%.*s\n", n, PL_regcomp_parse); + "+utf8::%.*s\n", (int)n, PL_regcomp_parse); else Perl_sv_catpvf(aTHX_ listsv, - "!utf8::%.*s\n", n, PL_regcomp_parse); + "!utf8::%.*s\n", (int)n, PL_regcomp_parse); } PL_regcomp_parse = e + 1; lastvalue = OOB_UTF8; @@ -2803,19 +3683,26 @@ S_regclassutf8(pTHX) case 't': value = '\t'; break; case 'f': value = '\f'; break; case 'b': value = '\b'; break; +#ifdef ASCIIish case 'e': value = '\033'; break; case 'a': value = '\007'; break; +#else + case 'e': value = '\047'; break; + case 'a': value = '\057'; break; +#endif case 'x': if (*PL_regcomp_parse == '{') { e = strchr(PL_regcomp_parse++, '}'); - if (!e) - FAIL("Missing right brace on \\x{}"); + if (!e) + vFAIL("Missing right brace on \\x{}"); + numlen = 1; /* allow underscores */ value = (UV)scan_hex(PL_regcomp_parse, e - PL_regcomp_parse, &numlen); PL_regcomp_parse = e + 1; } else { + numlen = 0; /* disallow underscores */ value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); PL_regcomp_parse += numlen; } @@ -2826,28 +3713,27 @@ S_regclassutf8(pTHX) break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + numlen = 0; /* disallow underscores */ value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); PL_regcomp_parse += numlen; break; default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(value)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: Unrecognized escape \\%c in character class passed through", - PL_regprecomp, - value); + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) + vWARN2(PL_regcomp_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } } if (namedclass > OOB_NAMEDCLASS) { if (range) { /* a-\d, a-[:digit:] */ if (!SIZE_ONLY) { - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + if (ckWARN(WARN_REGEXP)) + vWARN4(PL_regcomp_parse, + "False [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); Perl_sv_catpvf(aTHX_ listsv, /* 0x002D is Unicode for '-' */ "%04"UVxf"\n002D\n", (UV)lastvalue); @@ -2897,8 +3783,12 @@ S_regclassutf8(pTHX) case ANYOF_NPUNCT: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; case ANYOF_SPACE: + case ANYOF_PSXSPC: + case ANYOF_BLANK: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; case ANYOF_NSPACE: + case ANYOF_NPSXSPC: + case ANYOF_NBLANK: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_UPPER: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; @@ -2914,12 +3804,10 @@ S_regclassutf8(pTHX) } if (range) { if (lastvalue > value) { /* b-a */ - Perl_croak(aTHX_ - "/%.127s/: invalid [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + Simple_vFAIL4("invalid [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); } range = 0; } @@ -2929,13 +3817,12 @@ S_regclassutf8(pTHX) PL_regcomp_parse[1] != ']') { PL_regcomp_parse++; if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + if (ckWARN(WARN_REGEXP)) + vWARN4(PL_regcomp_parse, + "False [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); if (!SIZE_ONLY) Perl_sv_catpvf(aTHX_ listsv, /* 0x002D is Unicode for '-' */ @@ -3054,7 +3941,7 @@ S_reguni(pTHX_ UV uv, char* s, I32* lenp) { dTHR; if (SIZE_ONLY) { - U8 tmpbuf[10]; + U8 tmpbuf[UTF8_MAXLEN]; *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf; } else @@ -3104,7 +3991,6 @@ S_regtail(pTHX_ regnode *p, regnode *val) dTHR; register regnode *scan; register regnode *temp; - register I32 offset; if (SIZE_ONLY) return; @@ -3173,7 +4059,7 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) { #ifdef DEBUGGING register U8 op = EXACT; /* Arbitrary non-END op. */ - register regnode *next, *onode; + register regnode *next; while (op != END && (!last || node < last)) { /* While that wasn't END last time... */ @@ -3188,7 +4074,7 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) goto after_print; regprop(sv, node); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), - 2*l + 1, "", SvPVX(sv)); + (int)(2*l + 1), "", SvPVX(sv)); if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, "(0)"); else @@ -3310,6 +4196,17 @@ Perl_regdump(pTHX_ regexp *r) #endif /* DEBUGGING */ } +STATIC void +S_put_byte(pTHX_ SV *sv, int c) +{ + if (c <= ' ' || c == 127 || c == 255) + Perl_sv_catpvf(aTHX_ sv, "\\%o", c); + else if (c == '-' || c == ']' || c == '\\' || c == '^') + Perl_sv_catpvf(aTHX_ sv, "\\%c", c); + else + Perl_sv_catpvf(aTHX_ sv, "%c", c); +} + /* - regprop - printable representation of opcode */ @@ -3322,7 +4219,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) sv_setpvn(sv, "", 0); if (OP(o) >= reg_num) /* regnode.type is unsigned */ - FAIL("corrupted regexp opcode"); + FAIL("Corrupted regexp opcode"); sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[(U8)OP(o)]; @@ -3338,9 +4235,74 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP ) - Perl_sv_catpvf(aTHX_ sv, "%d", ARG(o)); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + else if (k == ANYOF) { + int i, rangestart = -1; + const char * const out[] = { /* Should be syncronized with + ANYOF_ #xdefines in regcomp.h */ + "\\w", + "\\W", + "\\s", + "\\S", + "\\d", + "\\D", + "[:alnum:]", + "[:^alnum:]", + "[:alpha:]", + "[:^alpha:]", + "[:ascii:]", + "[:^ascii:]", + "[:ctrl:]", + "[:^ctrl:]", + "[:graph:]", + "[:^graph:]", + "[:lower:]", + "[:^lower:]", + "[:print:]", + "[:^print:]", + "[:punct:]", + "[:^punct:]", + "[:upper:]", + "[:^upper:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" + }; + + if (o->flags & ANYOF_LOCALE) + sv_catpv(sv, "{loc}"); + if (o->flags & ANYOF_FOLD) + sv_catpv(sv, "{i}"); + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + if (o->flags & ANYOF_INVERT) + sv_catpv(sv, "^"); + for (i = 0; i <= 256; i++) { + if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) + put_byte(sv, rangestart); + else { + put_byte(sv, rangestart); + sv_catpv(sv, "-"); + put_byte(sv, i - 1); + } + rangestart = -1; + } + } + if (o->flags & ANYOF_CLASS) + for (i = 0; i < sizeof(out)/sizeof(char*); i++) + if (ANYOF_CLASS_TEST(o,i)) + sv_catpv(sv, out[i]); + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags); #endif /* DEBUGGING */ @@ -3402,6 +4364,9 @@ Perl_pregfree(pTHX_ struct regexp *r) case 's': SvREFCNT_dec((SV*)r->data->data[n]); break; + case 'f': + Safefree(r->data->data[n]); + break; case 'p': new_comppad = (AV*)r->data->data[n]; break; @@ -3410,8 +4375,13 @@ Perl_pregfree(pTHX_ struct regexp *r) Perl_croak(aTHX_ "panic: pregfree comppad"); old_comppad = PL_comppad; old_curpad = PL_curpad; - PL_comppad = new_comppad; - PL_curpad = AvARRAY(new_comppad); + /* Watch out for global destruction's random ordering. */ + if (SvTYPE(new_comppad) == SVt_PVAV) { + PL_comppad = new_comppad; + PL_curpad = AvARRAY(new_comppad); + } + else + PL_curpad = NULL; op_free((OP_4tree*)r->data->data[n]); PL_comppad = old_comppad; PL_curpad = old_curpad; @@ -3502,46 +4472,45 @@ Perl_save_re_context(pTHX) SAVEPPTR(PL_reginput); /* String-input pointer. */ SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */ SAVEPPTR(PL_regeol); /* End of input, for $ check. */ - SAVESPTR(PL_regstartp); /* Pointer to startp array. */ - SAVESPTR(PL_regendp); /* Ditto for endp. */ - SAVESPTR(PL_reglastparen); /* Similarly for lastparen. */ + SAVEVPTR(PL_regstartp); /* Pointer to startp array. */ + SAVEVPTR(PL_regendp); /* Ditto for endp. */ + SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */ SAVEPPTR(PL_regtill); /* How far we are required to go. */ - SAVEI32(PL_regprev); /* char before regbol, \n if none */ - SAVESPTR(PL_reg_start_tmp); /* from regexec.c */ + SAVEI8(PL_regprev); /* char before regbol, \n if none */ + SAVEVPTR(PL_reg_start_tmp); /* from regexec.c */ PL_reg_start_tmp = 0; SAVEFREEPV(PL_reg_start_tmp); SAVEI32(PL_reg_start_tmpl); /* from regexec.c */ PL_reg_start_tmpl = 0; - SAVESPTR(PL_regdata); + SAVEVPTR(PL_regdata); SAVEI32(PL_reg_flags); /* from regexec.c */ SAVEI32(PL_reg_eval_set); /* from regexec.c */ SAVEI32(PL_regnarrate); /* from regexec.c */ - SAVESPTR(PL_regprogram); /* from regexec.c */ + SAVEVPTR(PL_regprogram); /* from regexec.c */ SAVEINT(PL_regindent); /* from regexec.c */ - SAVESPTR(PL_regcc); /* from regexec.c */ - SAVESPTR(PL_curcop); - SAVESPTR(PL_regcomp_rx); /* from regcomp.c */ + SAVEVPTR(PL_regcc); /* from regexec.c */ + SAVEVPTR(PL_curcop); + SAVEVPTR(PL_regcomp_rx); /* from regcomp.c */ SAVEI32(PL_regseen); /* from regcomp.c */ SAVEI32(PL_regsawback); /* Did we see \1, ...? */ SAVEI32(PL_regnaughty); /* How bad is this pattern? */ - SAVESPTR(PL_regcode); /* Code-emit pointer; ®dummy = don't */ + SAVEVPTR(PL_regcode); /* Code-emit pointer; ®dummy = don't */ SAVEPPTR(PL_regxend); /* End of input for compile */ SAVEPPTR(PL_regcomp_parse); /* Input-scan pointer. */ - SAVESPTR(PL_reg_call_cc); /* from regexec.c */ - SAVESPTR(PL_reg_re); /* from regexec.c */ + SAVEVPTR(PL_reg_call_cc); /* from regexec.c */ + SAVEVPTR(PL_reg_re); /* from regexec.c */ SAVEPPTR(PL_reg_ganch); /* from regexec.c */ SAVESPTR(PL_reg_sv); /* from regexec.c */ - SAVESPTR(PL_reg_magic); /* from regexec.c */ + SAVEVPTR(PL_reg_magic); /* from regexec.c */ SAVEI32(PL_reg_oldpos); /* from regexec.c */ - SAVESPTR(PL_reg_oldcurpm); /* from regexec.c */ - SAVESPTR(PL_reg_curpm); /* from regexec.c */ + SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ + SAVEVPTR(PL_reg_curpm); /* from regexec.c */ #ifdef DEBUGGING SAVEPPTR(PL_reg_starttry); /* from regexec.c */ #endif } #ifdef PERL_OBJECT -#define NO_XSLOCKS #include "XSUB.h" #undef this #define this pPerl