X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=1fdc44aee59978d7d7025646e700e582dbf713ee;hb=36bad43a54a7c90a90f0ef3c464b4224dbc21068;hp=59b2712e48755421f66adc34c82d1c3aa9be1bf3;hpb=83c2f623ab800d4b316b958f530d3ecd95448b48;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 59b2712..1fdc44a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -250,10 +250,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, * of t/op/regmesg.t, the tests in t/op/re_tests, and those in * op/pragma/warn/regcomp. */ -#define MARKER1 "HERE" /* marker as it appears in the description */ -#define MARKER2 " << HERE " /* marker as it appears within the regex */ +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ -#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/" +#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -1715,9 +1715,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->offsets[0] = RExC_size; } DEBUG_r(PerlIO_printf(Perl_debug_log, - "%s %u bytes for offset annotations.\n", + "%s %"UVuf" bytes for offset annotations.\n", r->offsets ? "Got" : "Couldn't get", - (2*RExC_size+1) * sizeof(U32))); + (UV)((2*RExC_size+1) * sizeof(U32)))); RExC_rx = r; @@ -2117,7 +2117,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* No compiled RE interpolated, has runtime components ===> unsafe. */ FAIL("Eval-group not allowed at runtime, use re 'eval'"); - if (PL_tainted) + if (PL_tainting && PL_tainted) FAIL("Eval-group in insecure regular expression"); } @@ -2799,11 +2799,12 @@ tryagain: break; case 'p': case 'P': - { /* a lovely hack--pretend we saw [\pX] instead */ + { char* oldregxend = RExC_end; char* parse_start = RExC_parse; if (RExC_parse[1] == '{') { + /* a lovely hack--pretend we saw [\pX] instead */ RExC_end = strchr(RExC_parse, '}'); if (!RExC_end) { RExC_parse += 2; @@ -3259,7 +3260,7 @@ STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state) { register UV value; - register IV lastvalue = OOB_UNICODE; + register IV prevvalue = OOB_UNICODE; register IV range = 0; register regnode *ret; STRLEN numlen; @@ -3270,7 +3271,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) register char *e; char *parse_start = RExC_parse; /* MJD */ UV n; - bool dont_optimize_invert = FALSE; + bool optimize_invert = TRUE; ret = reganode(pRExC_state, ANYOF, 0); @@ -3312,8 +3313,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) rangebegin = RExC_parse; if (UTF) { value = utf8n_to_uvchr((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, 0); + RExC_end - RExC_parse, + &numlen, 0); RExC_parse += numlen; } else @@ -3423,14 +3424,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) RExC_parse - rangebegin, RExC_parse - rangebegin, rangebegin); - if (lastvalue < 256) { - ANYOF_BITMAP_SET(ret, lastvalue); + if (prevvalue < 256) { + ANYOF_BITMAP_SET(ret, prevvalue); ANYOF_BITMAP_SET(ret, '-'); } else { ANYOF_FLAGS(ret) |= ANYOF_UNICODE; Perl_sv_catpvf(aTHX_ listsv, - "%04"UVxf"\n%04"UVxf"\n", (UV)lastvalue, (UV) '-'); + "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); } } @@ -3438,6 +3439,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } if (!SIZE_ONLY) { + if (namedclass > OOB_NAMEDCLASS) + optimize_invert = FALSE; /* Possible truncation here but in some 64-bit environments * the compiler gets heartburn about switch on 64-bit values. * A similar issue a little earlier when switching on value. @@ -3451,7 +3454,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; case ANYOF_NALNUM: @@ -3462,7 +3464,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; case ANYOF_ALNUMC: @@ -3473,7 +3474,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; case ANYOF_NALNUMC: @@ -3484,7 +3484,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; case ANYOF_ALPHA: @@ -3495,7 +3494,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; case ANYOF_NALPHA: @@ -3506,7 +3504,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; case ANYOF_ASCII: @@ -3518,18 +3515,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, value); #else /* EBCDIC */ for (value = 0; value < 256; value++) { - if (PL_hints & HINT_RE_ASCIIR) { - if (NATIVE_TO_ASCII(value) < 128) - ANYOF_BITMAP_SET(ret, value); - } - else { - if (isASCII(value)) - ANYOF_BITMAP_SET(ret, value); - } + if (isASCII(value)) + ANYOF_BITMAP_SET(ret, value); } #endif /* EBCDIC */ } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; case ANYOF_NASCII: @@ -3541,18 +3531,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, value); #else /* EBCDIC */ for (value = 0; value < 256; value++) { - if (PL_hints & HINT_RE_ASCIIR) { - if (NATIVE_TO_ASCII(value) >= 128) - ANYOF_BITMAP_SET(ret, value); - } - else { - if (!isASCII(value)) - ANYOF_BITMAP_SET(ret, value); - } + if (!isASCII(value)) + ANYOF_BITMAP_SET(ret, value); } #endif /* EBCDIC */ } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; case ANYOF_BLANK: @@ -3563,7 +3546,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; case ANYOF_NBLANK: @@ -3574,7 +3556,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; case ANYOF_CNTRL: @@ -3585,7 +3566,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; case ANYOF_NCNTRL: @@ -3596,7 +3576,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; case ANYOF_DIGIT: @@ -3607,7 +3586,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = '0'; value <= '9'; value++) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; case ANYOF_NDIGIT: @@ -3620,7 +3598,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = '9' + 1; value < 256; value++) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; case ANYOF_GRAPH: @@ -3631,7 +3608,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; case ANYOF_NGRAPH: @@ -3642,7 +3618,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; case ANYOF_LOWER: @@ -3653,7 +3628,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; case ANYOF_NLOWER: @@ -3664,7 +3638,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; case ANYOF_PRINT: @@ -3675,7 +3648,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; case ANYOF_NPRINT: @@ -3686,7 +3658,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; case ANYOF_PSXSPC: @@ -3697,7 +3668,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; case ANYOF_NPSXSPC: @@ -3708,7 +3678,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_PUNCT: @@ -3719,7 +3688,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; case ANYOF_NPUNCT: @@ -3730,7 +3698,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; case ANYOF_SPACE: @@ -3741,7 +3708,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isSPACE(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n"); break; case ANYOF_NSPACE: @@ -3752,7 +3718,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isSPACE(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n"); break; case ANYOF_UPPER: @@ -3763,7 +3728,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; case ANYOF_NUPPER: @@ -3774,7 +3738,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; case ANYOF_XDIGIT: @@ -3785,7 +3748,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; case ANYOF_NXDIGIT: @@ -3796,7 +3758,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } - dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; default: @@ -3810,17 +3771,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } /* end of namedclass \blah */ if (range) { - if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) || - ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ { + if (prevvalue > value) /* b-a */ { Simple_vFAIL4("Invalid [] range \"%*.*s\"", RExC_parse - rangebegin, RExC_parse - rangebegin, rangebegin); + range = 0; /* not a valid range */ } - range = 0; /* not a true range */ } else { - lastvalue = value; /* save the beginning of the range */ + prevvalue = value; /* save the beginning of the range */ if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && RExC_parse[1] != ']') { RExC_parse++; @@ -3843,42 +3803,36 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) /* now is the next time */ if (!SIZE_ONLY) { - if (lastvalue < 256 && value < 256) { -#ifdef EBCDIC /* EBCDIC, for example. */ - if (PL_hints & HINT_RE_ASCIIR) { - IV i; - /* New style scheme for ranges: - * after : - * use re 'asciir'; - * do ranges in ASCII/Unicode space - */ - for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++) - ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i)); - } - else if ((isLOWER(lastvalue) && isLOWER(value)) || - (isUPPER(lastvalue) && isUPPER(value))) + IV i; + + if (prevvalue < 256) { + IV ceilvalue = value < 256 ? value : 255; + +#ifdef EBCDIC + if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || + (isUPPER(prevvalue) && isUPPER(ceilvalue))) { - IV i; - if (isLOWER(lastvalue)) { - for (i = lastvalue; i <= value; i++) + if (isLOWER(prevvalue)) { + for (i = prevvalue; i <= ceilvalue; i++) if (isLOWER(i)) ANYOF_BITMAP_SET(ret, i); } else { - for (i = lastvalue; i <= value; i++) + for (i = prevvalue; i <= ceilvalue; i++) if (isUPPER(i)) ANYOF_BITMAP_SET(ret, i); } } else #endif - for ( ; lastvalue <= value; lastvalue++) - ANYOF_BITMAP_SET(ret, lastvalue); - } else { + for (i = prevvalue; i <= ceilvalue; i++) + ANYOF_BITMAP_SET(ret, i); + } + if (value > 255) { ANYOF_FLAGS(ret) |= ANYOF_UNICODE; - if (lastvalue < value) + if (prevvalue < value) Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)lastvalue, (UV)value); - else + (UV)prevvalue, (UV)value); + else if (prevvalue == value) Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", (UV)value); } @@ -3888,6 +3842,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } if (need_class) { + ANYOF_FLAGS(ret) |= ANYOF_LARGE; if (SIZE_ONLY) RExC_size += ANYOF_CLASS_ADD_SKIP; else @@ -3911,7 +3866,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } /* optimize inverted simple patterns (e.g. [^a-z]) */ - if (!SIZE_ONLY && !dont_optimize_invert && + if (!SIZE_ONLY && optimize_invert && /* If the only flag is inversion. */ (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) @@ -4218,7 +4173,10 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1); } else if (op == ANYOF) { - node = next; + /* arglen 1 + class block */ + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE) + ? ANYOF_CLASS_SKIP : ANYOF_SKIP); + node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { /* Literal string, where present. */ @@ -4308,11 +4266,11 @@ Perl_regdump(pTHX_ regexp *r) if (r->offsets) { U32 i; U32 len = r->offsets[0]; - PerlIO_printf(Perl_debug_log, "Offsets: [%u]\n\t", r->offsets[0]); + PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]); for (i = 1; i <= len; i++) - PerlIO_printf(Perl_debug_log, "%u[%u] ", - r->offsets[i*2-1], - r->offsets[i*2]); + PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", + (UV)r->offsets[i*2-1], + (UV)r->offsets[i*2]); PerlIO_printf(Perl_debug_log, "\n"); } #endif /* DEBUGGING */ @@ -4444,7 +4402,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) for (i = 0; i <= 256; i++) { /* just the first 256 */ U8 *e = uvchr_to_utf8(s, i); - if (i < 256 && swash_fetch(sw, s)) { + if (i < 256 && swash_fetch(sw, s, TRUE)) { if (rangestart == -1) rangestart = i; } else if (rangestart != -1) { @@ -4679,7 +4637,6 @@ Perl_save_re_context(pTHX) SAVEVPTR(PL_regendp); /* Ditto for endp. */ SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */ SAVEPPTR(PL_regtill); /* How far we are required to go. */ - SAVEI8(PL_regprev); /* char before regbol, \n if none */ SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */ PL_reg_start_tmp = 0; SAVEI32(PL_reg_start_tmpl); /* from regexec.c */