X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=7c8c3911db8c9da4638f673636089547a401d694;hb=2da7a6b5eab232c468b68576568f84877f032fd6;hp=d7b99814d62eab247ae6f519f41ee75a5802cccb;hpb=0ac6acaed7c2092a5668c6b70ddeaf3003e989d8;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index d7b9981..7c8c391 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1163,11 +1163,19 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR \ STMT_START { \ - SV *tmp = newSVpvs(""); \ - if (UTF) SvUTF8_on(tmp); \ - Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \ - av_push( revcharmap, tmp ); \ - } STMT_END + if (UTF) { \ + SV *zlopp = newSV(2); \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)uvc; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END #define TRIE_READ_CHAR STMT_START { \ wordlen++; \ @@ -1405,7 +1413,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* store the codepoint in the bitmap, and if its ascii also store its folded equivelent. */ TRIE_BITMAP_SET(trie,uvc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + codepoints in the 127 < uvc < 256 range */ + if (127 < uvc && uvc < 192) { + TRIE_BITMAP_SET(trie,194); + } else if (191 < uvc ) { + TRIE_BITMAP_SET(trie,195); + /* && uvc < 256 -- we know uvc is < 256 already */ + } + } set_bit = 0; /* We've done our bit :-) */ } } else { @@ -3297,7 +3318,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str); + mg->mg_len += CHR_SVLEN(last_str) - l; } data->last_end += l * (mincount - 1); } @@ -4243,21 +4264,21 @@ redo_first_pass: r->prelen = plen; r->extflags = pm_flags; { - bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - r->wraplen = r->prelen + has_minus + has_k + has_runon + r->wraplen = r->prelen + has_minus + has_p + has_runon + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); Newx(r->wrapped, r->wraplen + 1, char ); p = r->wrapped; *p++='('; *p++='?'; - if (has_k) - *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ { char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; char *colon = r + 1; @@ -4362,7 +4383,7 @@ reStudy: #endif /* Dig out information for optimizations. */ - r->extflags = pm_flags; /* Again? */ + r->extflags = RExC_flags; /* was pm_op */ /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ if (UTF) @@ -4752,6 +4773,9 @@ reStudy: else r->paren_names = NULL; +#ifdef STUPID_PATTERN_CHECKS + if (r->prelen == 0) + r->extflags |= RXf_NULL; if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') /* XXX: this should happen BEFORE we compile */ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); @@ -4759,7 +4783,23 @@ reStudy: r->extflags |= RXf_WHITE; else if (r->prelen == 1 && r->precomp[0] == '^') r->extflags |= RXf_START_ONLY; - +#else + if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else { + regnode *first = ri->program + 1; + U8 fop = OP(first); + U8 nop = OP(NEXTOPER(first)); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) + r->extflags |= RXf_WHITE; + } +#endif #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, 1, "p" ); @@ -4807,18 +4847,18 @@ Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, { PERL_UNUSED_ARG(value); - if (flags & RXf_HASH_FETCH) { + if (flags & RXapif_FETCH) { return reg_named_buff_fetch(rx, key, flags); - } else if (flags & (RXf_HASH_STORE | RXf_HASH_DELETE | RXf_HASH_CLEAR)) { + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { Perl_croak(aTHX_ PL_no_modify); return NULL; - } else if (flags & RXf_HASH_EXISTS) { + } else if (flags & RXapif_EXISTS) { return reg_named_buff_exists(rx, key, flags) ? &PL_sv_yes : &PL_sv_no; - } else if (flags & RXf_HASH_REGNAMES) { + } else if (flags & RXapif_REGNAMES) { return reg_named_buff_all(rx, flags); - } else if (flags & (RXf_HASH_SCALAR | RXf_HASH_REGNAMES_COUNT)) { + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { return reg_named_buff_scalar(rx, flags); } else { Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); @@ -4832,9 +4872,9 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, { PERL_UNUSED_ARG(lastkey); - if (flags & RXf_HASH_FIRSTKEY) + if (flags & RXapif_FIRSTKEY) return reg_named_buff_firstkey(rx, flags); - else if (flags & RXf_HASH_NEXTKEY) + else if (flags & RXapif_NEXTKEY) return reg_named_buff_nextkey(rx, flags); else { Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); @@ -4847,7 +4887,7 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 { AV *retarray = NULL; SV *ret; - if (flags & RXf_HASH_ALL) + if (flags & RXapif_ALL) retarray=newAV(); if (rx && rx->paren_names) { @@ -4885,7 +4925,7 @@ Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, const U32 flags) { if (rx && rx->paren_names) { - if (flags & RXf_HASH_ALL) { + if (flags & RXapif_ALL) { return hv_exists_ent(rx->paren_names, key, 0); } else { SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags); @@ -4904,9 +4944,13 @@ Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, SV* Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags) { - (void)hv_iterinit(rx->paren_names); + if ( rx && rx->paren_names ) { + (void)hv_iterinit(rx->paren_names); - return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXf_HASH_FIRSTKEY); + return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } } SV* @@ -4929,7 +4973,7 @@ Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) break; } } - if (parno || flags & RXf_HASH_ALL) { + if (parno || flags & RXapif_ALL) { STRLEN len; char *pv = HePV(temphe, len); return newSVpvn(pv,len); @@ -4947,10 +4991,10 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) I32 length; if (rx && rx->paren_names) { - if (flags & (RXf_HASH_ALL | RXf_HASH_REGNAMES_COUNT)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { return newSViv(HvTOTALKEYS(rx->paren_names)); - } else if (flags & RXf_HASH_ONE) { - ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXf_HASH_REGNAMES)); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES)); av = (AV*)SvRV(ret); length = av_len(av); return newSViv(length + 1); @@ -4985,7 +5029,7 @@ Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) break; } } - if (parno || flags & RXf_HASH_ALL) { + if (parno || flags & RXapif_ALL) { STRLEN len; char *pv = HePV(temphe, len); av_push(av, newSVpvn(pv,len)); @@ -5008,13 +5052,13 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * cons return; } else - if (paren == RXf_PREMATCH && rx->offs[0].start != -1) { + if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { /* $` */ i = rx->offs[0].start; s = rx->subbeg; } else - if (paren == RXf_POSTMATCH && rx->offs[0].end != -1) { + if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { /* $' */ s = rx->subbeg + rx->offs[0].end; i = rx->sublen - rx->offs[0].end; @@ -5093,7 +5137,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, /* Some of this code was originally in C in F */ switch (paren) { /* $` / ${^PREMATCH} */ - case RXf_PREMATCH: + case RX_BUFF_IDX_PREMATCH: if (rx->offs[0].start != -1) { i = rx->offs[0].start; if (i > 0) { @@ -5104,7 +5148,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, } return 0; /* $' / ${^POSTMATCH} */ - case RXf_POSTMATCH: + case RX_BUFF_IDX_POSTMATCH: if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; if (i > 0) { @@ -5276,7 +5320,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) register regnode *ender = NULL; register I32 parno = 0; I32 flags; - const I32 oregflags = RExC_flags; + U32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; I32 freeze_paren = 0; @@ -5875,8 +5919,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) and must be globally applied -- japhy */ switch (*RExC_parse) { CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); - case 'o': - case 'g': + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5893,7 +5937,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; - case 'c': + case CONTINUE_PAT_MOD: /* 'c' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5906,10 +5950,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } break; - case 'k': + case KEEPCOPY_PAT_MOD: /* 'p' */ if (flagsp == &negflags) { if (SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse + 1,"Useless use of (?-k)"); + vWARN(RExC_parse + 1,"Useless use of (?-p)"); } else { *flagsp |= RXf_PMf_KEEPCOPY; } @@ -5929,6 +5973,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case ')': RExC_flags |= posflags; RExC_flags &= ~negflags; + if (paren != ':') { + oregflags |= posflags; + oregflags &= ~negflags; + } nextchar(pRExC_state); if (paren != ':') { *flagp = TRYAGAIN; @@ -6448,6 +6496,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); UV cp; + char string; len = (STRLEN)(endbrace - name - 2); cp = grok_hex(name + 2, &len, &fl, NULL); if ( len != (STRLEN)(endbrace - name - 2) ) { @@ -6459,7 +6508,8 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) *valuep = cp; return NULL; } - sv_str= Perl_newSVpvf_nocontext("%c",(int)cp); + string = (char)cp; + sv_str= newSVpvn(&string, 1); } else { /* fetch the charnames handler for this scope */ HV * const table = GvHV(PL_hintgv); @@ -6808,6 +6858,7 @@ tryagain: case 0xCE: if (!LOC && FOLD) { U32 len,cp; + len=0; /* silence a spurious compiler warning */ if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ @@ -6847,6 +6898,11 @@ tryagain: RExC_seen_zerolen++; ret = reg_node(pRExC_state, KEEPS); *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_SEEN_LOOKBEHIND; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -7033,6 +7089,8 @@ tryagain: goto parse_named_seq; } } num = atoi(RExC_parse); + if (isg && num == 0) + vFAIL("Reference to invalid group 0"); if (isrel) { num = RExC_npar - num; if (num < 1) @@ -8016,12 +8074,16 @@ parseit: { if (isLOWER(prevvalue)) { for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i)) + if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } else { for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i)) + if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } } else @@ -8618,6 +8680,27 @@ S_regcurly(register const char *s) /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ +#ifdef DEBUGGING +void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { + int bit; + int set=0; + for (bit=0; bit<32; bit++) { + if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); @@ -8699,6 +8783,7 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); @@ -8729,19 +8814,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) k = PL_regkind[OP(o)]; if (k == EXACT) { - SV * const dsv = sv_2mortal(newSVpvs("")); + sv_catpvs(sv, " "); /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) * is a crude hack but it may be the best for now since * we have no flag "this EXACTish node was UTF-8" * --jhi */ - const char * const s = - pv_pretty(dsv, STRING(o), STR_LEN(o), 60, - PL_colors[0], PL_colors[1], - PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_PRETTY_ELIPSES | - PERL_PV_PRETTY_LTGT - ); - Perl_sv_catpvf(aTHX_ sv, " %s", s ); + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); } else if (k == TRIE) { /* print the details of the trie in dumpuntil instead, as * progi->data isn't available here */ @@ -8770,7 +8853,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) int i; int rangestart = -1; U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); - Perl_sv_catpvf(aTHX_ sv, "["); + sv_catpvs(sv, "["); for (i = 0; i <= 256; i++) { if (i < 256 && BITMAP_TEST(bitmap,i)) { if (rangestart == -1) @@ -8787,7 +8870,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) rangestart = -1; } } - Perl_sv_catpvf(aTHX_ sv, "]"); + sv_catpvs(sv, "]"); } } else if (k == CURLY) { @@ -8830,7 +8913,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == FOLDCHAR) - Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]",ARG(o) ); + Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -9598,12 +9681,24 @@ clear_re(pTHX_ void *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 255 || !isPRINT(c)) + /* Our definition of isPRINT() ignores locales, so only bytes that are + not part of UTF-8 are considered printable. I assume that the same + holds for UTF-EBCDIC. + Also, code point 255 is not printable in either (it's E0 in EBCDIC, + which Wikipedia says: + + EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all + ones (binary 1111 1111, hexadecimal FF). It is similar, but not + identical, to the ASCII delete (DEL) or rubout control character. + ) So the old condition can be simplified to !isPRINT(c) */ + if (!isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); - else if (c == '-' || c == ']' || c == '\\' || c == '^') - Perl_sv_catpvf(aTHX_ sv, "\\%c", c); - else - Perl_sv_catpvf(aTHX_ sv, "%c", c); + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } } @@ -9706,7 +9801,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, PL_colors[0], PL_colors[1], (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELIPSES | + PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT ) : "???"