X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=6e9c19abf4672bd2d83dd9680a4834ce8635586d;hb=c57cf257e9e5820063963c3520086480a9afb695;hp=5750a0285cfcc4b90897e0c0bd716b09fb4e0381;hpb=2fdbfb4d61a8af78322ced14c20952a7b3b5761a;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 5750a02..6e9c19a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -102,6 +102,7 @@ typedef struct RExC_state_t { U32 flags; /* are we folding, multilining? */ char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ regexp_internal *rxi; /* internal data for regexp object pprivate field */ char *start; /* Start of input for compile */ @@ -149,6 +150,7 @@ typedef struct RExC_state_t { #define RExC_flags (pRExC_state->flags) #define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) #define RExC_rxi (pRExC_state->rxi) #define RExC_start (pRExC_state->start) @@ -389,7 +391,7 @@ static const scan_data_t zero_scan_data = IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ if (len > RegexLengthToShowInErrorMessages) { \ /* chop 10 shorter than the max, to ensure meaning of "..." */ \ len = RegexLengthToShowInErrorMessages - 10; \ @@ -420,7 +422,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL(m) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL(m); \ } STMT_END @@ -438,7 +440,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL2(m,a1) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL2(m, a1); \ } STMT_END @@ -457,7 +459,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL3(m,a1,a2) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL3(m, a1, a2); \ } STMT_END @@ -1163,11 +1165,19 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR \ STMT_START { \ - SV *tmp = newSVpvs(""); \ - if (UTF) SvUTF8_on(tmp); \ - Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \ - av_push( revcharmap, tmp ); \ - } STMT_END + if (UTF) { \ + SV *zlopp = newSV(2); \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)uvc; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END #define TRIE_READ_CHAR STMT_START { \ wordlen++; \ @@ -1222,10 +1232,9 @@ is the recommended Unicode-aware way of saying /* store the word for dumping */ \ SV* tmp; \ if (OP(noper) != NOTHING) \ - tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ else \ - tmp = newSVpvn( "", 0 ); \ - if ( UTF ) SvUTF8_on( tmp ); \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ av_push( trie_words, tmp ); \ }); \ \ @@ -1356,7 +1365,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store unicode characters. We use the + (trie->charmap) and we use a an HV* to store Unicode characters. We use the native representation of the character value as the key and IV's for the coded index. @@ -1405,7 +1414,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* store the codepoint in the bitmap, and if its ascii also store its folded equivelent. */ TRIE_BITMAP_SET(trie,uvc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + codepoints in the 127 < uvc < 256 range */ + if (127 < uvc && uvc < 192) { + TRIE_BITMAP_SET(trie,194); + } else if (191 < uvc ) { + TRIE_BITMAP_SET(trie,195); + /* && uvc < 256 -- we know uvc is < 256 already */ + } + } set_bit = 0; /* We've done our bit :-) */ } } else { @@ -1966,7 +1988,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( count == 1 ) { SV **tmp = av_fetch( revcharmap, idx, 0); - char *ch = SvPV_nolen( *tmp ); + STRLEN len; + char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); PerlIO_printf( Perl_debug_log, @@ -1985,11 +2008,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs str=STRING(convert); STR_LEN(convert)=0; } - while (*ch) { + STR_LEN(convert) += len; + while (len--) *str++ = *ch++; - STR_LEN(convert)++; - } - } else { #ifdef DEBUGGING if (state>1) @@ -2004,24 +2025,35 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->startstate = state; trie->minlen -= (state - 1); trie->maxlen -= (state - 1); - DEBUG_r({ - regnode *fix = convert; - U32 word = trie->wordcount; - mjd_nodelen++; - Set_Node_Offset_Length(convert, mjd_offset, state - 1); - while( ++fix < n ) { - Set_Node_Offset_Length(fix, 0, 0); - } - while (word--) { - SV ** const tmp = av_fetch( trie_words, word, 0 ); - if (tmp) { - if ( STR_LEN(convert) <= SvCUR(*tmp) ) - sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); - else - sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); - } - } - }); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif if (trie->maxlen) { convert = n; } else { @@ -2760,7 +2792,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, last = cur; } } else { - if ( last ) { +/* + Currently we assume that the trie can handle unicode and ascii + matches fold cased matches. If this proves true then the following + define will prevent tries in this situation. + + #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) +*/ +#define TRIE_TYPE_IS_SAFE 1 + if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, startbranch, first, cur, tail, count, optype, depth+1 ); @@ -2788,7 +2828,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last ) { + + if ( last && TRIE_TYPE_IS_SAFE ) { made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && @@ -3278,9 +3319,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l -= old; /* Get the added string: */ - last_str = newSVpvn(s + old, l); - if (UTF) - SvUTF8_on(last_str); + last_str = newSVpvn_utf8(s + old, l, UTF); if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { @@ -3298,7 +3337,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str); + mg->mg_len += CHR_SVLEN(last_str) - l; } data->last_end += l * (mincount - 1); } @@ -4112,16 +4151,16 @@ Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) #endif REGEXP * -Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) +Perl_re_compile(pTHX_ const SV * const pattern, U32 pm_flags) { dVAR; - register REGEXP *r; + REGEXP *rx; + struct regexp *r; register regexp_internal *ri; STRLEN plen; char* exp = SvPV((SV*)pattern, plen); char* xend = exp + plen; regnode *scan; - regnode *first; I32 flags; I32 minlen = 0; I32 sawplus = 0; @@ -4136,7 +4175,7 @@ Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) GET_RE_DEBUG_FLAGS_DECL; DEBUG_r(if (!PL_colorset) reginitcolors()); - RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; + RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); @@ -4187,7 +4226,7 @@ redo_first_pass: return(NULL); } if (RExC_utf8 && !RExC_orig_utf8) { - /* It's possible to write a regexp in ascii that represents unicode + /* It's possible to write a regexp in ascii that represents Unicode codepoints outside of the byte range, such as via \x{100}. If we detect such a sequence we have to convert the entire pattern to utf8 and then recompile, as our sizing calculation will have been based @@ -4225,7 +4264,8 @@ redo_first_pass: /* Allocate space and zero-initialize. Note, the two step process of zeroing when in debug mode, thus anything assigned has to happen after that */ - Newxz(r, 1, regexp); + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = (struct regexp*)SvANY(rx); Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char, regexp_internal); if ( r == NULL || ri == NULL ) @@ -4241,25 +4281,26 @@ redo_first_pass: /* non-zero initialization begins here */ RXi_SET( r, ri ); r->engine= RE_ENGINE_PTR; - r->refcnt = 1; - r->prelen = plen; r->extflags = pm_flags; { - bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); - U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - r->wraplen = r->prelen + has_minus + has_k + has_runon + const STRLEN wraplen = plen + has_minus + has_p + has_runon + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - Newx(r->wrapped, r->wraplen + 1, char ); - p = r->wrapped; + p = sv_grow((SV *)rx, wraplen + 1); + SvCUR_set(rx, wraplen); + SvPOK_on(rx); + SvFLAGS(rx) |= SvUTF8(pattern); *p++='('; *p++='?'; - if (has_k) - *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ { char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; char *colon = r + 1; @@ -4279,9 +4320,10 @@ redo_first_pass: } *p++ = ':'; - Copy(RExC_precomp, p, r->prelen, char); - r->precomp = p; - p += r->prelen; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; if (has_runon) *p++ = '\n'; *p++ = ')'; @@ -4307,6 +4349,7 @@ redo_first_pass: (UV)((2*RExC_size+1) * sizeof(U32)))); #endif SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; RExC_rx = r; RExC_rxi = ri; @@ -4323,9 +4366,10 @@ redo_first_pass: /* Store the count of eval-groups for security checks: */ RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); - if (reg(pRExC_state, 0, &flags,1) == NULL) + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(rx); return(NULL); - + } /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -4363,11 +4407,11 @@ reStudy: #endif /* Dig out information for optimizations. */ - r->extflags = pm_flags; /* Again? */ + r->extflags = RExC_flags; /* was pm_op */ /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ if (UTF) - r->extflags |= RXf_UTF8; /* Unicode in it? */ + SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; if (RExC_naughty >= 10) /* Probably an expensive pattern. */ r->intflags |= PREGf_NAUGHTY; @@ -4381,18 +4425,20 @@ reStudy: struct regnode_charclass_class ch_class; /* pointed to by data */ int stclass_flag; I32 last_close = 0; /* pointed to by data */ - - first = scan; + regnode *first= scan; + regnode *first_next= regnext(first); + /* Skip introductions and multiplicators >= 1. */ while ((OP(first) == OPEN && (sawopen = 1)) || /* An OR of *one* alternative - should not happen now. */ - (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || + (OP(first) == BRANCH && OP(first_next) != BRANCH) || /* for now we can't handle lookbehind IFMATCH*/ (OP(first) == IFMATCH && !first->flags) || (OP(first) == PLUS) || (OP(first) == MINMOD) || /* An {n,m} with n>0 */ - (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { if (OP(first) == PLUS) @@ -4404,6 +4450,7 @@ reStudy: first += EXTRA_STEP_2ARGS; } else /* XXX possible optimisation for /(?=)/ */ first = NEXTOPER(first); + first_next= regnext(first); } /* Starting-point info. */ @@ -4746,14 +4793,37 @@ reStudy: if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; if (RExC_paren_names) - r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); + RXp_PAREN_NAMES(r) = (HV*)SvREFCNT_inc(RExC_paren_names); else - r->paren_names = NULL; - if (r->prelen == 3 && strnEQ("\\s+", r->precomp, 3)) /* precomp = "\\s+)" */ - r->extflags |= RXf_WHITE; - else if (r->prelen == 1 && r->precomp[0] == '^') + RXp_PAREN_NAMES(r) = NULL; + +#ifdef STUPID_PATTERN_CHECKS + if (RX_PRELEN(rx) == 0) + r->extflags |= RXf_NULL; + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) + r->extflags |= RXf_WHITE; + else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') r->extflags |= RXf_START_ONLY; - +#else + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else { + regnode *first = ri->program + 1; + U8 fop = OP(first); + U8 nop = OP(NEXTOPER(first)); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) + r->extflags |= RXf_WHITE; + } +#endif #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, 1, "p" ); @@ -4789,33 +4859,76 @@ reStudy: PerlIO_printf(Perl_debug_log, "\n"); }); #endif - return(r); + return rx; } #undef RE_ENGINE_PTR SV* -Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags) +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak(aTHX_ PL_no_modify); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) { AV *retarray = NULL; SV *ret; - if (flags & 1) + struct regexp *const rx = (struct regexp *)SvANY(r); + if (flags & RXapif_ALL) retarray=newAV(); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); if (he_str) { IV i; SV* sv_dat=HeVAL(he_str); I32 *nums=(I32*)SvPVX(sv_dat); for ( i=0; inparens) >= nums[i] - && rx->offs[nums[i]].start != -1 - && rx->offs[nums[i]].end != -1) + if ((I32)(rx->nparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) { ret = newSVpvs(""); - CALLREG_NUMBUF_FETCH(rx,nums[i],ret); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); if (!retarray) return ret; } else { @@ -4827,15 +4940,138 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 } } if (retarray) - return (SV*)retarray; + return newRV((SV*)retarray); } } return NULL; } +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); + if (sv) { + SvREFCNT_dec(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } + } + } + return NULL; +} + +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + I32 length; + struct regexp *const rx = (struct regexp *)SvANY(r); + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = (AV*)SvRV(ret); + length = av_len(av); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + AV *av = newAV(); + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV((SV*)av); +} + void -Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv) +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) { + struct regexp *const rx = (struct regexp *)SvANY(r); char *s = NULL; I32 i = 0; I32 s1, t1; @@ -4845,13 +5081,13 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * cons return; } else - if (paren == -2 && rx->offs[0].start != -1) { + if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { /* $` */ i = rx->offs[0].start; s = rx->subbeg; } else - if (paren == -1 && rx->offs[0].end != -1) { + if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { /* $' */ s = rx->subbeg + rx->offs[0].end; i = rx->sublen - rx->offs[0].end; @@ -4875,16 +5111,16 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * cons sv_setpvn(sv, s, i); PL_tainted = oldtainted; if ( (rx->extflags & RXf_CANY_SEEN) - ? (RX_MATCH_UTF8(rx) + ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) - : (RX_MATCH_UTF8(rx)) ) + : (RXp_MATCH_UTF8(rx)) ) { SvUTF8_on(sv); } else SvUTF8_off(sv); if (PL_tainting) { - if (RX_MATCH_TAINTED(rx)) { + if (RXp_MATCH_TAINTED(rx)) { if (SvTYPE(sv) >= SVt_PVMG) { MAGIC* const mg = SvMAGIC(sv); MAGIC* mgt; @@ -4921,15 +5157,17 @@ Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, } I32 -Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, const I32 paren) { + struct regexp *const rx = (struct regexp *)SvANY(r); I32 i; I32 s1, t1; /* Some of this code was originally in C in F */ switch (paren) { - case -2: /* $` */ + /* $` / ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: if (rx->offs[0].start != -1) { i = rx->offs[0].start; if (i > 0) { @@ -4939,7 +5177,8 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, } } return 0; - case -1: /* $' */ + /* $' / ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; if (i > 0) { @@ -4949,7 +5188,8 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, } } return 0; - default: /* $&, $1, $2, ... */ + /* $& / ${^MATCH}, $1, $2, ... */ + default: if (paren <= (I32)rx->nparens && (s1 = rx->offs[paren].start) != -1 && (t1 = rx->offs[paren].end) != -1) @@ -4963,7 +5203,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, } } getlen: - if (i > 0 && RX_MATCH_UTF8(rx)) { + if (i > 0 && RXp_MATCH_UTF8(rx)) { const char * const s = rx->subbeg + s1; const U8 *ep; STRLEN el; @@ -4979,7 +5219,7 @@ SV* Perl_reg_qr_package(pTHX_ REGEXP * const rx) { PERL_UNUSED_ARG(rx); - return newSVpvs("Regexp"); + return NULL; } /* Scans the name of a named buffer from the pattern. @@ -5011,10 +5251,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { } if ( flags ) { - SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, - (int)(RExC_parse - name_start))); - if (UTF) - SvUTF8_on(sv_name); + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); if ( flags == REG_RSN_RETURN_NAME) return sv_name; else if (flags==REG_RSN_RETURN_DATA) { @@ -5110,7 +5349,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) register regnode *ender = NULL; register I32 parno = 0; I32 flags; - const I32 oregflags = RExC_flags; + U32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; I32 freeze_paren = 0; @@ -5709,8 +5948,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) and must be globally applied -- japhy */ switch (*RExC_parse) { CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); - case 'o': - case 'g': + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5727,7 +5966,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; - case 'c': + case CONTINUE_PAT_MOD: /* 'c' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5740,10 +5979,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } break; - case 'k': + case KEEPCOPY_PAT_MOD: /* 'p' */ if (flagsp == &negflags) { if (SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse + 1,"Useless use of (?-k)"); + vWARN(RExC_parse + 1,"Useless use of (?-p)"); } else { *flagsp |= RXf_PMf_KEEPCOPY; } @@ -5763,6 +6002,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case ')': RExC_flags |= posflags; RExC_flags &= ~negflags; + if (paren != ':') { + oregflags |= posflags; + oregflags &= ~negflags; + } nextchar(pRExC_state); if (paren != ':') { *flagp = TRYAGAIN; @@ -6277,11 +6520,12 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) /* RExC_parse points at the beginning brace, endbrace points at the last */ if ( name[0]=='U' && name[1]=='+' ) { - /* its a "unicode hex" notation {U+89AB} */ + /* its a "Unicode hex" notation {U+89AB} */ I32 fl = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); UV cp; + char string; len = (STRLEN)(endbrace - name - 2); cp = grok_hex(name + 2, &len, &fl, NULL); if ( len != (STRLEN)(endbrace - name - 2) ) { @@ -6293,7 +6537,8 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) *valuep = cp; return NULL; } - sv_str= Perl_newSVpvf_nocontext("%c",(int)cp); + string = (char)cp; + sv_str= newSVpvn(&string, 1); } else { /* fetch the charnames handler for this scope */ HV * const table = GvHV(PL_hintgv); @@ -6500,7 +6745,7 @@ STATIC UV S_reg_recode(pTHX_ const char value, SV **encp) { STRLEN numlen = 1; - SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; @@ -6640,8 +6885,10 @@ tryagain: case 0xDF: case 0xC3: case 0xCE: + do_foldchar: if (!LOC && FOLD) { U32 len,cp; + len=0; /* silence a spurious compiler warning */ if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ @@ -6665,7 +6912,11 @@ tryagain: required, as the default for this switch is to jump to the literal text handling code. */ - switch (*++RExC_parse) { + switch ((U8)*++RExC_parse) { + case 0xDF: + case 0xC3: + case 0xCE: + goto do_foldchar; /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -6681,6 +6932,11 @@ tryagain: RExC_seen_zerolen++; ret = reg_node(pRExC_state, KEEPS); *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_SEEN_LOOKBEHIND; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -6867,6 +7123,8 @@ tryagain: goto parse_named_seq; } } num = atoi(RExC_parse); + if (isg && num == 0) + vFAIL("Reference to invalid group 0"); if (isrel) { num = RExC_npar - num; if (num < 1) @@ -6976,8 +7234,13 @@ tryagain: an unescaped equivalent literal. */ - switch (*++p) { + switch ((U8)*++p) { /* These are all the special escapes. */ + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case 'A': /* Start assertion */ case 'b': case 'B': /* Word-boundary assertion*/ case 'C': /* Single char !DANGEROUS! */ @@ -6986,7 +7249,7 @@ tryagain: case 'h': case 'H': /* HORIZWS */ case 'k': case 'K': /* named backref, keep marker */ case 'N': /* named char sequence */ - case 'p': case 'P': /* unicode property */ + case 'p': case 'P': /* Unicode property */ case 'R': /* LNBREAK */ case 's': case 'S': /* space class */ case 'v': case 'V': /* VERTWS */ @@ -7850,12 +8113,16 @@ parseit: { if (isLOWER(prevvalue)) { for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i)) + if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } else { for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i)) + if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } } else @@ -7920,8 +8187,8 @@ parseit: if (!unicode_alternate) unicode_alternate = newAV(); - sv = newSVpvn((char*)foldbuf, foldlen); - SvUTF8_on(sv); + sv = newSVpvn_utf8((char*)foldbuf, foldlen, + TRUE); av_push(unicode_alternate, sv); } } @@ -8452,6 +8719,27 @@ S_regcurly(register const char *s) /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ +#ifdef DEBUGGING +void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { + int bit; + int set=0; + for (bit=0; bit<32; bit++) { + if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); @@ -8533,6 +8822,7 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); @@ -8563,19 +8853,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) k = PL_regkind[OP(o)]; if (k == EXACT) { - SV * const dsv = sv_2mortal(newSVpvs("")); + sv_catpvs(sv, " "); /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) * is a crude hack but it may be the best for now since * we have no flag "this EXACTish node was UTF-8" * --jhi */ - const char * const s = - pv_pretty(dsv, STRING(o), STR_LEN(o), 60, - PL_colors[0], PL_colors[1], - PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_PRETTY_ELIPSES | - PERL_PV_PRETTY_LTGT - ); - Perl_sv_catpvf(aTHX_ sv, " %s", s ); + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); } else if (k == TRIE) { /* print the details of the trie in dumpuntil instead, as * progi->data isn't available here */ @@ -8604,7 +8892,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) int i; int rangestart = -1; U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); - Perl_sv_catpvf(aTHX_ sv, "["); + sv_catpvs(sv, "["); for (i = 0; i <= 256; i++) { if (i < 256 && BITMAP_TEST(bitmap,i)) { if (rangestart == -1) @@ -8621,7 +8909,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) rangestart = -1; } } - Perl_sv_catpvf(aTHX_ sv, "]"); + sv_catpvs(sv, "]"); } } else if (k == CURLY) { @@ -8633,7 +8921,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ - if ( prog->paren_names ) { + if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || OP(o) < NREF) { AV *list= (AV *)progi->data->data[progi->name_list_idx]; SV **name= av_fetch(list, ARG(o), 0 ); @@ -8664,7 +8952,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == FOLDCHAR) - Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]",ARG(o) ); + Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -8815,9 +9103,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } SV * -Perl_re_intuit_string(pTHX_ REGEXP * const prog) +Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ dVAR; + struct regexp *const prog = (struct regexp *)SvANY(r); GET_RE_DEBUG_FLAGS_DECL; PERL_UNUSED_CONTEXT; @@ -8854,20 +9143,24 @@ Perl_re_intuit_string(pTHX_ REGEXP * const prog) */ #ifndef PERL_IN_XSUB_RE void -Perl_pregfree(pTHX_ struct regexp *r) +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); GET_RE_DEBUG_FLAGS_DECL; - if (!r || (--r->refcnt > 0)) - return; if (r->mother_re) { ReREFCNT_dec(r->mother_re); } else { - CALLREGFREE_PVT(r); /* free the private data */ - if (r->paren_names) - SvREFCNT_dec(r->paren_names); - Safefree(r->wrapped); + CALLREGFREE_PVT(rx); /* free the private data */ + if (RXp_PAREN_NAMES(r)) + SvREFCNT_dec(RXp_PAREN_NAMES(r)); } if (r->substrs) { if (r->anchored_substr) @@ -8880,14 +9173,13 @@ Perl_pregfree(pTHX_ struct regexp *r) SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } - RX_MATCH_COPY_FREE(r); + RX_MATCH_COPY_FREE(rx); #ifdef PERL_OLD_COPY_ON_WRITE if (r->saved_copy) SvREFCNT_dec(r->saved_copy); #endif Safefree(r->swap); Safefree(r->offs); - Safefree(r); } /* reg_temp_copy() @@ -8907,16 +9199,23 @@ Perl_pregfree(pTHX_ struct regexp *r) */ -regexp * -Perl_reg_temp_copy (pTHX_ struct regexp *r) { - regexp *ret; +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *rx) { + REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + struct regexp *ret = (struct regexp *)SvANY(ret_x); + struct regexp *const r = (struct regexp *)SvANY(rx); register const I32 npar = r->nparens+1; - (void)ReREFCNT_inc(r); - Newx(ret, 1, regexp); - StructCopy(r, ret, regexp); + (void)ReREFCNT_inc(rx); + /* We can take advantage of the existing "copied buffer" mechanism in SVs + by pointing directly at the buffer, but flagging that the allocated + space in the copy is zero. As we've just done a struct copy, it's now + a case of zero-ing that, rather than copying the current length. */ + SvPV_set(ret_x, RX_WRAPPED(rx)); + SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); + StructCopy(&(r->xpv_cur), &(ret->xpv_cur), struct regexp_allocated); + SvLEN_set(ret_x, 0); Newx(ret->offs, npar, regexp_paren_pair); Copy(r->offs, ret->offs, npar, regexp_paren_pair); - ret->refcnt = 1; if (r->substrs) { Newx(ret->substrs, 1, struct reg_substr_data); StructCopy(r->substrs, ret->substrs, struct reg_substr_data); @@ -8929,14 +9228,14 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { /* check_substr and check_utf8, if non-NULL, point to either their anchored or float namesakes, and don't hold a second reference. */ } - RX_MATCH_COPIED_off(ret); + RX_MATCH_COPIED_off(ret_x); #ifdef PERL_OLD_COPY_ON_WRITE ret->saved_copy = NULL; #endif - ret->mother_re = r; + ret->mother_re = rx; ret->swap = NULL; - return ret; + return ret_x; } #endif @@ -8953,9 +9252,10 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { */ void -Perl_regfree_internal(pTHX_ struct regexp *r) +Perl_regfree_internal(pTHX_ REGEXP * const rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; @@ -8964,8 +9264,8 @@ Perl_regfree_internal(pTHX_ struct regexp *r) reginitcolors(); { SV *dsv= sv_newmortal(); - RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), - dsv, r->precomp, r->prelen, 60); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } @@ -9086,23 +9386,15 @@ Perl_regfree_internal(pTHX_ struct regexp *r) */ #if defined(USE_ITHREADS) #ifndef PERL_IN_XSUB_RE -regexp * -Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) { dVAR; - regexp *ret; I32 npar; - - if (!r) - return (REGEXP *)NULL; - - if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r))) - return ret; - + const struct regexp *r = (const struct regexp *)SvANY(sstr); + struct regexp *ret = (struct regexp *)SvANY(dstr); npar = r->nparens+1; - Newx(ret, 1, regexp); - StructCopy(r, ret, regexp); Newx(ret->offs, npar, regexp_paren_pair); Copy(r->offs, ret->offs, npar, regexp_paren_pair); if(ret->swap) { @@ -9140,14 +9432,12 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) } } - ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1); - ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped); - ret->paren_names = hv_dup_inc(ret->paren_names, param); + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); if (ret->pprivate) - RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); - if (RX_MATCH_COPIED(ret)) + if (RX_MATCH_COPIED(dstr)) ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; @@ -9158,9 +9448,6 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) ret->mother_re = NULL; ret->gofs = 0; ret->seen_evals = 0; - - ptr_table_store(PL_ptr_table, r, ret); - return ret; } #endif /* PERL_IN_XSUB_RE */ @@ -9179,9 +9466,10 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) */ void * -Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); regexp_internal *reti; int len, npar; RXi_GET_DECL(r,ri); @@ -9303,14 +9591,14 @@ Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { dVAR; - const regexp * const re = (regexp *)mg->mg_obj; + const REGEXP * const re = (REGEXP *)mg->mg_obj; if (haseval) - *haseval = re->seen_evals; + *haseval = RX_SEEN_EVALS(re); if (flags) - *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); + *flags = RX_UTF8(re) ? 1 : 0; if (lp) - *lp = re->wraplen; - return re->wrapped; + *lp = RX_WRAPLEN(re); + return RX_WRAPPED(re); } /* @@ -9403,7 +9691,7 @@ Perl_save_re_context(pTHX) const REGEXP * const rx = PM_GETRE(PL_curpm); if (rx) { U32 i; - for (i = 1; i <= rx->nparens; i++) { + for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); GV *const *const gvp @@ -9424,7 +9712,7 @@ static void clear_re(pTHX_ void *r) { dVAR; - ReREFCNT_dec((regexp *)r); + ReREFCNT_dec((REGEXP *)r); } #ifdef DEBUGGING @@ -9432,12 +9720,24 @@ clear_re(pTHX_ void *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 255 || !isPRINT(c)) + /* Our definition of isPRINT() ignores locales, so only bytes that are + not part of UTF-8 are considered printable. I assume that the same + holds for UTF-EBCDIC. + Also, code point 255 is not printable in either (it's E0 in EBCDIC, + which Wikipedia says: + + EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all + ones (binary 1111 1111, hexadecimal FF). It is similar, but not + identical, to the ASCII delete (DEL) or rubout control character. + ) So the old condition can be simplified to !isPRINT(c) */ + if (!isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); - else if (c == '-' || c == ']' || c == '\\' || c == '^') - Perl_sv_catpvf(aTHX_ sv, "\\%c", c); - else - Perl_sv_catpvf(aTHX_ sv, "%c", c); + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } } @@ -9540,7 +9840,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, PL_colors[0], PL_colors[1], (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELIPSES | + PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT ) : "???"