X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=05b54ee0124988d47ef1f92a9251ca88b2188f51;hb=c762d24b2c953a47d439fb0fc6f1b9966169fe00;hp=441d762e6e2742456d3736308a387696b2519b5b;hpb=bd94e8870784246bb7caab6a441839b05491de37;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 441d762..05b54ee 100644 --- a/regcomp.c +++ b/regcomp.c @@ -102,6 +102,7 @@ typedef struct RExC_state_t { U32 flags; /* are we folding, multilining? */ char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ regexp_internal *rxi; /* internal data for regexp object pprivate field */ char *start; /* Start of input for compile */ @@ -149,6 +150,7 @@ typedef struct RExC_state_t { #define RExC_flags (pRExC_state->flags) #define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) #define RExC_rxi (pRExC_state->rxi) #define RExC_start (pRExC_state->start) @@ -389,7 +391,7 @@ static const scan_data_t zero_scan_data = IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ if (len > RegexLengthToShowInErrorMessages) { \ /* chop 10 shorter than the max, to ensure meaning of "..." */ \ len = RegexLengthToShowInErrorMessages - 10; \ @@ -420,7 +422,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL(m) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL(m); \ } STMT_END @@ -438,7 +440,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL2(m,a1) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL2(m, a1); \ } STMT_END @@ -457,7 +459,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL3(m,a1,a2) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL3(m, a1, a2); \ } STMT_END @@ -630,6 +632,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min const STRLEN old_l = CHR_SVLEN(*data->longest); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_SCAN_COMMIT; + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { SvSetMagicSV(*data->longest, data->last_found); if (*data->longest == data->longest_fixed) { @@ -676,6 +680,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min STATIC void S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_ANYTHING; + ANYOF_CLASS_ZERO(cl); ANYOF_BITMAP_SETALL(cl); cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; @@ -689,6 +695,8 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) { int value; + PERL_ARGS_ASSERT_CL_IS_ANYTHING; + for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; @@ -703,6 +711,8 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) STATIC void S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_INIT; + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); @@ -711,6 +721,8 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) STATIC void S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_INIT_ZERO; + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); @@ -724,6 +736,7 @@ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_charclass_class *and_with) { + PERL_ARGS_ASSERT_CL_AND; assert(and_with->type == ANYOF); if (!(and_with->flags & ANYOF_CLASS) @@ -762,6 +775,8 @@ S_cl_and(struct regnode_charclass_class *cl, STATIC void S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) { + PERL_ARGS_ASSERT_CL_OR; + if (or_with->flags & ANYOF_INVERT) { /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) @@ -853,6 +868,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_DUMP_TRIE; PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", (int)depth * 2 + 2,"", @@ -935,6 +951,9 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, SV *sv=sv_newmortal(); int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + /* print out the table precompression. */ PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", @@ -990,6 +1009,8 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, SV *sv=sv_newmortal(); int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; /* print out the table precompression so that we can do a visual check @@ -1163,11 +1184,19 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR \ STMT_START { \ - SV *tmp = newSVpvs(""); \ - if (UTF) SvUTF8_on(tmp); \ - Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \ - av_push( revcharmap, tmp ); \ - } STMT_END + if (UTF) { \ + SV *zlopp = newSV(2); \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)uvc; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END #define TRIE_READ_CHAR STMT_START { \ wordlen++; \ @@ -1222,10 +1251,9 @@ is the recommended Unicode-aware way of saying /* store the word for dumping */ \ SV* tmp; \ if (OP(noper) != NOTHING) \ - tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ else \ - tmp = newSVpvn( "", 0 ); \ - if ( UTF ) SvUTF8_on( tmp ); \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ av_push( trie_words, tmp ); \ }); \ \ @@ -1310,6 +1338,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #endif SV *re_trie_maxbuff; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -1405,7 +1435,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* store the codepoint in the bitmap, and if its ascii also store its folded equivelent. */ TRIE_BITMAP_SET(trie,uvc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + codepoints in the 127 < uvc < 256 range */ + if (127 < uvc && uvc < 192) { + TRIE_BITMAP_SET(trie,194); + } else if (191 < uvc ) { + TRIE_BITMAP_SET(trie,195); + /* && uvc < 256 -- we know uvc is < 256 already */ + } + } set_bit = 0; /* We've done our bit :-) */ } } else { @@ -2003,24 +2046,35 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->startstate = state; trie->minlen -= (state - 1); trie->maxlen -= (state - 1); - DEBUG_r({ - regnode *fix = convert; - U32 word = trie->wordcount; - mjd_nodelen++; - Set_Node_Offset_Length(convert, mjd_offset, state - 1); - while( ++fix < n ) { - Set_Node_Offset_Length(fix, 0, 0); - } - while (word--) { - SV ** const tmp = av_fetch( trie_words, word, 0 ); - if (tmp) { - if ( STR_LEN(convert) <= SvCUR(*tmp) ) - sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); - else - sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); - } - } - }); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif if (trie->maxlen) { convert = n; } else { @@ -2130,6 +2184,8 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode reg_ac_data *aho; const U32 data_slot = add_data( pRExC_state, 1, "T" ); GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -2246,6 +2302,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags #else PERL_UNUSED_ARG(depth); #endif + + PERL_ARGS_ASSERT_JOIN_EXACT; #ifndef EXPERIMENTAL_INPLACESCAN PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(val); @@ -2456,9 +2514,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode *first_non_open = scan; I32 stopmin = I32_MAX; scan_frame *frame = NULL; - GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_STUDY_CHUNK; + #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif @@ -2759,7 +2818,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, last = cur; } } else { - if ( last ) { +/* + Currently we assume that the trie can handle unicode and ascii + matches fold cased matches. If this proves true then the following + define will prevent tries in this situation. + + #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) +*/ +#define TRIE_TYPE_IS_SAFE 1 + if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, startbranch, first, cur, tail, count, optype, depth+1 ); @@ -2787,7 +2854,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last ) { + + if ( last && TRIE_TYPE_IS_SAFE ) { made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && @@ -3277,9 +3345,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l -= old; /* Get the added string: */ - last_str = newSVpvn(s + old, l); - if (UTF) - SvUTF8_on(last_str); + last_str = newSVpvn_utf8(s + old, l, UTF); if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { @@ -4009,6 +4075,8 @@ S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; + PERL_ARGS_ASSERT_ADD_DATA; + Renewc(RExC_rxi->data, sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), char, struct reg_data); @@ -4092,6 +4160,9 @@ Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) { dVAR; HV * const table = GvHV(PL_hintgv); + + PERL_ARGS_ASSERT_PREGCOMP; + /* Dispatch a request to compile a regexp to correct regexp engine. */ if (table) { @@ -4111,10 +4182,11 @@ Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) #endif REGEXP * -Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) +Perl_re_compile(pTHX_ const SV * const pattern, U32 pm_flags) { dVAR; - register REGEXP *r; + REGEXP *rx; + struct regexp *r; register regexp_internal *ri; STRLEN plen; char* exp = SvPV((SV*)pattern, plen); @@ -4132,9 +4204,12 @@ Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) RExC_state_t copyRExC_state; #endif GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_COMPILE; + DEBUG_r(if (!PL_colorset) reginitcolors()); - RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; + RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); @@ -4223,7 +4298,8 @@ redo_first_pass: /* Allocate space and zero-initialize. Note, the two step process of zeroing when in debug mode, thus anything assigned has to happen after that */ - Newxz(r, 1, regexp); + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = (struct regexp*)SvANY(rx); Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char, regexp_internal); if ( r == NULL || ri == NULL ) @@ -4239,22 +4315,23 @@ redo_first_pass: /* non-zero initialization begins here */ RXi_SET( r, ri ); r->engine= RE_ENGINE_PTR; - r->refcnt = 1; - r->prelen = plen; r->extflags = pm_flags; { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); - U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - r->wraplen = r->prelen + has_minus + has_p + has_runon + const STRLEN wraplen = plen + has_minus + has_p + has_runon + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - Newx(r->wrapped, r->wraplen + 1, char ); - p = r->wrapped; + p = sv_grow((SV *)rx, wraplen + 1); + SvCUR_set(rx, wraplen); + SvPOK_on(rx); + SvFLAGS(rx) |= SvUTF8(pattern); *p++='('; *p++='?'; if (has_p) *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ @@ -4277,9 +4354,10 @@ redo_first_pass: } *p++ = ':'; - Copy(RExC_precomp, p, r->prelen, char); - r->precomp = p; - p += r->prelen; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; if (has_runon) *p++ = '\n'; *p++ = ')'; @@ -4305,6 +4383,7 @@ redo_first_pass: (UV)((2*RExC_size+1) * sizeof(U32)))); #endif SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; RExC_rx = r; RExC_rxi = ri; @@ -4322,7 +4401,7 @@ redo_first_pass: RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { - ReREFCNT_dec(r); + ReREFCNT_dec(rx); return(NULL); } /* XXXX To minimize changes to RE engine we always allocate @@ -4338,7 +4417,10 @@ reStudy: Zero(r->substrs, 1, struct reg_substr_data); #ifdef TRIE_STUDY_OPT - if ( restudied ) { + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { U32 seen=RExC_seen; DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); @@ -4353,9 +4435,6 @@ reStudy: SvREFCNT_dec(data.last_found); } StructCopy(&zero_scan_data, &data, scan_data_t); - } else { - StructCopy(&zero_scan_data, &data, scan_data_t); - copyRExC_state = RExC_state; } #else StructCopy(&zero_scan_data, &data, scan_data_t); @@ -4366,7 +4445,7 @@ reStudy: /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ if (UTF) - r->extflags |= RXf_UTF8; /* Unicode in it? */ + SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; if (RExC_naughty >= 10) /* Probably an expensive pattern. */ r->intflags |= PREGf_NAUGHTY; @@ -4383,7 +4462,17 @@ reStudy: regnode *first= scan; regnode *first_next= regnext(first); - /* Skip introductions and multiplicators >= 1. */ + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ while ((OP(first) == OPEN && (sawopen = 1)) || /* An OR of *one* alternative - should not happen now. */ (OP(first) == BRANCH && OP(first_next) != BRANCH) || @@ -4395,16 +4484,17 @@ reStudy: (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { - + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + */ if (OP(first) == PLUS) sawplus = 1; else first += regarglen[OP(first)]; - if (OP(first) == IFMATCH) { - first = NEXTOPER(first); - first += EXTRA_STEP_2ARGS; - } else /* XXX possible optimisation for /(?=)/ */ - first = NEXTOPER(first); + + first = NEXTOPER(first); first_next= regnext(first); } @@ -4748,22 +4838,22 @@ reStudy: if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; if (RExC_paren_names) - r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); + RXp_PAREN_NAMES(r) = (HV*)SvREFCNT_inc(RExC_paren_names); else - r->paren_names = NULL; + RXp_PAREN_NAMES(r) = NULL; #ifdef STUPID_PATTERN_CHECKS - if (r->prelen == 0) + if (RX_PRELEN(rx) == 0) r->extflags |= RXf_NULL; - if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') /* XXX: this should happen BEFORE we compile */ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3)) + else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) r->extflags |= RXf_WHITE; - else if (r->prelen == 1 && r->precomp[0] == '^') + else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') r->extflags |= RXf_START_ONLY; #else - if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') /* XXX: this should happen BEFORE we compile */ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); else { @@ -4814,7 +4904,7 @@ reStudy: PerlIO_printf(Perl_debug_log, "\n"); }); #endif - return(r); + return rx; } #undef RE_ENGINE_PTR @@ -4824,6 +4914,8 @@ SV* Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, const U32 flags) { + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + PERL_UNUSED_ARG(value); if (flags & RXapif_FETCH) { @@ -4849,6 +4941,7 @@ SV* Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, const U32 flags) { + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; PERL_UNUSED_ARG(lastkey); if (flags & RXapif_FIRSTKEY) @@ -4862,15 +4955,20 @@ Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, } SV* -Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags) +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) { AV *retarray = NULL; SV *ret; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + if (flags & RXapif_ALL) retarray=newAV(); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); if (he_str) { IV i; SV* sv_dat=HeVAL(he_str); @@ -4881,7 +4979,7 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 && rx->offs[nums[i]].end != -1) { ret = newSVpvs(""); - CALLREG_NUMBUF_FETCH(rx,nums[i],ret); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); if (!retarray) return ret; } else { @@ -4900,14 +4998,18 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 } bool -Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, const U32 flags) { - if (rx && rx->paren_names) { + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { if (flags & RXapif_ALL) { - return hv_exists_ent(rx->paren_names, key, 0); + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); } else { - SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags); + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); if (sv) { SvREFCNT_dec(sv); return TRUE; @@ -4921,18 +5023,31 @@ Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, } SV* -Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags) +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) { - (void)hv_iterinit(rx->paren_names); + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); - return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY); + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } } SV* -Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) { - if (rx && rx->paren_names) { - HV *hv = rx->paren_names; + struct regexp *const rx = (struct regexp *)SvANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); HE *temphe; while ( (temphe = hv_iternext_flags(hv,0)) ) { IV i; @@ -4940,7 +5055,7 @@ Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) SV* sv_dat = HeVAL(temphe); I32 *nums = (I32*)SvPVX(sv_dat); for ( i = 0; i < SvIVX(sv_dat); i++ ) { - if ((I32)(rx->lastcloseparen) >= nums[i] && + if ((I32)(rx->lastparen) >= nums[i] && rx->offs[nums[i]].start != -1 && rx->offs[nums[i]].end != -1) { @@ -4949,9 +5064,7 @@ Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) } } if (parno || flags & RXapif_ALL) { - STRLEN len; - char *pv = HePV(temphe, len); - return newSVpvn(pv,len); + return newSVhek(HeKEY_hek(temphe)); } } } @@ -4959,17 +5072,20 @@ Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) } SV* -Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) { SV *ret; AV *av; I32 length; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; - if (rx && rx->paren_names) { + if (rx && RXp_PAREN_NAMES(rx)) { if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { - return newSViv(HvTOTALKEYS(rx->paren_names)); + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); } else if (flags & RXapif_ONE) { - ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES)); + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); av = (AV*)SvRV(ret); length = av_len(av); return newSViv(length + 1); @@ -4982,12 +5098,15 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) } SV* -Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) { + struct regexp *const rx = (struct regexp *)SvANY(r); AV *av = newAV(); - if (rx && rx->paren_names) { - HV *hv= rx->paren_names; + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); HE *temphe; (void)hv_iterinit(hv); while ( (temphe = hv_iternext_flags(hv,0)) ) { @@ -4996,7 +5115,7 @@ Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) SV* sv_dat = HeVAL(temphe); I32 *nums = (I32*)SvPVX(sv_dat); for ( i = 0; i < SvIVX(sv_dat); i++ ) { - if ((I32)(rx->lastcloseparen) >= nums[i] && + if ((I32)(rx->lastparen) >= nums[i] && rx->offs[nums[i]].start != -1 && rx->offs[nums[i]].end != -1) { @@ -5005,9 +5124,7 @@ Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) } } if (parno || flags & RXapif_ALL) { - STRLEN len; - char *pv = HePV(temphe, len); - av_push(av, newSVpvn(pv,len)); + av_push(av, newSVhek(HeKEY_hek(temphe))); } } } @@ -5016,11 +5133,15 @@ Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) } void -Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv) +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) { + struct regexp *const rx = (struct regexp *)SvANY(r); char *s = NULL; I32 i = 0; I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; if (!rx->subbeg) { sv_setsv(sv,&PL_sv_undef); @@ -5057,16 +5178,16 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * cons sv_setpvn(sv, s, i); PL_tainted = oldtainted; if ( (rx->extflags & RXf_CANY_SEEN) - ? (RX_MATCH_UTF8(rx) + ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) - : (RX_MATCH_UTF8(rx)) ) + : (RXp_MATCH_UTF8(rx)) ) { SvUTF8_on(sv); } else SvUTF8_off(sv); if (PL_tainting) { - if (RX_MATCH_TAINTED(rx)) { + if (RXp_MATCH_TAINTED(rx)) { if (SvTYPE(sv) >= SVt_PVMG) { MAGIC* const mg = SvMAGIC(sv); MAGIC* mgt; @@ -5094,6 +5215,8 @@ void Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, SV const * const value) { + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + PERL_UNUSED_ARG(rx); PERL_UNUSED_ARG(paren); PERL_UNUSED_ARG(value); @@ -5103,12 +5226,15 @@ Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, } I32 -Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, const I32 paren) { + struct regexp *const rx = (struct regexp *)SvANY(r); I32 i; I32 s1, t1; + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + /* Some of this code was originally in C in F */ switch (paren) { /* $` / ${^PREMATCH} */ @@ -5148,7 +5274,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, } } getlen: - if (i > 0 && RX_MATCH_UTF8(rx)) { + if (i > 0 && RXp_MATCH_UTF8(rx)) { const char * const s = rx->subbeg + s1; const U8 *ep; STRLEN el; @@ -5163,8 +5289,12 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, SV* Perl_reg_qr_package(pTHX_ REGEXP * const rx) { + PERL_ARGS_ASSERT_REG_QR_PACKAGE; PERL_UNUSED_ARG(rx); - return newSVpvs("Regexp"); + if (0) + return NULL; + else + return newSVpvs("Regexp"); } /* Scans the name of a named buffer from the pattern. @@ -5180,9 +5310,12 @@ Perl_reg_qr_package(pTHX_ REGEXP * const rx) #define REG_RSN_RETURN_DATA 2 STATIC SV* -S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ char *name_start = RExC_parse; + PERL_ARGS_ASSERT_REG_SCAN_NAME; + if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) @@ -5196,10 +5329,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { } if ( flags ) { - SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, - (int)(RExC_parse - name_start))); - if (UTF) - SvUTF8_on(sv_name); + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); if ( flags == REG_RSN_RETURN_NAME) return sv_name; else if (flags==REG_RSN_RETURN_DATA) { @@ -5314,6 +5446,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char * const oregcomp_parse = RExC_parse; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG; DEBUG_PARSE("reg "); *flagp = 0; /* Tentatively. */ @@ -6158,6 +6292,9 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) register regnode *latest; I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + DEBUG_PARSE("brnc"); if (first) @@ -6233,6 +6370,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; const char *maxpos = NULL; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + DEBUG_PARSE("piec"); ret = regatom(pRExC_state, &flags,depth+1); @@ -6450,7 +6590,9 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) STRLEN len; /* this has various purposes throughout the code */ bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */ regnode *ret = NULL; - + + PERL_ARGS_ASSERT_REG_NAMEDSEQ; + if (*RExC_parse != '{') { vFAIL("Missing braces on \\N{}"); } @@ -6471,6 +6613,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); UV cp; + char string; len = (STRLEN)(endbrace - name - 2); cp = grok_hex(name + 2, &len, &fl, NULL); if ( len != (STRLEN)(endbrace - name - 2) ) { @@ -6482,7 +6625,8 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) *valuep = cp; return NULL; } - sv_str= Perl_newSVpvf_nocontext("%c",(int)cp); + string = (char)cp; + sv_str= newSVpvn(&string, 1); } else { /* fetch the charnames handler for this scope */ HV * const table = GvHV(PL_hintgv); @@ -6689,11 +6833,13 @@ STATIC UV S_reg_recode(pTHX_ const char value, SV **encp) { STRLEN numlen = 1; - SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); + SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; + PERL_ARGS_ASSERT_REG_RECODE; + if (newlen) uv = SvUTF8(sv) ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) @@ -6740,6 +6886,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) DEBUG_PARSE("atom"); *flagp = WORST; /* Tentatively. */ + PERL_ARGS_ASSERT_REGATOM; tryagain: switch ((U8)*RExC_parse) { @@ -6829,8 +6976,10 @@ tryagain: case 0xDF: case 0xC3: case 0xCE: + do_foldchar: if (!LOC && FOLD) { U32 len,cp; + len=0; /* silence a spurious compiler warning */ if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ @@ -6854,7 +7003,11 @@ tryagain: required, as the default for this switch is to jump to the literal text handling code. */ - switch (*++RExC_parse) { + switch ((U8)*++RExC_parse) { + case 0xDF: + case 0xC3: + case 0xCE: + goto do_foldchar; /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -6870,6 +7023,11 @@ tryagain: RExC_seen_zerolen++; ret = reg_node(pRExC_state, KEEPS); *flagp |= SIMPLE; + /* XXX:dmq : disabling in-place substitution seems to + * be necessary here to avoid cases of memory corruption, as + * with: C<$_="x" x 80; s/x\K/y/> -- rgs + */ + RExC_seen |= REG_SEEN_LOOKBEHIND; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -7167,8 +7325,13 @@ tryagain: an unescaped equivalent literal. */ - switch (*++p) { + switch ((U8)*++p) { /* These are all the special escapes. */ + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case 'A': /* Start assertion */ case 'b': case 'B': /* Word-boundary assertion*/ case 'C': /* Single char !DANGEROUS! */ @@ -7404,6 +7567,9 @@ STATIC char * S_regwhite( RExC_state_t *pRExC_state, char *p ) { const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGWHITE; + while (p < e) { if (isSPACE(*p)) ++p; @@ -7440,6 +7606,8 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) dVAR; I32 namedclass = OOB_NAMEDCLASS; + PERL_ARGS_ASSERT_REGPPOSIXCC; + if (value == '[' && RExC_parse + 1 < RExC_end && /* I smell either [: or [= or [. -- POSIX has been here, right? */ POSIXCC(UCHARAT(RExC_parse))) { @@ -7554,6 +7722,9 @@ STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { dVAR; + + PERL_ARGS_ASSERT_CHECKPOSIXCC; + if (POSIXCC(UCHARAT(RExC_parse))) { const char *s = RExC_parse; const char c = *s++; @@ -7652,6 +7823,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -8115,8 +8288,8 @@ parseit: if (!unicode_alternate) unicode_alternate = newAV(); - sv = newSVpvn((char*)foldbuf, foldlen); - SvUTF8_on(sv); + sv = newSVpvn_utf8((char*)foldbuf, foldlen, + TRUE); av_push(unicode_alternate, sv); } } @@ -8239,6 +8412,9 @@ STATIC bool S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) { bool ended = 0; + + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + while (RExC_parse < RExC_end) if (*RExC_parse++ == '\n') { ended = 1; @@ -8271,6 +8447,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) { char* const retval = RExC_parse++; + PERL_ARGS_ASSERT_NEXTCHAR; + for (;;) { if (*RExC_parse == '(' && RExC_parse[1] == '?' && RExC_parse[2] == '#') { @@ -8307,6 +8485,8 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REG_NODE; + if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 1; @@ -8346,6 +8526,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGANODE; + if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 2; @@ -8396,6 +8578,9 @@ STATIC STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; + + PERL_ARGS_ASSERT_REGUNI; + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); } @@ -8414,6 +8599,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); @@ -8496,6 +8683,8 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de dVAR; register regnode *scan; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -8556,9 +8745,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, #ifdef EXPERIMENTAL_INPLACESCAN I32 min = 0; #endif - GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGTAIL_STUDY; + if (SIZE_ONLY) return exact; @@ -8628,6 +8818,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, STATIC I32 S_regcurly(register const char *s) { + PERL_ARGS_ASSERT_REGCURLY; + if (*s++ != '{') return FALSE; if (!isDIGIT(*s)) @@ -8649,9 +8841,11 @@ S_regcurly(register const char *s) */ #ifdef DEBUGGING void -S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) +{ int bit; int set=0; + for (bit=0; bit<32; bit++) { if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); /* Header fields of interest. */ @@ -8752,6 +8948,7 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_printf(Perl_debug_log, "\n"); DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else + PERL_ARGS_ASSERT_REGDUMP; PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); #endif /* DEBUGGING */ @@ -8769,6 +8966,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGPROP; sv_setpvn(sv, "", 0); @@ -8781,19 +8979,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) k = PL_regkind[OP(o)]; if (k == EXACT) { - SV * const dsv = sv_2mortal(newSVpvs("")); + sv_catpvs(sv, " "); /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) * is a crude hack but it may be the best for now since * we have no flag "this EXACTish node was UTF-8" * --jhi */ - const char * const s = - pv_pretty(dsv, STRING(o), STR_LEN(o), 60, - PL_colors[0], PL_colors[1], - PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_PRETTY_ELIPSES | - PERL_PV_PRETTY_LTGT - ); - Perl_sv_catpvf(aTHX_ sv, " %s", s ); + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); } else if (k == TRIE) { /* print the details of the trie in dumpuntil instead, as * progi->data isn't available here */ @@ -8822,7 +9018,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) int i; int rangestart = -1; U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); - Perl_sv_catpvf(aTHX_ sv, "["); + sv_catpvs(sv, "["); for (i = 0; i <= 256; i++) { if (i < 256 && BITMAP_TEST(bitmap,i)) { if (rangestart == -1) @@ -8839,7 +9035,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) rangestart = -1; } } - Perl_sv_catpvf(aTHX_ sv, "]"); + sv_catpvs(sv, "]"); } } else if (k == CURLY) { @@ -8851,7 +9047,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ - if ( prog->paren_names ) { + if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || OP(o) < NREF) { AV *list= (AV *)progi->data->data[progi->name_list_idx]; SV **name= av_fetch(list, ARG(o), 0 ); @@ -9033,10 +9229,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } SV * -Perl_re_intuit_string(pTHX_ REGEXP * const prog) +Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ dVAR; + struct regexp *const prog = (struct regexp *)SvANY(r); GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; PERL_UNUSED_CONTEXT; DEBUG_COMPILE_r( @@ -9072,20 +9271,26 @@ Perl_re_intuit_string(pTHX_ REGEXP * const prog) */ #ifndef PERL_IN_XSUB_RE void -Perl_pregfree(pTHX_ struct regexp *r) +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); GET_RE_DEBUG_FLAGS_DECL; - if (!r || (--r->refcnt > 0)) - return; + PERL_ARGS_ASSERT_PREGFREE2; + if (r->mother_re) { ReREFCNT_dec(r->mother_re); } else { - CALLREGFREE_PVT(r); /* free the private data */ - if (r->paren_names) - SvREFCNT_dec(r->paren_names); - Safefree(r->wrapped); + CALLREGFREE_PVT(rx); /* free the private data */ + if (RXp_PAREN_NAMES(r)) + SvREFCNT_dec(RXp_PAREN_NAMES(r)); } if (r->substrs) { if (r->anchored_substr) @@ -9098,14 +9303,13 @@ Perl_pregfree(pTHX_ struct regexp *r) SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } - RX_MATCH_COPY_FREE(r); + RX_MATCH_COPY_FREE(rx); #ifdef PERL_OLD_COPY_ON_WRITE if (r->saved_copy) SvREFCNT_dec(r->saved_copy); #endif Safefree(r->swap); Safefree(r->offs); - Safefree(r); } /* reg_temp_copy() @@ -9125,16 +9329,27 @@ Perl_pregfree(pTHX_ struct regexp *r) */ -regexp * -Perl_reg_temp_copy (pTHX_ struct regexp *r) { - regexp *ret; +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *rx) +{ + REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + struct regexp *ret = (struct regexp *)SvANY(ret_x); + struct regexp *const r = (struct regexp *)SvANY(rx); register const I32 npar = r->nparens+1; - (void)ReREFCNT_inc(r); - Newx(ret, 1, regexp); - StructCopy(r, ret, regexp); + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + (void)ReREFCNT_inc(rx); + /* We can take advantage of the existing "copied buffer" mechanism in SVs + by pointing directly at the buffer, but flagging that the allocated + space in the copy is zero. As we've just done a struct copy, it's now + a case of zero-ing that, rather than copying the current length. */ + SvPV_set(ret_x, RX_WRAPPED(rx)); + SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); + StructCopy(&(r->xpv_cur), &(ret->xpv_cur), struct regexp_allocated); + SvLEN_set(ret_x, 0); Newx(ret->offs, npar, regexp_paren_pair); Copy(r->offs, ret->offs, npar, regexp_paren_pair); - ret->refcnt = 1; if (r->substrs) { Newx(ret->substrs, 1, struct reg_substr_data); StructCopy(r->substrs, ret->substrs, struct reg_substr_data); @@ -9147,14 +9362,14 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { /* check_substr and check_utf8, if non-NULL, point to either their anchored or float namesakes, and don't hold a second reference. */ } - RX_MATCH_COPIED_off(ret); + RX_MATCH_COPIED_off(ret_x); #ifdef PERL_OLD_COPY_ON_WRITE ret->saved_copy = NULL; #endif - ret->mother_re = r; + ret->mother_re = rx; ret->swap = NULL; - return ret; + return ret_x; } #endif @@ -9171,19 +9386,22 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { */ void -Perl_regfree_internal(pTHX_ REGEXP * const r) +Perl_regfree_internal(pTHX_ REGEXP * const rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; - + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + DEBUG_COMPILE_r({ if (!PL_colorset) reginitcolors(); { SV *dsv= sv_newmortal(); - RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), - dsv, r->precomp, r->prelen, 60); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } @@ -9304,23 +9522,17 @@ Perl_regfree_internal(pTHX_ REGEXP * const r) */ #if defined(USE_ITHREADS) #ifndef PERL_IN_XSUB_RE -regexp * -Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) { dVAR; - regexp *ret; I32 npar; - - if (!r) - return (REGEXP *)NULL; - - if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r))) - return ret; - + const struct regexp *r = (const struct regexp *)SvANY(sstr); + struct regexp *ret = (struct regexp *)SvANY(dstr); + PERL_ARGS_ASSERT_RE_DUP_GUTS; + npar = r->nparens+1; - Newx(ret, 1, regexp); - StructCopy(r, ret, regexp); Newx(ret->offs, npar, regexp_paren_pair); Copy(r->offs, ret->offs, npar, regexp_paren_pair); if(ret->swap) { @@ -9332,7 +9544,9 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) /* Do it this way to avoid reading from *r after the StructCopy(). That way, if any of the sv_dup_inc()s dislodge *r from the L1 cache, it doesn't matter. */ - const bool anchored = r->check_substr == r->anchored_substr; + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; Newx(ret->substrs, 1, struct reg_substr_data); StructCopy(r->substrs, ret->substrs, struct reg_substr_data); @@ -9355,17 +9569,21 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) ret->check_substr = ret->float_substr; ret->check_utf8 = ret->float_utf8; } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } } } - ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1); - ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped); - ret->paren_names = hv_dup_inc(ret->paren_names, param); + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); if (ret->pprivate) - RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); - if (RX_MATCH_COPIED(ret)) + if (RX_MATCH_COPIED(dstr)) ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; @@ -9375,10 +9593,6 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) ret->mother_re = NULL; ret->gofs = 0; - ret->seen_evals = 0; - - ptr_table_store(PL_ptr_table, r, ret); - return ret; } #endif /* PERL_IN_XSUB_RE */ @@ -9397,12 +9611,15 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) */ void * -Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); regexp_internal *reti; int len, npar; RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; npar = r->nparens+1; len = ProgLen(ri); @@ -9489,48 +9706,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) #endif /* USE_ITHREADS */ -/* - reg_stringify() - - converts a regexp embedded in a MAGIC struct to its stringified form, - caching the converted form in the struct and returns the cached - string. - - If lp is nonnull then it is used to return the length of the - resulting string - - If flags is nonnull and the returned string contains UTF8 then - (*flags & 1) will be true. - - If haseval is nonnull then it is used to return whether the pattern - contains evals. - - Normally called via macro: - - CALLREG_STRINGIFY(mg,&len,&utf8); - - And internally with - - CALLREG_AS_STR(mg,&lp,&flags,&haseval) - - See sv_2pv_flags() in sv.c for an example of internal usage. - - */ #ifndef PERL_IN_XSUB_RE -char * -Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { - dVAR; - const regexp * const re = (regexp *)mg->mg_obj; - if (haseval) - *haseval = re->seen_evals; - if (flags) - *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); - if (lp) - *lp = re->wraplen; - return re->wrapped; -} - /* - regnext - dig the "next" pointer out of a node */ @@ -9561,6 +9738,8 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) SV *msv; const char *message; + PERL_ARGS_ASSERT_RE_CROAK2; + if (l1 > 510) l1 = 510; if (l1 + l2 > 510) @@ -9621,7 +9800,7 @@ Perl_save_re_context(pTHX) const REGEXP * const rx = PM_GETRE(PL_curpm); if (rx) { U32 i; - for (i = 1; i <= rx->nparens; i++) { + for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); GV *const *const gvp @@ -9642,7 +9821,7 @@ static void clear_re(pTHX_ void *r) { dVAR; - ReREFCNT_dec((regexp *)r); + ReREFCNT_dec((REGEXP *)r); } #ifdef DEBUGGING @@ -9650,12 +9829,26 @@ clear_re(pTHX_ void *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 255 || !isPRINT(c)) + PERL_ARGS_ASSERT_PUT_BYTE; + + /* Our definition of isPRINT() ignores locales, so only bytes that are + not part of UTF-8 are considered printable. I assume that the same + holds for UTF-EBCDIC. + Also, code point 255 is not printable in either (it's E0 in EBCDIC, + which Wikipedia says: + + EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all + ones (binary 1111 1111, hexadecimal FF). It is similar, but not + identical, to the ASCII delete (DEL) or rubout control character. + ) So the old condition can be simplified to !isPRINT(c) */ + if (!isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); - else if (c == '-' || c == ']' || c == '\\' || c == '^') - Perl_sv_catpvf(aTHX_ sv, "\\%c", c); - else - Perl_sv_catpvf(aTHX_ sv, "%c", c); + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } } @@ -9679,7 +9872,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; - + + PERL_ARGS_ASSERT_DUMPUNTIL; + #ifdef DEBUG_DUMPUNTIL PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); @@ -9758,7 +9953,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, PL_colors[0], PL_colors[1], (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELIPSES | + PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT ) : "???"