X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=337f0c435a51c3a4801a49b2c4faa2eaf733d840;hb=2b63e250843b907e476587f037c0784d701fca62;hp=8b8cec492230c2fc25bdec7f38bd40a13cff4d02;hpb=568a785aea241a073c29598a27f44ca0be3e7591;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 8b8cec4..337f0c4 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2,7 +2,9 @@ */ /* - * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee + * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee + * + * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] */ /* This file contains functions for compiling a regular expression. See @@ -57,7 +59,8 @@ **** Alterations to Henry's code are... **** **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, - **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others + **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + **** by Larry Wall and others **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -102,6 +105,7 @@ typedef struct RExC_state_t { U32 flags; /* are we folding, multilining? */ char *precomp; /* uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ regexp_internal *rxi; /* internal data for regexp object pprivate field */ char *start; /* Start of input for compile */ @@ -149,6 +153,7 @@ typedef struct RExC_state_t { #define RExC_flags (pRExC_state->flags) #define RExC_precomp (pRExC_state->precomp) +#define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) #define RExC_rxi (pRExC_state->rxi) #define RExC_start (pRExC_state->start) @@ -389,7 +394,7 @@ static const scan_data_t zero_scan_data = IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ if (len > RegexLengthToShowInErrorMessages) { \ /* chop 10 shorter than the max, to ensure meaning of "..." */ \ len = RegexLengthToShowInErrorMessages - 10; \ @@ -420,7 +425,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL(m) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL(m); \ } STMT_END @@ -438,7 +443,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL2(m,a1) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL2(m, a1); \ } STMT_END @@ -457,7 +462,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL3(m,a1,a2) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ Simple_vFAIL3(m, a1, a2); \ } STMT_END @@ -470,23 +475,22 @@ static const scan_data_t zero_scan_data = (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END -#define vWARN(loc,m) STMT_START { \ +#define ckWARNreg(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END -#define vWARNdep(loc,m) STMT_START { \ +#define ckWARNregdep(loc,m) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ - "%s" REPORT_LOCATION, \ - m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END - -#define vWARN2(loc, m, a1) STMT_START { \ +#define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -496,12 +500,24 @@ static const scan_data_t zero_scan_data = a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ + a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ @@ -630,6 +646,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min const STRLEN old_l = CHR_SVLEN(*data->longest); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_SCAN_COMMIT; + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { SvSetMagicSV(*data->longest, data->last_found); if (*data->longest == data->longest_fixed) { @@ -676,6 +694,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min STATIC void S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_ANYTHING; + ANYOF_CLASS_ZERO(cl); ANYOF_BITMAP_SETALL(cl); cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; @@ -689,6 +709,8 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) { int value; + PERL_ARGS_ASSERT_CL_IS_ANYTHING; + for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; @@ -703,6 +725,8 @@ S_cl_is_anything(const struct regnode_charclass_class *cl) STATIC void S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_INIT; + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); @@ -711,6 +735,8 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) STATIC void S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + PERL_ARGS_ASSERT_CL_INIT_ZERO; + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); @@ -724,6 +750,7 @@ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_charclass_class *and_with) { + PERL_ARGS_ASSERT_CL_AND; assert(and_with->type == ANYOF); if (!(and_with->flags & ANYOF_CLASS) @@ -762,6 +789,8 @@ S_cl_and(struct regnode_charclass_class *cl, STATIC void S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) { + PERL_ARGS_ASSERT_CL_OR; + if (or_with->flags & ANYOF_INVERT) { /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) @@ -853,6 +882,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_DUMP_TRIE; PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", (int)depth * 2 + 2,"", @@ -935,6 +965,9 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, SV *sv=sv_newmortal(); int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + /* print out the table precompression. */ PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", @@ -990,6 +1023,8 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, SV *sv=sv_newmortal(); int colwidth= widecharmap ? 6 : 4; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; /* print out the table precompression so that we can do a visual check @@ -1163,11 +1198,19 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR \ STMT_START { \ - SV *tmp = newSVpvs(""); \ - if (UTF) SvUTF8_on(tmp); \ - Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \ - av_push( revcharmap, tmp ); \ - } STMT_END + if (UTF) { \ + SV *zlopp = newSV(2); \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)uvc; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END #define TRIE_READ_CHAR STMT_START { \ wordlen++; \ @@ -1222,10 +1265,9 @@ is the recommended Unicode-aware way of saying /* store the word for dumping */ \ SV* tmp; \ if (OP(noper) != NOTHING) \ - tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ else \ - tmp = newSVpvn( "", 0 ); \ - if ( UTF ) SvUTF8_on( tmp ); \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ av_push( trie_words, tmp ); \ }); \ \ @@ -1310,6 +1352,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #endif SV *re_trie_maxbuff; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -1356,7 +1400,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store unicode characters. We use the + (trie->charmap) and we use a an HV* to store Unicode characters. We use the native representation of the character value as the key and IV's for the coded index. @@ -1405,7 +1449,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* store the codepoint in the bitmap, and if its ascii also store its folded equivelent. */ TRIE_BITMAP_SET(trie,uvc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + codepoints in the 127 < uvc < 256 range */ + if (127 < uvc && uvc < 192) { + TRIE_BITMAP_SET(trie,194); + } else if (191 < uvc ) { + TRIE_BITMAP_SET(trie,195); + /* && uvc < 256 -- we know uvc is < 256 already */ + } + } set_bit = 0; /* We've done our bit :-) */ } } else { @@ -1952,7 +2009,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( folder ) TRIE_BITMAP_SET(trie, folder[ *ch ]); DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, (char*)ch) + PerlIO_printf(Perl_debug_log, "%s", (char*)ch) ); } } @@ -1966,7 +2023,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( count == 1 ) { SV **tmp = av_fetch( revcharmap, idx, 0); - char *ch = SvPV_nolen( *tmp ); + STRLEN len; + char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); PerlIO_printf( Perl_debug_log, @@ -1985,11 +2043,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs str=STRING(convert); STR_LEN(convert)=0; } - while (*ch) { + STR_LEN(convert) += len; + while (len--) *str++ = *ch++; - STR_LEN(convert)++; - } - } else { #ifdef DEBUGGING if (state>1) @@ -2004,24 +2060,35 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->startstate = state; trie->minlen -= (state - 1); trie->maxlen -= (state - 1); - DEBUG_r({ - regnode *fix = convert; - U32 word = trie->wordcount; - mjd_nodelen++; - Set_Node_Offset_Length(convert, mjd_offset, state - 1); - while( ++fix < n ) { - Set_Node_Offset_Length(fix, 0, 0); - } - while (word--) { - SV ** const tmp = av_fetch( trie_words, word, 0 ); - if (tmp) { - if ( STR_LEN(convert) <= SvCUR(*tmp) ) - sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); - else - sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); - } - } - }); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + regnode *fix = convert; + U32 word = trie->wordcount; + mjd_nodelen++; + Set_Node_Offset_Length(convert, mjd_offset, state - 1); + while( ++fix < n ) { + Set_Node_Offset_Length(fix, 0, 0); + } + while (word--) { + SV ** const tmp = av_fetch( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif if (trie->maxlen) { convert = n; } else { @@ -2131,6 +2198,8 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode reg_ac_data *aho; const U32 data_slot = add_data( pRExC_state, 1, "T" ); GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -2247,6 +2316,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags #else PERL_UNUSED_ARG(depth); #endif + + PERL_ARGS_ASSERT_JOIN_EXACT; #ifndef EXPERIMENTAL_INPLACESCAN PERL_UNUSED_ARG(flags); PERL_UNUSED_ARG(val); @@ -2400,6 +2471,34 @@ typedef struct scan_frame { #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) +#define CASE_SYNST_FNC(nAmE) \ +case nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break; \ +case N ## nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break + + + STATIC I32 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, @@ -2429,9 +2528,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode *first_non_open = scan; I32 stopmin = I32_MAX; scan_frame *frame = NULL; - GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_STUDY_CHUNK; + #ifdef DEBUGGING StructCopy(&zero_scan_data, &data_fake, scan_data_t); #endif @@ -2732,7 +2832,20 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, last = cur; } } else { - if ( last ) { +/* + Currently we do not believe that the trie logic can + handle case insensitive matching properly when the + pattern is not unicode (thus forcing unicode semantics). + + If/when this is fixed the following define can be swapped + in below to fully enable trie logic. + +#define TRIE_TYPE_IS_SAFE 1 + +*/ +#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) + + if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, startbranch, first, cur, tail, count, optype, depth+1 ); @@ -2760,7 +2873,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last ) { + + if ( last && TRIE_TYPE_IS_SAFE ) { made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && @@ -3081,11 +3195,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3 /* Complement check for big count */ - && ckWARN(WARN_REGEXP)) + && maxcount <= REG_INFTY/3) /* Complement check for big count */ { - vWARN(RExC_parse, - "Quantifier unexpected on zero-length expression"); + ckWARNreg(RExC_parse, + "Quantifier unexpected on zero-length expression"); } min += minnext * mincount; @@ -3250,9 +3363,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l -= old; /* Get the added string: */ - last_str = newSVpvn(s + old, l); - if (UTF) - SvUTF8_on(last_str); + last_str = newSVpvn_utf8(s + old, l, UTF); if (deltanext == 0 && pos_before == b) { /* What was added is a constant string */ if (mincount > 1) { @@ -3270,7 +3381,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str); + mg->mg_len += CHR_SVLEN(last_str) - l; } data->last_end += l * (mincount - 1); } @@ -3330,6 +3441,46 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; } } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + int value = 0; + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + if (flags & SCF_DO_STCLASS_AND) { + for (value = 0; value < 256; value++) + if (!is_VERTWS_cp(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + for (value = 0; value < 256; value++) + if (is_VERTWS_cp(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + min += 1; + delta += 1; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + + } + else if (OP(scan) == FOLDCHAR) { + int d = ARG(scan)==0xDF ? 1 : 2; + flags &= ~SCF_DO_STCLASS; + min += 1; + delta += d; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += d; + data->longest = &(data->longest_float); + } + } else if (strchr((const char*)PL_simple,OP(scan))) { int value = 0; @@ -3524,6 +3675,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } break; + CASE_SYNST_FNC(VERTWS); + CASE_SYNST_FNC(HORIZWS); + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -3588,11 +3742,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (f & SCF_DO_STCLASS_AND) { - const int was = (data->start_class->flags & ANYOF_EOS); - - cl_and(data->start_class, &intrnl); - if (was) - data->start_class->flags |= ANYOF_EOS; + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + cl_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue */ + const int was = (data->start_class->flags & ANYOF_EOS); + + cl_and(data->start_class, &intrnl); + if (was) + data->start_class->flags |= ANYOF_EOS; + } } } #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY @@ -3894,6 +4059,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ + /* Else: zero-length, ignore. */ scan = regnext(scan); } @@ -3938,6 +4104,8 @@ S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0; + PERL_ARGS_ASSERT_ADD_DATA; + Renewc(RExC_rxi->data, sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1), char, struct reg_data); @@ -4016,11 +4184,14 @@ extern const struct regexp_engine my_reg_engine; #endif #ifndef PERL_IN_XSUB_RE -regexp * -Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) +REGEXP * +Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) { dVAR; HV * const table = GvHV(PL_hintgv); + + PERL_ARGS_ASSERT_PREGCOMP; + /* Dispatch a request to compile a regexp to correct regexp engine. */ if (table) { @@ -4032,21 +4203,24 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", SvIV(*ptr)); }); - return CALLREGCOMP_ENG(eng, exp, xend, pm); + return CALLREGCOMP_ENG(eng, pattern, flags); } } - return Perl_re_compile(aTHX_ exp, xend, pm); + return Perl_re_compile(aTHX_ pattern, flags); } #endif -regexp * -Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) { dVAR; - register regexp *r; + REGEXP *rx; + struct regexp *r; register regexp_internal *ri; + STRLEN plen; + char *exp = SvPV(pattern, plen); + char* xend = exp + plen; regnode *scan; - regnode *first; I32 flags; I32 minlen = 0; I32 sawplus = 0; @@ -4059,24 +4233,24 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) RExC_state_t copyRExC_state; #endif GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_COMPILE; + DEBUG_r(if (!PL_colorset) reginitcolors()); - - if (exp == NULL) - FAIL("NULL regexp argument"); - RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RExC_utf8, - dsv, exp, (xend - exp), 60); + dsv, exp, plen, 60); PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", PL_colors[4],PL_colors[5],s); }); redo_first_pass: RExC_precomp = exp; - RExC_flags = pm->op_pmflags; + RExC_flags = pm_flags; RExC_sawback = 0; RExC_seen = 0; @@ -4115,7 +4289,7 @@ redo_first_pass: return(NULL); } if (RExC_utf8 && !RExC_orig_utf8) { - /* It's possible to write a regexp in ascii that represents unicode + /* It's possible to write a regexp in ascii that represents Unicode codepoints outside of the byte range, such as via \x{100}. If we detect such a sequence we have to convert the entire pattern to utf8 and then recompile, as our sizing calculation will have been based @@ -4124,7 +4298,7 @@ redo_first_pass: thing. XXX: somehow figure out how to make this less expensive... -- dmq */ - STRLEN len = xend-exp; + STRLEN len = plen; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); @@ -4153,7 +4327,8 @@ redo_first_pass: /* Allocate space and zero-initialize. Note, the two step process of zeroing when in debug mode, thus anything assigned has to happen after that */ - Newxz(r, 1, regexp); + rx = (REGEXP*) newSV_type(SVt_REGEXP); + r = (struct regexp*)SvANY(rx); Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char, regexp_internal); if ( r == NULL || ri == NULL ) @@ -4169,25 +4344,26 @@ redo_first_pass: /* non-zero initialization begins here */ RXi_SET( r, ri ); r->engine= RE_ENGINE_PTR; - r->refcnt = 1; - r->prelen = xend - exp; - r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; + r->extflags = pm_flags; { - bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); - U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - r->wraplen = r->prelen + has_minus + has_k + has_runon + const STRLEN wraplen = plen + has_minus + has_p + has_runon + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - Newx(r->wrapped, r->wraplen + 1, char ); - p = r->wrapped; + p = sv_grow(MUTABLE_SV(rx), wraplen + 1); + SvCUR_set(rx, wraplen); + SvPOK_on(rx); + SvFLAGS(rx) |= SvUTF8(pattern); *p++='('; *p++='?'; - if (has_k) - *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ { char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; char *colon = r + 1; @@ -4207,9 +4383,10 @@ redo_first_pass: } *p++ = ':'; - Copy(RExC_precomp, p, r->prelen, char); - r->precomp = p; - p += r->prelen; + Copy(RExC_precomp, p, plen, char); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); + p += plen; if (has_runon) *p++ = '\n'; *p++ = ')'; @@ -4235,11 +4412,12 @@ redo_first_pass: (UV)((2*RExC_size+1) * sizeof(U32)))); #endif SetProgLen(ri,RExC_size); + RExC_rx_sv = rx; RExC_rx = r; RExC_rxi = ri; /* Second pass: emit code. */ - RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */ + RExC_flags = pm_flags; /* don't let top level (?i) bleed */ RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; @@ -4251,9 +4429,10 @@ redo_first_pass: /* Store the count of eval-groups for security checks: */ RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); - if (reg(pRExC_state, 0, &flags,1) == NULL) + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(rx); return(NULL); - + } /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -4267,7 +4446,10 @@ reStudy: Zero(r->substrs, 1, struct reg_substr_data); #ifdef TRIE_STUDY_OPT - if ( restudied ) { + if (!restudied) { + StructCopy(&zero_scan_data, &data, scan_data_t); + copyRExC_state = RExC_state; + } else { U32 seen=RExC_seen; DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); @@ -4282,19 +4464,17 @@ reStudy: SvREFCNT_dec(data.last_found); } StructCopy(&zero_scan_data, &data, scan_data_t); - } else { - StructCopy(&zero_scan_data, &data, scan_data_t); - copyRExC_state = RExC_state; } #else StructCopy(&zero_scan_data, &data, scan_data_t); #endif /* Dig out information for optimizations. */ - r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */ - pm->op_pmflags = RExC_flags; + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + if (UTF) - r->extflags |= RXf_UTF8; /* Unicode in it? */ + SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; if (RExC_naughty >= 10) /* Probably an expensive pattern. */ r->intflags |= PREGf_NAUGHTY; @@ -4308,29 +4488,43 @@ reStudy: struct regnode_charclass_class ch_class; /* pointed to by data */ int stclass_flag; I32 last_close = 0; /* pointed to by data */ - - first = scan; - /* Skip introductions and multiplicators >= 1. */ + regnode *first= scan; + regnode *first_next= regnext(first); + + /* + * Skip introductions and multiplicators >= 1 + * so that we can extract the 'meat' of the pattern that must + * match in the large if() sequence following. + * NOTE that EXACT is NOT covered here, as it is normally + * picked up by the optimiser separately. + * + * This is unfortunate as the optimiser isnt handling lookahead + * properly currently. + * + */ while ((OP(first) == OPEN && (sawopen = 1)) || /* An OR of *one* alternative - should not happen now. */ - (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || + (OP(first) == BRANCH && OP(first_next) != BRANCH) || /* for now we can't handle lookbehind IFMATCH*/ (OP(first) == IFMATCH && !first->flags) || (OP(first) == PLUS) || (OP(first) == MINMOD) || /* An {n,m} with n>0 */ - (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { - + /* + * the only op that could be a regnode is PLUS, all the rest + * will be regnode_1 or regnode_2. + * + */ if (OP(first) == PLUS) sawplus = 1; else first += regarglen[OP(first)]; - if (OP(first) == IFMATCH) { - first = NEXTOPER(first); - first += EXTRA_STEP_2ARGS; - } else /* XXX possible optimisation for /(?=)/ */ - first = NEXTOPER(first); + + first = NEXTOPER(first); + first_next= regnext(first); } /* Starting-point info. */ @@ -4673,14 +4867,37 @@ reStudy: if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; if (RExC_paren_names) - r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); + RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); else - r->paren_names = NULL; - if (r->prelen == 3 && strEQ("\\s+", r->precomp)) - r->extflags |= RXf_WHITE; - else if (r->prelen == 1 && r->precomp[0] == '^') + RXp_PAREN_NAMES(r) = NULL; + +#ifdef STUPID_PATTERN_CHECKS + if (RX_PRELEN(rx) == 0) + r->extflags |= RXf_NULL; + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) + r->extflags |= RXf_WHITE; + else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') r->extflags |= RXf_START_ONLY; - +#else + if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else { + regnode *first = ri->program + 1; + U8 fop = OP(first); + U8 nop = OP(NEXTOPER(first)); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) + r->extflags |= RXf_WHITE; + } +#endif #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, 1, "p" ); @@ -4695,8 +4912,7 @@ reStudy: ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); } } - Newxz(r->startp, RExC_npar, I32); - Newxz(r->endp, RExC_npar, I32); + Newxz(r->offs, RExC_npar, regexp_paren_pair); /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ @@ -4717,84 +4933,271 @@ reStudy: PerlIO_printf(Perl_debug_log, "\n"); }); #endif - return(r); + return rx; } #undef RE_ENGINE_PTR SV* -Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags) +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak(aTHX_ "%s", PL_no_modify); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) { AV *retarray = NULL; SV *ret; - if (flags & 1) + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + + if (flags & RXapif_ALL) retarray=newAV(); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); if (he_str) { IV i; SV* sv_dat=HeVAL(he_str); I32 *nums=(I32*)SvPVX(sv_dat); for ( i=0; inparens) >= nums[i] - && rx->startp[nums[i]] != -1 - && rx->endp[nums[i]] != -1) + if ((I32)(rx->nparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) { - ret = CALLREG_NUMBUF(rx,nums[i],NULL); + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(r,nums[i],ret); if (!retarray) return ret; } else { ret = newSVsv(&PL_sv_undef); } - if (retarray) { - SvREFCNT_inc(ret); + if (retarray) av_push(retarray, ret); - } } if (retarray) - return (SV*)retarray; + return newRV_noinc(MUTABLE_SV(retarray)); + } + } + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); + if (sv) { + SvREFCNT_dec(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } } } return NULL; } SV* -Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + I32 length; + struct regexp *const rx = (struct regexp *)SvANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); + length = av_len(av); + SvREFCNT_dec(ret); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + AV *av = newAV(); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV_noinc(MUTABLE_SV(av)); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) { + struct regexp *const rx = (struct regexp *)SvANY(r); char *s = NULL; I32 i = 0; I32 s1, t1; - SV *sv = usesv ? usesv : newSVpvs(""); + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; if (!rx->subbeg) { sv_setsv(sv,&PL_sv_undef); - return sv; + return; } else - if (paren == -2 && rx->startp[0] != -1) { + if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { /* $` */ - i = rx->startp[0]; + i = rx->offs[0].start; s = rx->subbeg; } else - if (paren == -1 && rx->endp[0] != -1) { + if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { /* $' */ - s = rx->subbeg + rx->endp[0]; - i = rx->sublen - rx->endp[0]; + s = rx->subbeg + rx->offs[0].end; + i = rx->sublen - rx->offs[0].end; } else if ( 0 <= paren && paren <= (I32)rx->nparens && - (s1 = rx->startp[paren]) != -1 && - (t1 = rx->endp[paren]) != -1) + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) { /* $& $1 ... */ i = t1 - s1; s = rx->subbeg + s1; } else { sv_setsv(sv,&PL_sv_undef); - return sv; + return; } assert(rx->sublen >= (s - rx->subbeg) + i ); if (i >= 0) { @@ -4803,16 +5206,16 @@ Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) sv_setpvn(sv, s, i); PL_tainted = oldtainted; if ( (rx->extflags & RXf_CANY_SEEN) - ? (RX_MATCH_UTF8(rx) + ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) - : (RX_MATCH_UTF8(rx)) ) + : (RXp_MATCH_UTF8(rx)) ) { SvUTF8_on(sv); } else SvUTF8_off(sv); if (PL_tainting) { - if (RX_MATCH_TAINTED(rx)) { + if (RXp_MATCH_TAINTED(rx)) { if (SvTYPE(sv) >= SVt_PVMG) { MAGIC* const mg = SvMAGIC(sv); MAGIC* mgt; @@ -4832,10 +5235,95 @@ Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) } } else { sv_setsv(sv,&PL_sv_undef); + return; + } +} + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak(aTHX_ "%s", PL_no_modify); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, + const I32 paren) +{ + struct regexp *const rx = (struct regexp *)SvANY(r); + I32 i; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + + /* Some of this code was originally in C in F */ + switch (paren) { + /* $` / ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + /* $' / ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + /* $& / ${^MATCH}, $1, $2, ... */ + default: + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; + } + } + getlen: + if (i > 0 && RXp_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; } - return sv; + return i; } +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_ARGS_ASSERT_REG_QR_PACKAGE; + PERL_UNUSED_ARG(rx); + if (0) + return NULL; + else + return newSVpvs("Regexp"); +} /* Scans the name of a named buffer from the pattern. * If flags is REG_RSN_RETURN_NULL returns null. @@ -4850,9 +5338,12 @@ Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) #define REG_RSN_RETURN_DATA 2 STATIC SV* -S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) +{ char *name_start = RExC_parse; + PERL_ARGS_ASSERT_REG_SCAN_NAME; + if (isIDFIRST_lazy_if(RExC_parse, UTF)) { /* skip IDFIRST by using do...while */ if (UTF) @@ -4866,10 +5357,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { } if ( flags ) { - SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, - (int)(RExC_parse - name_start))); - if (UTF) - SvUTF8_on(sv_name); + SV* sv_name + = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), + SVs_TEMP | (UTF ? SVf_UTF8 : 0)); if ( flags == REG_RSN_RETURN_NAME) return sv_name; else if (flags==REG_RSN_RETURN_DATA) { @@ -4965,7 +5455,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) register regnode *ender = NULL; register I32 parno = 0; I32 flags; - const I32 oregflags = RExC_flags; + U32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; I32 freeze_paren = 0; @@ -4984,6 +5474,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char * const oregcomp_parse = RExC_parse; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REG; DEBUG_PARSE("reg "); *flagp = 0; /* Tentatively. */ @@ -5125,7 +5617,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; ret = reganode(pRExC_state, @@ -5173,10 +5665,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) "panic: reg_scan_name returned NULL"); if (!RExC_paren_names) { RExC_paren_names= newHV(); - sv_2mortal((SV*)RExC_paren_names); + sv_2mortal(MUTABLE_SV(RExC_paren_names)); #ifdef DEBUGGING RExC_paren_name_list= newAV(); - sv_2mortal((SV*)RExC_paren_name_list); + sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); #endif } he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); @@ -5205,13 +5697,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); pv[count] = RExC_npar; - SvIVX(sv_dat)++; + SvIV_set(sv_dat, SvIVX(sv_dat) + 1); } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); SvIOK_on(sv_dat); - SvIVX(sv_dat)= 1; + SvIV_set(sv_dat, 1); } #ifdef DEBUGGING if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) @@ -5227,6 +5719,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen |= REG_SEEN_LOOKBEHIND; RExC_parse++; case '=': /* (?=...) */ + RExC_seen_zerolen++; + break; case '!': /* (?!...) */ RExC_seen_zerolen++; if (*RExC_parse == ')') { @@ -5460,7 +5954,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } ret = reganode(pRExC_state,NGROUPP,num); goto insert_if_check_paren; @@ -5564,8 +6058,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) and must be globally applied -- japhy */ switch (*RExC_parse) { CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); - case 'o': - case 'g': + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5582,7 +6076,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; - case 'c': + case CONTINUE_PAT_MOD: /* 'c' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5595,10 +6089,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } break; - case 'k': + case KEEPCOPY_PAT_MOD: /* 'p' */ if (flagsp == &negflags) { - if (SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse + 1,"Useless use of (?-k)"); + if (SIZE_ONLY) + ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); } else { *flagsp |= RXf_PMf_KEEPCOPY; } @@ -5618,6 +6112,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case ')': RExC_flags |= posflags; RExC_flags &= ~negflags; + if (paren != ':') { + oregflags |= posflags; + oregflags &= ~negflags; + } nextchar(pRExC_state); if (paren != ':') { *flagp = TRYAGAIN; @@ -5666,6 +6164,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1,depth+1); + + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } + /* branch_len = (paren != 0); */ if (br == NULL) @@ -5824,6 +6329,9 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) register regnode *latest; I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGBRANCH; + DEBUG_PARSE("brnc"); if (first) @@ -5899,6 +6407,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; const char *maxpos = NULL; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGPIECE; + DEBUG_PARSE("piec"); ret = regatom(pRExC_state, &flags,depth+1); @@ -5976,7 +6487,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = WORST; if (max > 0) *flagp |= HASWIDTH; - if (max && max < min) + if (max < min) vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, (U16)min); @@ -6036,11 +6547,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } nest_check: - if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { - vWARN3(RExC_parse, - "%.*s matches null string many times", - (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), - origparse); + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + ckWARN3reg(RExC_parse, + "%.*s matches null string many times", + (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), + origparse); } if (RExC_parse < RExC_end && *RExC_parse == '?') { @@ -6075,7 +6586,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* reg_namedseq(pRExC_state,UVp) This is expected to be called by a parser routine that has - recognized'\N' and needs to handle the rest. RExC_parse is + recognized '\N' and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. @@ -6089,11 +6600,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) be returned to indicate failure. (This will NOT be a valid pointer to a regnode.) - If value is null then it is assumed that we are parsing normal text + If valuep is null then it is assumed that we are parsing normal text and inserts a new EXACT node into the program containing the resolved string and returns a pointer to the new node. If the string is zerolength a NOTHING node is emitted. - + On success RExC_parse is set to the char following the endbrace. Parsing failures will generate a fatal errorvia vFAIL(...) @@ -6107,7 +6618,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) */ STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) { char * name; /* start of the content of the name */ char * endbrace; /* endbrace following the name */ @@ -6116,9 +6627,25 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) STRLEN len; /* this has various purposes throughout the code */ bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */ regnode *ret = NULL; - - if (*RExC_parse != '{') { - vFAIL("Missing braces on \\N{}"); + + PERL_ARGS_ASSERT_REG_NAMEDSEQ; + + if (*RExC_parse != '{' || + (*RExC_parse == '{' && RExC_parse[1] + && strchr("0123456789", RExC_parse[1]))) + { + GET_RE_DEBUG_FLAGS_DECL; + if (valuep) + /* no bare \N in a charclass */ + vFAIL("Missing braces on \\N{}"); + GET_RE_DEBUG_FLAGS; + nextchar(pRExC_state); + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + RExC_parse--; + Set_Node_Length(ret, 1); /* MJD */ + return ret; } name = RExC_parse+1; endbrace = strchr(RExC_parse, '}'); @@ -6132,7 +6659,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) /* RExC_parse points at the beginning brace, endbrace points at the last */ if ( name[0]=='U' && name[1]=='+' ) { - /* its a "unicode hex" notation {U+89AB} */ + /* its a "Unicode hex" notation {U+89AB} */ I32 fl = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); @@ -6142,13 +6669,25 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) if ( len != (STRLEN)(endbrace - name - 2) ) { cp = 0xFFFD; } - if (cp > 0xff) - RExC_utf8 = 1; if ( valuep ) { + if (cp > 0xff) RExC_utf8 = 1; *valuep = cp; return NULL; } - sv_str= Perl_newSVpvf_nocontext("%c",(int)cp); + + /* Need to convert to utf8 if either: won't fit into a byte, or the re + * is going to be in utf8 and the representation changes under utf8. */ + if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) { + U8 string[UTF8_MAXBYTES+1]; + U8 *tmps; + RExC_utf8 = 1; + tmps = uvuni_to_utf8(string, cp); + sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE); + } else { /* Otherwise, no need for utf8, can skip that step */ + char string; + string = (char)cp; + sv_str= newSVpvn(&string, 1); + } } else { /* fetch the charnames handler for this scope */ HV * const table = GvHV(PL_hintgv); @@ -6162,13 +6701,13 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) sv_name = newSVpvn(name, endbrace - name); if (!table || !(PL_hints & HINT_LOCALIZE_HH)) { - vFAIL2("Constant(\\N{%s}) unknown: " + vFAIL2("Constant(\\N{%" SVf "}) unknown: " "(possibly a missing \"use charnames ...\")", - SvPVX(sv_name)); + SVfARG(sv_name)); } if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */ - vFAIL2("Constant(\\N{%s}): " - "$^H{charnames} is not defined",SvPVX(sv_name)); + vFAIL2("Constant(\\N{%" SVf "}): " + "$^H{charnames} is not defined", SVfARG(sv_name)); } @@ -6176,7 +6715,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) if (!RExC_charnames) { /* make sure our cache is allocated */ RExC_charnames = newHV(); - sv_2mortal((SV*)RExC_charnames); + sv_2mortal(MUTABLE_SV(RExC_charnames)); } /* see if we have looked this one up before */ he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 ); @@ -6207,8 +6746,8 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) LEAVE ; if ( !sv_str || !SvOK(sv_str) ) { - vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} " - "did not return a defined value",SvPVX(sv_name)); + vFAIL2("Constant(\\N{%" SVf "}): Call to &{$^H{charnames}} " + "did not return a defined value", SVfARG(sv_name)); } if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0)) cached = 1; @@ -6239,20 +6778,19 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) *valuep = (UV)*p; /* warn if we havent used the whole string? */ } - if (numlen -- rgs + */ + RExC_seen |= REG_SEEN_LOOKBEHIND; goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -6579,15 +7141,25 @@ tryagain: ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'h': + ret = reg_node(pRExC_state, HORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'H': + ret = reg_node(pRExC_state, NHORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; case 'v': - ret = reganode(pRExC_state, PRUNE, 0); - ret->flags = 1; - *flagp |= SIMPLE; + ret = reg_node(pRExC_state, VERTWS); + *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'V': - ret = reganode(pRExC_state, SKIP, 0); - ret->flags = 1; - *flagp |= SIMPLE; + ret = reg_node(pRExC_state, NVERTWS); + *flagp |= HASWIDTH|SIMPLE; finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ @@ -6630,12 +7202,12 @@ tryagain: } break; case 'N': - /* Handle \N{NAME} here and not below because it can be + /* Handle \N and \N{NAME} here and not below because it can be multicharacter. join_exact() will join them up later on. Also this makes sure that things like /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq*/ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL); + ret= reg_namedseq(pRExC_state, NULL, flagp); break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: @@ -6658,7 +7230,7 @@ tryagain: if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; @@ -6699,6 +7271,8 @@ tryagain: goto parse_named_seq; } } num = atoi(RExC_parse); + if (isg && num == 0) + vFAIL("Reference to invalid group 0"); if (isrel) { num = RExC_npar - num; if (num < 1) @@ -6754,7 +7328,8 @@ tryagain: } /* FALL THROUGH */ - default: { + default: + outer_default:{ register STRLEN len; register UV ender; register char *p; @@ -6779,7 +7354,12 @@ tryagain: if (RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - switch (*p) { + switch ((U8)*p) { + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case '^': case '$': case '.': @@ -6802,18 +7382,25 @@ tryagain: an unescaped equivalent literal. */ - switch (*++p) { + switch ((U8)*++p) { /* These are all the special escapes. */ + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case 'A': /* Start assertion */ case 'b': case 'B': /* Word-boundary assertion*/ case 'C': /* Single char !DANGEROUS! */ case 'd': case 'D': /* digit class */ case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ case 'k': case 'K': /* named backref, keep marker */ case 'N': /* named char sequence */ - case 'p': case 'P': /* unicode property */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ case 's': case 'S': /* space class */ - case 'v': case 'V': /* (*PRUNE) and (*SKIP) */ + case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ case 'X': /* eXtended Unicode "combining character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ @@ -6886,6 +7473,18 @@ tryagain: I32 flags = 0; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); + + /* An octal above 0xff is interpreted differently + * depending on if the re is in utf8 or not. If it + * is in utf8, the value will be itself, otherwise + * it is interpreted as modulo 0x100. It has been + * decided to discourage the use of octal above the + * single-byte range. For now, warn only when + * it ends up modulo */ + if (SIZE_ONLY && ender >= 0x100 + && ! UTF && ! PL_encoding) { + ckWARNregdep(p, "Use of octal value above 377 is deprecated"); + } p += numlen; } else { @@ -6899,8 +7498,8 @@ tryagain: { SV* enc = PL_encoding; ender = reg_recode((const char)(U8)ender, &enc); - if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(p, "Invalid escape in the specified encoding"); + if (!enc && SIZE_ONLY) + ckWARNreg(p, "Invalid escape in the specified encoding"); RExC_utf8 = 1; } break; @@ -6909,8 +7508,8 @@ tryagain: FAIL("Trailing \\"); /* FALL THROUGH */ default: - if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP)) - vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); + if (!SIZE_ONLY&& isALPHA(*p)) + ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); goto normal_default; } break; @@ -7037,6 +7636,9 @@ STATIC char * S_regwhite( RExC_state_t *pRExC_state, char *p ) { const char *e = RExC_end; + + PERL_ARGS_ASSERT_REGWHITE; + while (p < e) { if (isSPACE(*p)) ++p; @@ -7073,6 +7675,8 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) dVAR; I32 namedclass = OOB_NAMEDCLASS; + PERL_ARGS_ASSERT_REGPPOSIXCC; + if (value == '[' && RExC_parse + 1 < RExC_end && /* I smell either [: or [= or [. -- POSIX has been here, right? */ POSIXCC(UCHARAT(RExC_parse))) { @@ -7187,6 +7791,9 @@ STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { dVAR; + + PERL_ARGS_ASSERT_CHECKPOSIXCC; + if (POSIXCC(UCHARAT(RExC_parse))) { const char *s = RExC_parse; const char c = *s++; @@ -7194,10 +7801,9 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) while (isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - if (ckWARN(WARN_REGEXP)) - vWARN3(s+2, - "POSIX syntax [%c %c] belongs inside character classes", - c, c); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); /* [[=foo=]] and [[.foo.]] are still future. */ if (POSIXCC_NOTYET(c)) { @@ -7236,6 +7842,37 @@ case ANYOF_N##NAME: \ what = WORD; \ break +#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ +ANYOF_##NAME: \ + for (value = 0; value < 256; value++) \ + if (TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '+'; \ + what = WORD; \ + break; \ +case ANYOF_N##NAME: \ + for (value = 0; value < 256; value++) \ + if (!TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '!'; \ + what = WORD; \ + break + +/* + We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test + so that it is possible to override the option here without having to + rebuild the entire core. as we are required to do if we change regcomp.h + which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. +*/ +#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS +#define BROKEN_UNICODE_CHARCLASS_MAPPINGS +#endif + +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS +#define POSIX_CC_UNI_NAME(CCNAME) CCNAME +#else +#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME +#endif /* parse a class specification and produce either an ANYOF node that @@ -7248,10 +7885,10 @@ STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) { dVAR; - register UV value = 0; register UV nextvalue; register IV prevvalue = OOB_UNICODE; register IV range = 0; + UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ register regnode *ret; STRLEN numlen; IV namedclass; @@ -7270,6 +7907,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGCLASS; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -7354,6 +7993,10 @@ parseit: case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { /* We only pay attention to the first char of @@ -7362,7 +8005,7 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v)) { + if (reg_namedseq(pRExC_state, &v, NULL)) { goto parseit; } value= v; @@ -7456,16 +8099,16 @@ parseit: { SV* enc = PL_encoding; value = reg_recode((const char)(U8)value, &enc); - if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse, - "Invalid escape in the specified encoding"); + if (!enc && SIZE_ONLY) + ckWARNreg(RExC_parse, + "Invalid escape in the specified encoding"); break; } default: - if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP)) - vWARN2(RExC_parse, - "Unrecognized escape \\%c in character class passed through", - (int)value); + if (!SIZE_ONLY && isALPHA(value)) + ckWARN2reg(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } } /* end of \blah */ @@ -7484,14 +8127,13 @@ parseit: /* a bad range like a-\d, a-[:digit:] ? */ if (range) { if (!SIZE_ONLY) { - if (ckWARN(WARN_REGEXP)) { - const int w = - RExC_parse >= rangebegin ? - RExC_parse - rangebegin : 0; - vWARN4(RExC_parse, + const int w = + RExC_parse >= rangebegin ? + RExC_parse - rangebegin : 0; + ckWARN4reg(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); - } + if (prevvalue < 256) { ANYOF_BITMAP_SET(ret, prevvalue); ANYOF_BITMAP_SET(ret, '-'); @@ -7519,19 +8161,27 @@ parseit: * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { + + case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); + case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); + case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); + case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); + case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); + case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); + case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); + case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); + case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); + case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); +#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS case _C_C_T_(ALNUM, isALNUM(value), "Word"); - case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum"); - case _C_C_T_(ALPHA, isALPHA(value), "Alpha"); - case _C_C_T_(BLANK, isBLANK(value), "Blank"); - case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl"); - case _C_C_T_(GRAPH, isGRAPH(value), "Graph"); - case _C_C_T_(LOWER, isLOWER(value), "Lower"); - case _C_C_T_(PRINT, isPRINT(value), "Print"); - case _C_C_T_(PSXSPC, isPSXSPC(value), "Space"); - case _C_C_T_(PUNCT, isPUNCT(value), "Punct"); case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); - case _C_C_T_(UPPER, isUPPER(value), "Upper"); +#else + case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); + case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); +#endif case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); + case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); + case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); case ANYOF_ASCII: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ASCII); @@ -7575,7 +8225,7 @@ parseit: ANYOF_BITMAP_SET(ret, value); } yesno = '+'; - what = "Digit"; + what = POSIX_CC_UNI_NAME("Digit"); break; case ANYOF_NDIGIT: if (LOC) @@ -7588,7 +8238,7 @@ parseit: ANYOF_BITMAP_SET(ret, value); } yesno = '!'; - what = "Digit"; + what = POSIX_CC_UNI_NAME("Digit"); break; case ANYOF_MAX: /* this is to handle \p and \P */ @@ -7653,12 +8303,16 @@ parseit: { if (isLOWER(prevvalue)) { for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i)) + if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } else { for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i)) + if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } } else @@ -7723,8 +8377,8 @@ parseit: if (!unicode_alternate) unicode_alternate = newAV(); - sv = newSVpvn((char*)foldbuf, foldlen); - SvUTF8_on(sv); + sv = newSVpvn_utf8((char*)foldbuf, foldlen, + TRUE); av_push(unicode_alternate, sv); } } @@ -7784,6 +8438,7 @@ parseit: *STRING(ret)= (char)value; STR_LEN(ret)= 1; RExC_emit += STR_SZ(1); + SvREFCNT_dec(listsv); return ret; } /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ @@ -7820,8 +8475,8 @@ parseit: * used later (regexec.c:S_reginclass()). */ av_store(av, 0, listsv); av_store(av, 1, NULL); - av_store(av, 2, (SV*)unicode_alternate); - rv = newRV_noinc((SV*)av); + av_store(av, 2, MUTABLE_SV(unicode_alternate)); + rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; ARG_SET(ret, n); @@ -7847,6 +8502,9 @@ STATIC bool S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) { bool ended = 0; + + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + while (RExC_parse < RExC_end) if (*RExC_parse++ == '\n') { ended = 1; @@ -7879,6 +8537,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) { char* const retval = RExC_parse++; + PERL_ARGS_ASSERT_NEXTCHAR; + for (;;) { if (*RExC_parse == '(' && RExC_parse[1] == '?' && RExC_parse[2] == '#') { @@ -7915,6 +8575,8 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REG_NODE; + if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 1; @@ -7954,6 +8616,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGANODE; + if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); RExC_size += 2; @@ -8004,6 +8668,9 @@ STATIC STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) { dVAR; + + PERL_ARGS_ASSERT_REGUNI; + return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); } @@ -8022,6 +8689,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGINSERT; PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); @@ -8104,6 +8773,8 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de dVAR; register regnode *scan; GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_REGTAIL; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -8164,9 +8835,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, #ifdef EXPERIMENTAL_INPLACESCAN I32 min = 0; #endif - GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGTAIL_STUDY; + if (SIZE_ONLY) return exact; @@ -8236,6 +8908,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, STATIC I32 S_regcurly(register const char *s) { + PERL_ARGS_ASSERT_REGCURLY; + if (*s++ != '{') return FALSE; if (!isDIGIT(*s)) @@ -8255,6 +8929,29 @@ S_regcurly(register const char *s) /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ +#ifdef DEBUGGING +static void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + for (bit=0; bit<32; bit++) { + if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); @@ -8336,7 +9036,9 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else + PERL_ARGS_ASSERT_REGDUMP; PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); #endif /* DEBUGGING */ @@ -8345,6 +9047,17 @@ Perl_regdump(pTHX_ const regexp *r) /* - regprop - printable representation of opcode */ +#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \ +STMT_START { \ + if (do_sep) { \ + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \ + if (flags & ANYOF_INVERT) \ + /*make sure the invert info is in each */ \ + sv_catpvs(sv, "^"); \ + do_sep = 0; \ + } \ +} STMT_END + void Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) { @@ -8354,8 +9067,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGPROP; - sv_setpvn(sv, "", 0); + sv_setpvs(sv, ""); if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from @@ -8366,19 +9080,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) k = PL_regkind[OP(o)]; if (k == EXACT) { - SV * const dsv = sv_2mortal(newSVpvs("")); + sv_catpvs(sv, " "); /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) * is a crude hack but it may be the best for now since * we have no flag "this EXACTish node was UTF-8" * --jhi */ - const char * const s = - pv_pretty(dsv, STRING(o), STR_LEN(o), 60, - PL_colors[0], PL_colors[1], - PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_PRETTY_ELIPSES | - PERL_PV_PRETTY_LTGT - ); - Perl_sv_catpvf(aTHX_ sv, " %s", s ); + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); } else if (k == TRIE) { /* print the details of the trie in dumpuntil instead, as * progi->data isn't available here */ @@ -8407,7 +9119,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) int i; int rangestart = -1; U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); - Perl_sv_catpvf(aTHX_ sv, "["); + sv_catpvs(sv, "["); for (i = 0; i <= 256; i++) { if (i < 256 && BITMAP_TEST(bitmap,i)) { if (rangestart == -1) @@ -8424,7 +9136,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) rangestart = -1; } } - Perl_sv_catpvf(aTHX_ sv, "]"); + sv_catpvs(sv, "]"); } } else if (k == CURLY) { @@ -8436,16 +9148,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ - if ( prog->paren_names ) { + if ( RXp_PAREN_NAMES(prog) ) { if ( k != REF || OP(o) < NREF) { - AV *list= (AV *)progi->data->data[progi->name_list_idx]; + AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); SV **name= av_fetch(list, ARG(o), 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } else { - AV *list= (AV *)progi->data->data[ progi->name_list_idx ]; - SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ]; + AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); I32 *nums=(I32*)SvPVX(sv_dat); SV **name= av_fetch(list, nums[0], 0 ); I32 n; @@ -8463,12 +9175,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) else if (k == VERB) { if (!o->flags) Perl_sv_catpvf(aTHX_ sv, ":%"SVf, - SVfARG((SV*)progi->data->data[ ARG( o ) ])); + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + else if (k == FOLDCHAR) + Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); + int do_sep = 0; /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ static const char * const anyofs[] = { @@ -8484,8 +9199,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) "[:^alpha:]", "[:ascii:]", "[:^ascii:]", - "[:ctrl:]", - "[:^ctrl:]", + "[:cntrl:]", + "[:^cntrl:]", "[:graph:]", "[:^graph:]", "[:lower:]", @@ -8511,6 +9226,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) sv_catpvs(sv, "^"); + + /* output what the standard cp 0-255 bitmap matches */ for (i = 0; i <= 256; i++) { if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { if (rangestart == -1) @@ -8524,15 +9241,23 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) sv_catpvs(sv, "-"); put_byte(sv, i - 1); } + do_sep = 1; rangestart = -1; } } - + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + /* output any special charclass tests (used mostly under use locale) */ if (o->flags & ANYOF_CLASS) for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) - if (ANYOF_CLASS_TEST(o,i)) + if (ANYOF_CLASS_TEST(o,i)) { sv_catpv(sv, anyofs[i]); - + do_sep = 1; + } + + EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + + /* output information about the unicode matching */ if (flags & ANYOF_UNICODE) sv_catpvs(sv, "{unicode}"); else if (flags & ANYOF_UNICODE_ALL) @@ -8545,7 +9270,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (lv) { if (sw) { U8 s[UTF8_MAXBYTES_CASE+1]; - + for (i = 0; i <= 256; i++) { /* just the first 256 */ uvchr_to_utf8(s, i); @@ -8616,10 +9341,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } SV * -Perl_re_intuit_string(pTHX_ regexp *prog) +Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ dVAR; + struct regexp *const prog = (struct regexp *)SvANY(r); GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_RE_INTUIT_STRING; PERL_UNUSED_CONTEXT; DEBUG_COMPILE_r( @@ -8655,45 +9383,38 @@ Perl_re_intuit_string(pTHX_ regexp *prog) */ #ifndef PERL_IN_XSUB_RE void -Perl_pregfree(pTHX_ struct regexp *r) +Perl_pregfree(pTHX_ REGEXP *r) +{ + SvREFCNT_dec(r); +} + +void +Perl_pregfree2(pTHX_ REGEXP *rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); GET_RE_DEBUG_FLAGS_DECL; - if (!r || (--r->refcnt > 0)) - return; + PERL_ARGS_ASSERT_PREGFREE2; + if (r->mother_re) { ReREFCNT_dec(r->mother_re); } else { - CALLREGFREE_PVT(r); /* free the private data */ - if (r->paren_names) - SvREFCNT_dec(r->paren_names); - Safefree(r->wrapped); + CALLREGFREE_PVT(rx); /* free the private data */ + SvREFCNT_dec(RXp_PAREN_NAMES(r)); } if (r->substrs) { - if (r->anchored_substr) - SvREFCNT_dec(r->anchored_substr); - if (r->anchored_utf8) - SvREFCNT_dec(r->anchored_utf8); - if (r->float_substr) - SvREFCNT_dec(r->float_substr); - if (r->float_utf8) - SvREFCNT_dec(r->float_utf8); + SvREFCNT_dec(r->anchored_substr); + SvREFCNT_dec(r->anchored_utf8); + SvREFCNT_dec(r->float_substr); + SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } - RX_MATCH_COPY_FREE(r); + RX_MATCH_COPY_FREE(rx); #ifdef PERL_OLD_COPY_ON_WRITE - if (r->saved_copy) - SvREFCNT_dec(r->saved_copy); + SvREFCNT_dec(r->saved_copy); #endif - if (r->swap) { - Safefree(r->swap->startp); - Safefree(r->swap->endp); - Safefree(r->swap); - } - Safefree(r->startp); - Safefree(r->endp); - Safefree(r); + Safefree(r->offs); } /* reg_temp_copy() @@ -8713,41 +9434,50 @@ Perl_pregfree(pTHX_ struct regexp *r) */ -regexp * -Perl_reg_temp_copy (pTHX_ struct regexp *r) { - regexp *ret; +REGEXP * +Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) +{ + struct regexp *ret; + struct regexp *const r = (struct regexp *)SvANY(rx); register const I32 npar = r->nparens+1; - (void)ReREFCNT_inc(r); - Newx(ret, 1, regexp); - StructCopy(r, ret, regexp); - Newx(ret->startp, npar, I32); - Copy(r->startp, ret->startp, npar, I32); - Newx(ret->endp, npar, I32); - Copy(r->endp, ret->endp, npar, I32); - ret->refcnt = 1; + + PERL_ARGS_ASSERT_REG_TEMP_COPY; + + if (!ret_x) + ret_x = (REGEXP*) newSV_type(SVt_REGEXP); + ret = (struct regexp *)SvANY(ret_x); + + (void)ReREFCNT_inc(rx); + /* We can take advantage of the existing "copied buffer" mechanism in SVs + by pointing directly at the buffer, but flagging that the allocated + space in the copy is zero. As we've just done a struct copy, it's now + a case of zero-ing that, rather than copying the current length. */ + SvPV_set(ret_x, RX_WRAPPED(rx)); + SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); + memcpy(&(ret->xpv_cur), &(r->xpv_cur), + sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); + SvLEN_set(ret_x, 0); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); if (r->substrs) { - struct reg_substr_datum *s; - I32 i; Newx(ret->substrs, 1, struct reg_substr_data); - for (s = ret->substrs->data, i = 0; i < 3; i++, s++) { - s->min_offset = r->substrs->data[i].min_offset; - s->max_offset = r->substrs->data[i].max_offset; - s->end_shift = r->substrs->data[i].end_shift; - s->substr = SvREFCNT_inc(r->substrs->data[i].substr); - s->utf8_substr = SvREFCNT_inc(r->substrs->data[i].utf8_substr); - } - } - RX_MATCH_COPIED_off(ret); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + } + RX_MATCH_COPIED_off(ret_x); #ifdef PERL_OLD_COPY_ON_WRITE - /* this is broken. */ - assert(0); - if (ret->saved_copy) - ret->saved_copy=NULL; + ret->saved_copy = NULL; #endif - ret->mother_re = r; - ret->swap = NULL; + ret->mother_re = rx; - return ret; + return ret_x; } #endif @@ -8764,19 +9494,22 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { */ void -Perl_regfree_internal(pTHX_ struct regexp *r) +Perl_regfree_internal(pTHX_ REGEXP * const rx) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; - + + PERL_ARGS_ASSERT_REGFREE_INTERNAL; + DEBUG_COMPILE_r({ if (!PL_colorset) reginitcolors(); { SV *dsv= sv_newmortal(); - RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), - dsv, r->precomp, r->prelen, 60); + RE_PV_QUOTED_DECL(s, RX_UTF8(rx), + dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } @@ -8797,13 +9530,13 @@ Perl_regfree_internal(pTHX_ struct regexp *r) case 's': case 'S': case 'u': - SvREFCNT_dec((SV*)ri->data->data[n]); + SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); break; case 'f': Safefree(ri->data->data[n]); break; case 'p': - new_comppad = (AV*)ri->data->data[n]; + new_comppad = MUTABLE_AV(ri->data->data[n]); break; case 'o': if (new_comppad == NULL) @@ -8819,7 +9552,7 @@ Perl_regfree_internal(pTHX_ struct regexp *r) op_free((OP_4tree*)ri->data->data[n]); PAD_RESTORE_LOCAL(old_comppad); - SvREFCNT_dec((SV*)new_comppad); + SvREFCNT_dec(MUTABLE_SV(new_comppad)); new_comppad = NULL; break; case 'n': @@ -8878,17 +9611,16 @@ Perl_regfree_internal(pTHX_ struct regexp *r) } #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) -#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t)) -#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t)) +#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t))) +#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t))) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* - regdupe - duplicate a regexp. - - This routine is called by sv.c's re_dup and is expected to clone a - given regexp structure. It is a no-op when not under USE_ITHREADS. - (Originally this *was* re_dup() for change history see sv.c) + re_dup - duplicate a regexp. + This routine is expected to clone a given regexp structure. It is only + compiled under USE_ITHREADS. + After all of the core data stored in struct regexp is duplicated the regexp_engine.dupe method is used to copy any private data stored in the *pprivate pointer. This allows extensions to handle @@ -8898,83 +9630,90 @@ Perl_regfree_internal(pTHX_ struct regexp *r) */ #if defined(USE_ITHREADS) #ifndef PERL_IN_XSUB_RE -regexp * -Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) +void +Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) { dVAR; - regexp *ret; - I32 i, npar; - struct reg_substr_datum *s; - - if (!r) - return (REGEXP *)NULL; - - if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r))) - return ret; - + I32 npar; + const struct regexp *r = (const struct regexp *)SvANY(sstr); + struct regexp *ret = (struct regexp *)SvANY(dstr); + PERL_ARGS_ASSERT_RE_DUP_GUTS; + npar = r->nparens+1; - Newxz(ret, 1, regexp); - Newx(ret->startp, npar, I32); - Copy(r->startp, ret->startp, npar, I32); - Newx(ret->endp, npar, I32); - Copy(r->endp, ret->endp, npar, I32); - if(r->swap) { - Newx(ret->swap, 1, regexp_paren_ofs); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + if(ret->swap) { /* no need to copy these */ - Newx(ret->swap->startp, npar, I32); - Newx(ret->swap->endp, npar, I32); - } else { - ret->swap = NULL; + Newx(ret->swap, npar, regexp_paren_pair); } - if (r->substrs) { + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr + ? r->check_substr == r->anchored_substr + : r->check_utf8 == r->anchored_utf8; Newx(ret->substrs, 1, struct reg_substr_data); - for (s = ret->substrs->data, i = 0; i < 3; i++, s++) { - s->min_offset = r->substrs->data[i].min_offset; - s->max_offset = r->substrs->data[i].max_offset; - s->end_shift = r->substrs->data[i].end_shift; - s->substr = sv_dup_inc(r->substrs->data[i].substr, param); - s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param); - } - } else - ret->substrs = NULL; - - ret->wrapped = SAVEPVN(r->wrapped, r->wraplen+1); - ret->precomp = ret->wrapped + (r->precomp - r->wrapped); - ret->prelen = r->prelen; - ret->wraplen = r->wraplen; - - ret->mother_re = NULL; - ret->refcnt = r->refcnt; - ret->minlen = r->minlen; - ret->minlenret = r->minlenret; - ret->nparens = r->nparens; - ret->lastparen = r->lastparen; - ret->lastcloseparen = r->lastcloseparen; - ret->intflags = r->intflags; - ret->extflags = r->extflags; - - ret->sublen = r->sublen; - - ret->engine = r->engine; - - ret->paren_names = hv_dup_inc(r->paren_names, param); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); - if (RX_MATCH_COPIED(ret)) - ret->subbeg = SAVEPVN(r->subbeg, r->sublen); + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } else if (ret->check_utf8) { + if (anchored) { + ret->check_utf8 = ret->anchored_utf8; + } else { + ret->check_utf8 = ret->float_utf8; + } + } + } + + RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); + + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); + + if (RX_MATCH_COPIED(dstr)) + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; #ifdef PERL_OLD_COPY_ON_WRITE ret->saved_copy = NULL; #endif - - ret->pprivate = r->pprivate; - if (ret->pprivate) - RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); - - ptr_table_store(PL_ptr_table, r, ret); - return ret; + + if (ret->mother_re) { + if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) { + /* Our storage points directly to our mother regexp, but that's + 1: a buffer in a different thread + 2: something we no longer hold a reference on + so we need to copy it locally. */ + /* Note we need to sue SvCUR() on our mother_re, because it, in + turn, may well be pointing to its own mother_re. */ + SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re), + SvCUR(ret->mother_re)+1)); + SvLEN_set(dstr, SvCUR(ret->mother_re)+1); + } + ret->mother_re = NULL; + } + ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ @@ -8993,17 +9732,20 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) */ void * -Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) +Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) { dVAR; + struct regexp *const r = (struct regexp *)SvANY(rx); regexp_internal *reti; int len, npar; RXi_GET_DECL(r,ri); + + PERL_ARGS_ASSERT_REGDUPE_INTERNAL; npar = r->nparens+1; len = ProgLen(ri); - Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); + Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); @@ -9028,7 +9770,7 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) case 'S': case 'p': /* actually an AV, but the dup function is identical. */ case 'u': /* actually an HV, but the dup function is identical. */ - d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param); + d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); break; case 'f': /* This is cheating. */ @@ -9085,48 +9827,8 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) #endif /* USE_ITHREADS */ -/* - reg_stringify() - - converts a regexp embedded in a MAGIC struct to its stringified form, - caching the converted form in the struct and returns the cached - string. - - If lp is nonnull then it is used to return the length of the - resulting string - - If flags is nonnull and the returned string contains UTF8 then - (*flags & 1) will be true. - - If haseval is nonnull then it is used to return whether the pattern - contains evals. - - Normally called via macro: - - CALLREG_STRINGIFY(mg,&len,&utf8); - - And internally with - - CALLREG_AS_STR(mg,&lp,&flags,&haseval) - - See sv_2pv_flags() in sv.c for an example of internal usage. - - */ #ifndef PERL_IN_XSUB_RE -char * -Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { - dVAR; - const regexp * const re = (regexp *)mg->mg_obj; - if (haseval) - *haseval = re->seen_evals; - if (flags) - *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); - if (lp) - *lp = re->wraplen; - return re->wrapped; -} - /* - regnext - dig the "next" pointer out of a node */ @@ -9157,6 +9859,8 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) SV *msv; const char *message; + PERL_ARGS_ASSERT_RE_CROAK2; + if (l1 > 510) l1 = 510; if (l1 + l2 > 510) @@ -9217,7 +9921,7 @@ Perl_save_re_context(pTHX) const REGEXP * const rx = PM_GETRE(PL_curpm); if (rx) { U32 i; - for (i = 1; i <= rx->nparens; i++) { + for (i = 1; i <= RX_NPARENS(rx); i++) { char digits[TYPE_CHARS(long)]; const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i); GV *const *const gvp @@ -9238,7 +9942,7 @@ static void clear_re(pTHX_ void *r) { dVAR; - ReREFCNT_dec((regexp *)r); + ReREFCNT_dec((REGEXP *)r); } #ifdef DEBUGGING @@ -9246,12 +9950,26 @@ clear_re(pTHX_ void *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 255 || !isPRINT(c)) + PERL_ARGS_ASSERT_PUT_BYTE; + + /* Our definition of isPRINT() ignores locales, so only bytes that are + not part of UTF-8 are considered printable. I assume that the same + holds for UTF-EBCDIC. + Also, code point 255 is not printable in either (it's E0 in EBCDIC, + which Wikipedia says: + + EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all + ones (binary 1111 1111, hexadecimal FF). It is similar, but not + identical, to the ASCII delete (DEL) or rubout control character. + ) So the old condition can be simplified to !isPRINT(c) */ + if (!isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); - else if (c == '-' || c == ']' || c == '\\' || c == '^') - Perl_sv_catpvf(aTHX_ sv, "\\%c", c); - else - Perl_sv_catpvf(aTHX_ sv, "%c", c); + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } } @@ -9275,7 +9993,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; - + + PERL_ARGS_ASSERT_DUMPUNTIL; + #ifdef DEBUG_DUMPUNTIL PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); @@ -9341,11 +10061,11 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const reg_trie_data * const trie = (reg_trie_data*)ri->data->data[optrie]; #ifdef DEBUGGING - AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET]; + AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); #endif const regnode *nextbranch= NULL; I32 word_idx; - sv_setpvn(sv, "", 0); + sv_setpvs(sv, ""); for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); @@ -9354,7 +10074,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, PL_colors[0], PL_colors[1], (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELIPSES | + PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT ) : "???"