From: Yves Orton Date: Fri, 12 Jan 2007 02:31:12 +0000 (+0100) Subject: Add support for /k modfier for matching along with ${^PREMATCH}, ${^MATCH}, ${^POSTMATCH} X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=cde0cee5716418bb58782f073048ee9685ed2368;p=p5sagit%2Fp5-mst-13.2.git Add support for /k modfier for matching along with ${^PREMATCH}, ${^MATCH}, ${^POSTMATCH} Message-ID: <9b18b3110701111731x29b1c63i57b1698f769b3bbc@mail.gmail.com> (with tweaks) p4raw-id: //depot/perl@29831 --- diff --git a/MANIFEST b/MANIFEST index 9b8753a..7a8aa79 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3569,6 +3569,7 @@ t/op/regexp_notrie.t See if regular expressions work without trie optimisation t/op/regexp_qr_embed.t See if regular expressions work with embedded qr// t/op/regexp_qr.t See if regular expressions work as qr// t/op/regexp.t See if regular expressions work +t/op/regexp_kmod.t See if regexp /k modifier works as expected t/op/regexp_trielist.t See if regular expressions work with trie optimisation t/op/regmesg.t See if one can get regular expression errors t/op/repeat.t See if x operator works diff --git a/ext/B/t/concise-xs.t b/ext/B/t/concise-xs.t index b4f053f..d3711cc 100644 --- a/ext/B/t/concise-xs.t +++ b/ext/B/t/concise-xs.t @@ -117,7 +117,7 @@ use Getopt::Std; use Carp; use Test::More tests => ( # per-pkg tests (function ct + require_ok) 40 + 16 # Data::Dumper, Digest::MD5 - + 517 + 238 # B::Deparse, B + + 517 + 239 # B::Deparse, B + 595 + 190 # POSIX, IO::Socket + 3 * ($] > 5.009) + 16 * ($] >= 5.009003) diff --git a/gv.c b/gv.c index 8630c1b..4332df9 100644 --- a/gv.c +++ b/gv.c @@ -1109,10 +1109,16 @@ Perl_gv_fetchpvn_flags(pTHX_ const char *nambeg, STRLEN full_len, I32 flags, if (strEQ(name2, "NCODING")) goto magicalize; break; + case '\015': /* $^MATCH */ + if (strEQ(name2, "ATCH")) + goto ro_magicalize; case '\017': /* $^OPEN */ if (strEQ(name2, "PEN")) goto magicalize; break; + case '\020': /* $^PREMATCH $^POSTMATCH */ + if 
(strEQ(name2, "REMATCH") || strEQ(name2, "OSTMATCH")) + goto ro_magicalize; case '\024': /* ${^TAINT} */ if (strEQ(name2, "AINT")) goto ro_magicalize; diff --git a/mg.c b/mg.c index 2bb9b66..86f0e12 100644 --- a/mg.c +++ b/mg.c @@ -779,8 +779,14 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) } } break; - case '\020': /* ^P */ - sv_setiv(sv, (IV)PL_perldb); + case '\020': + if (nextchar == '\0') { /* ^P */ + sv_setiv(sv, (IV)PL_perldb); + } else if (strEQ(remaining, "REMATCH")) { /* $^PREMATCH */ + goto do_prematch_fetch; + } else if (strEQ(remaining, "OSTMATCH")) { /* $^POSTMATCH */ + goto do_postmatch_fetch; + } break; case '\023': /* ^S */ if (nextchar == '\0') { @@ -847,18 +853,21 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) SvPOK_only(sv); } break; + case '\015': /* $^MATCH */ + if (strEQ(remaining, "ATCH")) { case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '&': - if (PL_curpm && (rx = PM_GETRE(PL_curpm))) { - /* - * Pre-threads, this was paren = atoi(GvENAME((GV*)mg->mg_obj)); - * XXX Does the new way break anything? - */ - paren = atoi(mg->mg_ptr); /* $& is in [0] */ - reg_numbered_buff_get( paren, rx, sv, 0); - break; + if (PL_curpm && (rx = PM_GETRE(PL_curpm))) { + /* + * Pre-threads, this was paren = atoi(GvENAME((GV*)mg->mg_obj)); + * XXX Does the new way break anything? 
+ */ + paren = atoi(mg->mg_ptr); /* $& is in [0] */ + reg_numbered_buff_get( paren, rx, sv, 0); + break; + } + sv_setsv(sv,&PL_sv_undef); } - sv_setsv(sv,&PL_sv_undef); break; case '+': if (PL_curpm && (rx = PM_GETRE(PL_curpm))) { @@ -880,6 +889,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) sv_setsv(sv,&PL_sv_undef); break; case '`': + do_prematch_fetch: if (PL_curpm && (rx = PM_GETRE(PL_curpm))) { reg_numbered_buff_get( -2, rx, sv, 0); break; @@ -887,6 +897,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) sv_setsv(sv,&PL_sv_undef); break; case '\'': + do_postmatch_fetch: if (PL_curpm && (rx = PM_GETRE(PL_curpm))) { reg_numbered_buff_get( -1, rx, sv, 0); break; diff --git a/op.h b/op.h index 1ac4aa0..d8c5478 100644 --- a/op.h +++ b/op.h @@ -371,14 +371,15 @@ struct pmop { /* The following flags have exact equivalents in regcomp.h with the prefix RXf_ * which are stored in the regexp->extflags member. */ -#define PMf_LOCALE 0x0800 /* use locale for character types */ -#define PMf_MULTILINE 0x1000 /* assume multiple lines */ -#define PMf_SINGLELINE 0x2000 /* assume single line */ -#define PMf_FOLD 0x4000 /* case insensitivity */ -#define PMf_EXTENDED 0x8000 /* chuck embedded whitespace */ +#define PMf_LOCALE 0x00800 /* use locale for character types */ +#define PMf_MULTILINE 0x01000 /* assume multiple lines */ +#define PMf_SINGLELINE 0x02000 /* assume single line */ +#define PMf_FOLD 0x04000 /* case insensitivity */ +#define PMf_EXTENDED 0x08000 /* chuck embedded whitespace */ +#define PMf_KEEPCOPY 0x10000 /* copy the string when matching */ /* mask of bits that need to be transfered to re->extflags */ -#define PMf_COMPILETIME (PMf_MULTILINE|PMf_SINGLELINE|PMf_LOCALE|PMf_FOLD|PMf_EXTENDED) +#define PMf_COMPILETIME (PMf_MULTILINE|PMf_SINGLELINE|PMf_LOCALE|PMf_FOLD|PMf_EXTENDED|PMf_KEEPCOPY) #ifdef USE_ITHREADS diff --git a/pod/perlop.pod b/pod/perlop.pod index 46af19b..7b84a68 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -1067,7 +1067,7 @@ X X X X X X X X X X X X 
-=item /PATTERN/cgimosx +=item /PATTERN/cgimosxk Searches a string for a pattern match, and in scalar context returns true if it succeeds, false if it fails. If no string is specified @@ -1080,13 +1080,15 @@ is in effect. Options are: - c Do not reset search position on a failed match when /g is in effect. - g Match globally, i.e., find all occurrences. i Do case-insensitive pattern matching. m Treat string as multiple lines. - o Compile pattern only once. s Treat string as single line. x Use extended regular expressions. + g Match globally, i.e., find all occurrences. + c Do not reset search position on a failed match when /g is in effect. + o Compile pattern only once. + k Keep a copy of the matched string so that ${^MATCH} and friends + will be defined. If "/" is the delimiter then the initial C is optional. With the C you can use any pair of non-alphanumeric, non-whitespace characters @@ -1449,7 +1451,7 @@ put comments into a multi-line C-string. For this reason, the C pragma and the B<-w> switch (that is, the C<$^W> variable) produces warnings if the STRING contains the "," or the "#" character. -=item s/PATTERN/REPLACEMENT/egimosx +=item s/PATTERN/REPLACEMENT/egimosxk X X X X X X X X X X X X @@ -1475,13 +1477,16 @@ when C is in effect. Options are: - e Evaluate the right side as an expression. - g Replace globally, i.e., all occurrences. i Do case-insensitive pattern matching. m Treat string as multiple lines. - o Compile pattern only once. s Treat string as single line. x Use extended regular expressions. + g Replace globally, i.e., all occurrences. + o Compile pattern only once. + k Keep a copy of the original string so ${^MATCH} and friends + will be defined. + e Evaluate the right side as an expression. + Any non-alphanumeric, non-whitespace delimiter may replace the slashes. 
If single quotes are used, no interpretation is done on the diff --git a/pod/perlre.pod b/pod/perlre.pod index 7133a02..d886d09 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -586,6 +586,15 @@ already paid the price. As of 5.005, C<$&> is not so costly as the other two. X<$&> X<$`> X<$'> +As a workaround for this problem, Perl 5.10 introduces C<${^PREMATCH}>, +C<${^MATCH}> and C<${^POSTMATCH}>, which are equivalent to C<$`>, C<$&> +and C<$'>, B<except> that they are only guaranteed to be defined after a +successful match that was executed with the C</k> (keep-copy) modifier. +The use of these variables incurs no global performance penalty, unlike +their punctuation char equivalents, however at the trade-off that you +have to tell perl when you want to use them. +X X + Backslashed metacharacters in Perl are alphanumeric, such as C<\b>, C<\w>, C<\n>. Unlike some other regular expression languages, there are no backslashed symbols that aren't alphanumeric. So anything @@ -639,7 +648,7 @@ whitespace formatting, a simple C<#> will suffice. Note that Perl closes the comment as soon as it sees a C<)>, so there is no way to put a literal C<)> in the comment. -=item C<(?imsx-imsx)> +=item C<(?kimsx-imsx)> X<(?)> One or more embedded pattern-match modifiers, to be turned on (or @@ -667,6 +676,11 @@ will match a repeated (I!) word C in any case, assuming C modifier, and no C modifier outside this group. +Note that the C<k> modifier is special in that it can only be enabled, +not disabled, and that its presence anywhere in a pattern has a global +effect. Thus C<(?-k)> and C<(?-k:...)> are meaningless and will warn +when executed under C<use warnings>. + =item C<(?:pattern)> X<(?:)> diff --git a/pod/perlvar.pod b/pod/perlvar.pod index a211c37..b4db654 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -228,6 +228,14 @@ performance penalty on all regular expression matches. See L. See L for a replacement.
+=item ${^MATCH} +X<${^MATCH}> + +This is similar to C<$&> (C<$MATCH>) except that it does not incur the +performance penalty associated with that variable, and is only guaranteed +to return a defined value when the pattern was compiled or executed with +the C</k> modifier. + =item $PREMATCH =item $` @@ -243,6 +251,14 @@ performance penalty on all regular expression matches. See L. See L for a replacement. +=item ${^PREMATCH} +X<${^PREMATCH}> + +This is similar to C<$`> ($PREMATCH) except that it does not incur the +performance penalty associated with that variable, and is only guaranteed +to return a defined value when the pattern was compiled or executed with +the C</k> modifier. + =item $POSTMATCH =item $' @@ -264,6 +280,14 @@ performance penalty on all regular expression matches. See L. See L for a replacement. +=item ${^POSTMATCH} +X<${^POSTMATCH}> + +This is similar to C<$'> (C<$POSTMATCH>) except that it does not incur the +performance penalty associated with that variable, and is only guaranteed +to return a defined value when the pattern was compiled or executed with +the C</k> modifier. + =item $LAST_PAREN_MATCH =item $+ diff --git a/pp_hot.c b/pp_hot.c index f1ad3ed..7167311 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -1368,7 +1368,8 @@ PP(pp_match) /* remove comment to get faster /g but possibly unsafe $1 vars after a match.
Test for the unsafe vars will fail as well*/ if (( /* !global && */ rx->nparens) - || SvTEMP(TARG) || PL_sawampersand || (pm->op_pmflags & PMf_EVAL)) + || SvTEMP(TARG) || PL_sawampersand || + (pm->op_pmflags & (PMf_EVAL|PMf_KEEPCOPY))) r_flags |= REXEC_COPY_STR; if (SvSCREAM(TARG)) r_flags |= REXEC_SCREAM; @@ -1391,6 +1392,7 @@ play_it_again: goto nope; if ( (rx->extflags & RXf_CHECK_ALL) && !PL_sawampersand + && !(pm->op_pmflags & PMf_KEEPCOPY) && ((rx->extflags & RXf_NOSCAN) || !((rx->extflags & RXf_INTUIT_TAIL) && (r_flags & REXEC_SCREAM))) @@ -1516,7 +1518,7 @@ yup: /* Confirmed by INTUIT */ rx->sublen = strend - truebase; goto gotcha; } - if (PL_sawampersand) { + if (PL_sawampersand || pm->op_pmflags & PMf_KEEPCOPY) { I32 off; #ifdef PERL_OLD_COPY_ON_WRITE if (SvIsCOW(TARG) || (SvFLAGS(TARG) & CAN_COW_MASK) == CAN_COW_FLAGS) { @@ -1547,6 +1549,8 @@ yup: /* Confirmed by INTUIT */ rx->startp[0] = s - truebase; rx->endp[0] = s - truebase + rx->minlenret; } + /* including rx->nparens in the below code seems highly suspicious. + -dmq */ rx->nparens = rx->lastparen = rx->lastcloseparen = 0; /* used by @-, @+, and $^N */ LEAVE_SCOPE(oldsave); RETPUSHYES; @@ -2152,7 +2156,7 @@ PP(pp_subst) rx = PM_GETRE(pm); } r_flags = (rx->nparens || SvTEMP(TARG) || PL_sawampersand - || (pm->op_pmflags & PMf_EVAL)) + || (pm->op_pmflags & (PMf_EVAL|PMf_KEEPCOPY)) ) ? REXEC_COPY_STR : 0; if (SvSCREAM(TARG)) r_flags |= REXEC_SCREAM; @@ -2167,6 +2171,7 @@ PP(pp_subst) /* How to do it in subst? 
*/ /* if ( (rx->extflags & RXf_CHECK_ALL) && !PL_sawampersand + && !(pm->op_pmflags & PMf_KEEPCOPY) && ((rx->extflags & RXf_NOSCAN) || !((rx->extflags & RXf_INTUIT_TAIL) && (r_flags & REXEC_SCREAM)))) diff --git a/regcomp.c b/regcomp.c index 4d139f2..d07f177 100644 --- a/regcomp.c +++ b/regcomp.c @@ -172,6 +172,7 @@ typedef struct RExC_state_t { #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) @@ -4592,8 +4593,8 @@ reStudy: ri->name_list_idx = add_data( pRExC_state, 1, "p" ); ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); } else - ri->name_list_idx = 0; #endif + ri->name_list_idx = 0; if (RExC_recurse_count) { for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { @@ -4676,12 +4677,18 @@ Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, SV *sv = usesv ? usesv : newSVpvs(""); PERL_UNUSED_ARG(flags); - if (paren == -2 && (s = rx->subbeg) && rx->startp[0] != -1) { + if (!rx->subbeg) { + sv_setsv(sv,&PL_sv_undef); + return sv; + } + else + if (paren == -2 && rx->startp[0] != -1) { /* $` */ i = rx->startp[0]; + s = rx->subbeg; } else - if (paren == -1 && rx->subbeg && rx->endp[0] != -1) { + if (paren == -1 && rx->endp[0] != -1) { /* $' */ s = rx->subbeg + rx->endp[0]; i = rx->sublen - rx->endp[0]; @@ -4694,47 +4701,43 @@ Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, /* $& $1 ... */ i = t1 - s1; s = rx->subbeg + s1; - } - - if (s) { - assert(rx->subbeg); - assert(rx->sublen >= (s - rx->subbeg) + i ); - - if (i >= 0) { - const int oldtainted = PL_tainted; - TAINT_NOT; - sv_setpvn(sv, s, i); - PL_tainted = oldtainted; - if ( (rx->extflags & RXf_CANY_SEEN) - ? 
(RX_MATCH_UTF8(rx) - && (!i || is_utf8_string((U8*)s, i))) - : (RX_MATCH_UTF8(rx)) ) - { - SvUTF8_on(sv); - } - else - SvUTF8_off(sv); - if (PL_tainting) { - if (RX_MATCH_TAINTED(rx)) { - if (SvTYPE(sv) >= SVt_PVMG) { - MAGIC* const mg = SvMAGIC(sv); - MAGIC* mgt; - PL_tainted = 1; - SvMAGIC_set(sv, mg->mg_moremagic); - SvTAINT(sv); - if ((mgt = SvMAGIC(sv))) { - mg->mg_moremagic = mgt; - SvMAGIC_set(sv, mg); - } - } else { - PL_tainted = 1; - SvTAINT(sv); + } else { + sv_setsv(sv,&PL_sv_undef); + return sv; + } + assert(rx->sublen >= (s - rx->subbeg) + i ); + if (i >= 0) { + const int oldtainted = PL_tainted; + TAINT_NOT; + sv_setpvn(sv, s, i); + PL_tainted = oldtainted; + if ( (rx->extflags & RXf_CANY_SEEN) + ? (RX_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RX_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (PL_tainting) { + if (RX_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + PL_tainted = 1; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); } - } else - SvTAINTED_off(sv); - } - } else { - sv_setsv(sv,&PL_sv_undef); + } else { + PL_tainted = 1; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); } } else { sv_setsv(sv,&PL_sv_undef); @@ -5006,8 +5009,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else if (*RExC_parse == '?') { /* (?...) */ - U32 posflags = 0, negflags = 0; - U32 *flagsp = &posflags; bool is_logical = 0; const char * const seqstart = RExC_parse; @@ -5431,13 +5432,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) vFAIL("Sequence (? 
incomplete"); break; default: - --RExC_parse; - parse_flags: /* (?i) */ - while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { + --RExC_parse; + parse_flags: /* (?i) */ + { + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ /* (?g), (?gc) and (?o) are useless here and must be globally applied -- japhy */ - - if (*RExC_parse == 'o' || *RExC_parse == 'g') { + switch (*RExC_parse) { + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + case 'o': + case 'g': if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5452,8 +5460,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ); } } - } - else if (*RExC_parse == 'c') { + break; + + case 'c': if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5465,33 +5474,45 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ); } } - } - else { pmflag(flagsp, *RExC_parse); } - - ++RExC_parse; - } - if (*RExC_parse == '-') { - flagsp = &negflags; - wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case 'k': + if (flagsp == &negflags) { + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) + vWARN(RExC_parse + 1,"Useless use of (?-k)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + if (flagsp == &negflags) + goto unknown; + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + paren = ':'; + /*FALLTHROUGH*/ + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + nextchar(pRExC_state); + if (paren != ':') { + *flagp = TRYAGAIN; + return NULL; + } else { + ret = NULL; + goto parse_rest; + } + /*NOTREACHED*/ + default: + unknown: + RExC_parse++; + vFAIL3("Sequence (%.*s...) 
not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } ++RExC_parse; - goto parse_flags; } - RExC_flags |= posflags; - RExC_flags &= ~negflags; - if (*RExC_parse == ':') { - RExC_parse++; - paren = ':'; - break; - } - unknown: - if (*RExC_parse != ')') { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - } - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; - } + }} /* one for the default block, one for the switch */ } else { /* (...) */ capturing_parens: @@ -5516,7 +5537,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else /* ! paren */ ret = NULL; - + + parse_rest: /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -8803,6 +8825,8 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) else reti->data = NULL; + reti->name_list_idx = ri->name_list_idx; + Newx(reti->offsets, 2*len+1, U32); Copy(ri->offsets, reti->offsets, 2*len+1, U32); @@ -8846,13 +8870,16 @@ Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { if (!mg->mg_ptr) { const char *fptr = "msix"; - char reflags[6]; + char reflags[7]; char ch; - int left = 0; - int right = 4; - bool need_newline = 0; - U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12); - + bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12); + bool need_newline = 0; + int left = 0; + int right = 4 + hask; + if (hask) + reflags[left++]='k'; while((ch = *fptr++)) { if(reganch & 1) { reflags[left++] = ch; @@ -8862,11 +8889,11 @@ Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { } reganch >>= 1; } - if(left != 4) { + if(hasm) { reflags[left] = '-'; - left = 5; + left = 5 + hask; } - + /* printf("[%*.7s]\n",left,reflags); */ 
mg->mg_len = re->prelen + 4 + left; /* * If /x was used, we have to worry about a regex ending with a diff --git a/regcomp.h b/regcomp.h index ad7400f..d4a5001 100644 --- a/regcomp.h +++ b/regcomp.h @@ -103,10 +103,7 @@ typedef struct regexp_paren_ofs { } regexp_paren_ofs; typedef struct regexp_internal { -#ifdef DEBUGGING int name_list_idx; /* Optional data index of an array of paren names */ -#endif - U32 *offsets; /* offset annotations 20001228 MJD data about mapping the program to the string*/ diff --git a/regexp.h b/regexp.h index c28c78e..4045fbd 100644 --- a/regexp.h +++ b/regexp.h @@ -144,11 +144,18 @@ typedef struct regexp_engine { #define RXf_PMf_SINGLELINE 0x00002000 /* /s */ #define RXf_PMf_FOLD 0x00004000 /* /i */ #define RXf_PMf_EXTENDED 0x00008000 /* /x */ +#define RXf_PMf_KEEPCOPY 0x00010000 /* /k */ /* these flags are transfered from the PMOP->op_pmflags member during compilation */ -#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED) +#define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED) +#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY) + +#define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl) \ + case 'i': *(pmfl) |= RXf_PMf_FOLD; break; \ + case 'm': *(pmfl) |= RXf_PMf_MULTILINE; break; \ + case 's': *(pmfl) |= RXf_PMf_SINGLELINE; break; \ + case 'x': *(pmfl) |= RXf_PMf_EXTENDED; break /* What we have seen */ -/* one bit here */ #define RXf_LOOKBEHIND_SEEN 0x00020000 #define RXf_EVAL_SEEN 0x00040000 #define RXf_CANY_SEEN 0x00080000 @@ -448,6 +455,7 @@ struct re_save_state { #define SAVESTACK_ALLOC_FOR_RE_SAVE_STATE \ (1 + ((sizeof(struct re_save_state) - 1) / sizeof(*PL_savestack))) + /* * Local variables: * c-indentation-style: bsd diff --git a/t/op/regexp.t b/t/op/regexp.t index cce19fc..a7cd5fc 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -125,7 +125,15 @@ EOFCODE } 
else { if (!$match || $got ne $expect) { - print "not ok $. ($study) $input => `$got', match=$match\n$code\n"; + eval { require Data::Dumper }; + if ($@) { + print "not ok $. ($study) $input => `$got', match=$match\n$code\n"; + } + else { # better diagnostics + my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; + my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; + print "not ok $. ($study) $input => `$got', match=$match\n$s\n$g\n$code\n"; + } next TEST; } } diff --git a/t/op/regexp_kmod.t b/t/op/regexp_kmod.t new file mode 100644 index 0000000..84efd83 --- /dev/null +++ b/t/op/regexp_kmod.t @@ -0,0 +1,39 @@ +#!./perl + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; + require './test.pl'; +} + +use strict; +use warnings; + +our @tests = ( + # /k Pattern PRE MATCH POST + [ 'k', "456", "123-", "456", "-789"], + [ '', "(456)", "123-", "456", "-789"], + [ '', "456", undef, undef, undef ], +); + +plan tests => 4 * @tests + 2; +my $W = ""; + +$SIG{__WARN__} = sub { $W.=join("",@_); }; +sub _u($$) { "$_[0] is ".(defined $_[1] ? "'$_[1]'" : "undef") } + +$_ = '123-456-789'; +foreach my $test (@tests) { + my ($k, $pat,$l,$m,$r) = @$test; + my $test_name = "/$pat/$k"; + my $ok = ok($k ? 
/$pat/k : /$pat/, $test_name); + SKIP: { + skip "/$pat/$k failed to match", 3 + unless $ok; + is(${^PREMATCH}, $l,_u "$test_name: ^PREMATCH",$l); + is(${^MATCH}, $m,_u "$test_name: ^MATCH",$m ); + is(${^POSTMATCH}, $r,_u "$test_name: ^POSTMATCH",$r ); + } +} +is($W,"","No warnings should be produced"); +ok(!defined ${^MATCH}, "No /k in scope so ^MATCH is undef"); diff --git a/toke.c b/toke.c index f9f0627..dcbf3d6 100644 --- a/toke.c +++ b/toke.c @@ -10763,20 +10763,16 @@ void Perl_pmflag(pTHX_ U32* pmfl, int ch) { PERL_UNUSED_CONTEXT; - if (ch == 'i') - *pmfl |= PMf_FOLD; - else if (ch == 'g') - *pmfl |= PMf_GLOBAL; - else if (ch == 'c') - *pmfl |= PMf_CONTINUE; - else if (ch == 'o') - *pmfl |= PMf_KEEP; - else if (ch == 'm') - *pmfl |= PMf_MULTILINE; - else if (ch == 's') - *pmfl |= PMf_SINGLELINE; - else if (ch == 'x') - *pmfl |= PMf_EXTENDED; + if (ch<256) { + char c = (char)ch; + switch (c) { + CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl); + case 'g': *pmfl |= PMf_GLOBAL; break; + case 'c': *pmfl |= PMf_CONTINUE; break; + case 'o': *pmfl |= PMf_KEEP; break; + case 'k': *pmfl |= PMf_KEEPCOPY; break; + } + } } STATIC char * @@ -10786,7 +10782,7 @@ S_scan_pat(pTHX_ char *start, I32 type) PMOP *pm; char *s = scan_str(start,!!PL_madskills,FALSE); const char * const valid_flags = - (const char *)((type == OP_QR) ? "iomsx" : "iogcmsx"); + (const char *)((type == OP_QR) ? "iomsxk" : "iogcmsxk"); #ifdef PERL_MAD char *modstart; #endif @@ -10887,7 +10883,7 @@ S_scan_subst(pTHX_ char *start) s++; es++; } - else if (strchr("iogcmsx", *s)) + else if (strchr("iogcmsxk", *s)) pmflag(&pm->op_pmflags,*s++); else break;