From: Rafael Garcia-Suarez Date: Tue, 16 Jun 2009 06:27:23 +0000 (+0200) Subject: Implement new regex escape \N X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=afefe6bfcf9956c77e5f9eee351e3d13be12ea3b;p=p5sagit%2Fp5-mst-13.2.git Implement new regex escape \N \N, like in Perl 6, is equivalent to . but not influenced by /s. It matches any character except \n. Note that followed by { and a non-number, \N is still a named character. --- diff --git a/embed.fnc b/embed.fnc index 68f3817..439203c 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1593,7 +1593,7 @@ Es |UV |reg_recode |const char value|NN SV **encp Es |regnode*|regpiece |NN struct RExC_state_t *pRExC_state \ |NN I32 *flagp|U32 depth Es |regnode*|reg_namedseq |NN struct RExC_state_t *pRExC_state \ - |NULLOK UV *valuep + |NULLOK UV *valuep|NULLOK I32 *flagp Es |void |reginsert |NN struct RExC_state_t *pRExC_state \ |U8 op|NN regnode *opnd|U32 depth Es |void |regtail |NN struct RExC_state_t *pRExC_state \ diff --git a/embed.h b/embed.h index e320dc5..9af17f6 100644 --- a/embed.h +++ b/embed.h @@ -3756,7 +3756,7 @@ #define reg_node(a,b) S_reg_node(aTHX_ a,b) #define reg_recode(a,b) S_reg_recode(aTHX_ a,b) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) -#define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b) +#define reg_namedseq(a,b,c) S_reg_namedseq(aTHX_ a,b,c) #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) #define reg_scan_name(a,b) S_reg_scan_name(aTHX_ a,b) diff --git a/proto.h b/proto.h index 78f17dd..285e05f 100644 --- a/proto.h +++ b/proto.h @@ -5178,7 +5178,7 @@ STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U #define PERL_ARGS_ASSERT_REGPIECE \ assert(pRExC_state); assert(flagp) -STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *pRExC_state, UV *valuep) +STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_REG_NAMEDSEQ \ assert(pRExC_state) diff --git a/regcomp.c b/regcomp.c index e061528..bc7086f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -6553,7 +6553,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* reg_namedseq(pRExC_state,UVp) This is expected to be called by a parser routine that has - recognized'\N' and needs to handle the rest. RExC_parse is + recognized '\N' and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. @@ -6567,11 +6567,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) be returned to indicate failure. (This will NOT be a valid pointer to a regnode.) - If value is null then it is assumed that we are parsing normal text + If valuep is null then it is assumed that we are parsing normal text and inserts a new EXACT node into the program containing the resolved string and returns a pointer to the new node. If the string is zerolength a NOTHING node is emitted. - + On success RExC_parse is set to the char following the endbrace. Parsing failures will generate a fatal errorvia vFAIL(...) @@ -6585,7 +6585,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) */ STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) { char * name; /* start of the content of the name */ char * endbrace; /* endbrace following the name */ @@ -6597,8 +6597,22 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) PERL_ARGS_ASSERT_REG_NAMEDSEQ; - if (*RExC_parse != '{') { - vFAIL("Missing braces on \\N{}"); + if (*RExC_parse != '{' || + (*RExC_parse == '{' && RExC_parse[1] + && strchr("0123456789", RExC_parse[1]))) + { + GET_RE_DEBUG_FLAGS_DECL; + if (valuep) + /* no bare \N in a charclass */ + vFAIL("Missing braces on \\N{}"); + GET_RE_DEBUG_FLAGS; + nextchar(pRExC_state); + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + RExC_parse--; + Set_Node_Length(ret, 1); /* MJD */ + return ret; } name = RExC_parse+1; endbrace = strchr(RExC_parse, '}'); @@ -7159,12 +7173,12 @@ tryagain: } break; case 'N': - /* Handle \N{NAME} here and not below because it can be + /* Handle \N and \N{NAME} here and not below because it can be multicharacter. join_exact() will join them up later on. Also this makes sure that things like /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq*/ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL); + ret= reg_namedseq(pRExC_state, NULL, flagp); break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: @@ -7964,7 +7978,7 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v)) { + if (reg_namedseq(pRExC_state, &v, NULL)) { goto parseit; } value= v; diff --git a/t/op/re_tests b/t/op/re_tests index f9b070d..e264198 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -31,6 +31,12 @@ ab*bc abbbbc y $+[0] 6 .{3,4} abbbbc y $& abbb .{3,4} abbbbc y $-[0] 0 .{3,4} abbbbc y $+[0] 4 +\N{1} abbbbc y $& a +\N{1} abbbbc y $-[0] 0 +\N{1} abbbbc y $+[0] 1 +\N{3,4} abbbbc y $& abbb +\N{3,4} abbbbc y $-[0] 0 +\N{3,4} abbbbc y $+[0] 4 ab{0,}bc abbbbc y $& abbbbc ab{0,}bc abbbbc y $-[0] 0 ab{0,}bc abbbbc y $+[0] 6 @@ -69,8 +75,10 @@ abc$ aabcd n - - $ abc y $& a.c abc y $& abc a.c axc y $& axc +a\Nc abc y $& abc a.*c axyzc y $& axyzc a.*c axyzd n - - +a\N*c axyzd n - - a[bc]d abc n - - a[bc]d abd y $& abd a[b]d abd y $& abd @@ -78,6 +86,7 @@ a[b]d abd y $& abd .[b]. abd y $& abd .[b]. aBd n - - (?i:.[b].) abd y $& abd +(?i:\N[b]\N) abd y $& abd a[b-d]e abd n - - a[b-d]e ace y $& ace a[b-d] aac y $& ac @@ -315,6 +324,7 @@ a[-]?c ac y $& ac '$'i ABC y $& 'a.c'i ABC y $& ABC 'a.c'i AXC y $& AXC +'a\Nc'i ABC y $& ABC 'a.*?c'i AXYZC y $& AXYZC 'a.*c'i AXYZD n - - 'a[bc]d'i ABC n - - @@ -497,8 +507,11 @@ a(?:b|(c|e){1,2}?|d)+?(.) ace y $1$2 ce '(?-i:a)b'i AB n - - '((?-i:a))b'i AB n - - '((?-i:a.))b'i a\nB n - - +'((?-i:a\N))b'i a\nB n - - '((?s-i:a.))b'i a\nB y $1 a\n +'((?s-i:a\N))b'i a\nB n - - '((?s-i:a.))b'i B\nB n - - +'((?s-i:a\N))b'i B\nB n - - (?:c|d)(?:)(?:a(?:)(?:b)(?:b(?:))(?:b(?:)(?:b))) cabbbb y $& cabbbb (?:c|d)(?:)(?:aaaaaaaa(?:)(?:bbbbbbbb)(?:bbbbbbbb(?:))(?:bbbbbbbb(?:)(?:bbbbbbbb))) caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb y $& caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb '(ab)\d\1'i Ab4ab y $1 Ab