From: Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
Date: Tue, 16 Jun 2009 06:27:23 +0000 (+0200)
Subject: Implement new regex escape \N
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=afefe6bfcf9956c77e5f9eee351e3d13be12ea3b;p=p5sagit%2Fp5-mst-13.2.git

Implement new regex escape \N

\N, like in Perl 6, is equivalent to . but not influenced by /s.
It matches any character except \n. Note that followed by { and
a non-number, \N is still a named character.
---

diff --git a/embed.fnc b/embed.fnc
index 68f3817..439203c 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1593,7 +1593,7 @@ Es	|UV	|reg_recode	|const char value|NN SV **encp
 Es	|regnode*|regpiece	|NN struct RExC_state_t *pRExC_state \
 				|NN I32 *flagp|U32 depth
 Es	|regnode*|reg_namedseq	|NN struct RExC_state_t *pRExC_state \
-				|NULLOK UV *valuep
+				|NULLOK UV *valuep|NULLOK I32 *flagp
 Es	|void	|reginsert	|NN struct RExC_state_t *pRExC_state \
 				|U8 op|NN regnode *opnd|U32 depth
 Es	|void	|regtail	|NN struct RExC_state_t *pRExC_state \
diff --git a/embed.h b/embed.h
index e320dc5..9af17f6 100644
--- a/embed.h
+++ b/embed.h
@@ -3756,7 +3756,7 @@
 #define reg_node(a,b)		S_reg_node(aTHX_ a,b)
 #define reg_recode(a,b)		S_reg_recode(aTHX_ a,b)
 #define regpiece(a,b,c)		S_regpiece(aTHX_ a,b,c)
-#define reg_namedseq(a,b)	S_reg_namedseq(aTHX_ a,b)
+#define reg_namedseq(a,b,c)	S_reg_namedseq(aTHX_ a,b,c)
 #define reginsert(a,b,c,d)	S_reginsert(aTHX_ a,b,c,d)
 #define regtail(a,b,c,d)	S_regtail(aTHX_ a,b,c,d)
 #define reg_scan_name(a,b)	S_reg_scan_name(aTHX_ a,b)
diff --git a/proto.h b/proto.h
index 78f17dd..285e05f 100644
--- a/proto.h
+++ b/proto.h
@@ -5178,7 +5178,7 @@ STATIC regnode*	S_regpiece(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U
 #define PERL_ARGS_ASSERT_REGPIECE	\
 	assert(pRExC_state); assert(flagp)
 
-STATIC regnode*	S_reg_namedseq(pTHX_ struct RExC_state_t *pRExC_state, UV *valuep)
+STATIC regnode*	S_reg_namedseq(pTHX_ struct RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
 			__attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_REG_NAMEDSEQ	\
 	assert(pRExC_state)
diff --git a/regcomp.c b/regcomp.c
index e061528..bc7086f 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6553,7 +6553,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 /* reg_namedseq(pRExC_state,UVp)
    
    This is expected to be called by a parser routine that has 
-   recognized'\N' and needs to handle the rest. RExC_parse is 
+   recognized '\N' and needs to handle the rest. RExC_parse is
    expected to point at the first char following the N at the time
    of the call.
    
@@ -6567,11 +6567,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
    be returned to indicate failure. (This will NOT be a valid pointer 
    to a regnode.)
    
-   If value is null then it is assumed that we are parsing normal text
+   If valuep is null then it is assumed that we are parsing normal text
    and inserts a new EXACT node into the program containing the resolved
    string and returns a pointer to the new node. If the string is 
    zerolength a NOTHING node is emitted.
-   
+
    On success RExC_parse is set to the char following the endbrace.
    Parsing failures will generate a fatal errorvia vFAIL(...)
    
@@ -6585,7 +6585,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
    
  */
 STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) 
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
 {
     char * name;        /* start of the content of the name */
     char * endbrace;    /* endbrace following the name */
@@ -6597,8 +6597,22 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
  
     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
    
-    if (*RExC_parse != '{') {
-        vFAIL("Missing braces on \\N{}");
+    if (*RExC_parse != '{' ||
+	    (*RExC_parse == '{' && RExC_parse[1]
+	     && strchr("0123456789", RExC_parse[1])))
+    {
+	GET_RE_DEBUG_FLAGS_DECL;
+	if (valuep)
+	    /* no bare \N in a charclass */
+	    vFAIL("Missing braces on \\N{}");
+	GET_RE_DEBUG_FLAGS;
+	nextchar(pRExC_state);
+	ret = reg_node(pRExC_state, REG_ANY);
+	*flagp |= HASWIDTH|SIMPLE;
+	RExC_naughty++;
+	RExC_parse--;
+        Set_Node_Length(ret, 1); /* MJD */
+	return ret;
     }
     name = RExC_parse+1;
     endbrace = strchr(RExC_parse, '}');
@@ -7159,12 +7173,12 @@ tryagain:
 	    }
 	    break;
         case 'N': 
-            /* Handle \N{NAME} here and not below because it can be 
+            /* Handle \N and \N{NAME} here and not below because it can be
             multicharacter. join_exact() will join them up later on. 
             Also this makes sure that things like /\N{BLAH}+/ and 
             \N{BLAH} being multi char Just Happen. dmq*/
             ++RExC_parse;
-            ret= reg_namedseq(pRExC_state, NULL); 
+            ret= reg_namedseq(pRExC_state, NULL, flagp); 
             break;
 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
 	parse_named_seq:
@@ -7964,7 +7978,7 @@ parseit:
                     from earlier versions, OTOH that behaviour was broken
                     as well. */
                     UV v; /* value is register so we cant & it /grrr */
-                    if (reg_namedseq(pRExC_state, &v)) {
+                    if (reg_namedseq(pRExC_state, &v, NULL)) {
                         goto parseit;
                     }
                     value= v; 
diff --git a/t/op/re_tests b/t/op/re_tests
index f9b070d..e264198 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -31,6 +31,12 @@ ab*bc	abbbbc	y	$+[0]	6
 .{3,4}	abbbbc	y	$&	abbb
 .{3,4}	abbbbc	y	$-[0]	0
 .{3,4}	abbbbc	y	$+[0]	4
+\N{1}	abbbbc	y	$&	a
+\N{1}	abbbbc	y	$-[0]	0
+\N{1}	abbbbc	y	$+[0]	1
+\N{3,4}	abbbbc	y	$&	abbb
+\N{3,4}	abbbbc	y	$-[0]	0
+\N{3,4}	abbbbc	y	$+[0]	4
 ab{0,}bc	abbbbc	y	$&	abbbbc
 ab{0,}bc	abbbbc	y	$-[0]	0
 ab{0,}bc	abbbbc	y	$+[0]	6
@@ -69,8 +75,10 @@ abc$	aabcd	n	-	-
 $	abc	y	$&	
 a.c	abc	y	$&	abc
 a.c	axc	y	$&	axc
+a\Nc	abc	y	$&	abc
 a.*c	axyzc	y	$&	axyzc
 a.*c	axyzd	n	-	-
+a\N*c	axyzd	n	-	-
 a[bc]d	abc	n	-	-
 a[bc]d	abd	y	$&	abd
 a[b]d	abd	y	$&	abd
@@ -78,6 +86,7 @@ a[b]d	abd	y	$&	abd
 .[b].	abd	y	$&	abd
 .[b].	aBd	n	-	-
 (?i:.[b].)	abd	y	$&	abd
+(?i:\N[b]\N)	abd	y	$&	abd
 a[b-d]e	abd	n	-	-
 a[b-d]e	ace	y	$&	ace
 a[b-d]	aac	y	$&	ac
@@ -315,6 +324,7 @@ a[-]?c	ac	y	$&	ac
 '$'i	ABC	y	$&	
 'a.c'i	ABC	y	$&	ABC
 'a.c'i	AXC	y	$&	AXC
+'a\Nc'i	ABC	y	$&	ABC
 'a.*?c'i	AXYZC	y	$&	AXYZC
 'a.*c'i	AXYZD	n	-	-
 'a[bc]d'i	ABC	n	-	-
@@ -497,8 +507,11 @@ a(?:b|(c|e){1,2}?|d)+?(.)	ace	y	$1$2	ce
 '(?-i:a)b'i	AB	n	-	-
 '((?-i:a))b'i	AB	n	-	-
 '((?-i:a.))b'i	a\nB	n	-	-
+'((?-i:a\N))b'i	a\nB	n	-	-
 '((?s-i:a.))b'i	a\nB	y	$1	a\n
+'((?s-i:a\N))b'i	a\nB	n	-	-
 '((?s-i:a.))b'i	B\nB	n	-	-
+'((?s-i:a\N))b'i	B\nB	n	-	-
 (?:c|d)(?:)(?:a(?:)(?:b)(?:b(?:))(?:b(?:)(?:b)))	cabbbb	y	$&	cabbbb
 (?:c|d)(?:)(?:aaaaaaaa(?:)(?:bbbbbbbb)(?:bbbbbbbb(?:))(?:bbbbbbbb(?:)(?:bbbbbbbb)))	caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb	y	$&	caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
 '(ab)\d\1'i	Ab4ab	y	$1	Ab