X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=7e80041d0bf9da8d2b8267f8a9d740c606dfccc2;hb=55ec0dff636c2a8ee5225314d7d46f928ab7f6da;hp=016e099f58a01803569163783271401009df7d71;hpb=3339dfd8efc84c7c6b921a21d306e6133e406599;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/regcomp.c b/regcomp.c
index 016e099..7e80041 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6138,6 +6138,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     /* Pick up the branches, linking them together. */
     parse_start = RExC_parse;   /* MJD */
     br = regbranch(pRExC_state, &flags, 1,depth+1);
+
+    if (freeze_paren) {
+        if (RExC_npar > after_freeze)
+            after_freeze = RExC_npar;
+        RExC_npar = freeze_paren;
+    }
+
     /*     branch_len = (paren != 0); */
 
     if (br == NULL)
@@ -6553,7 +6560,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 /* reg_namedseq(pRExC_state,UVp)
    
    This is expected to be called by a parser routine that has 
-   recognized'\N' and needs to handle the rest. RExC_parse is 
+   recognized '\N' and needs to handle the rest. RExC_parse is
    expected to point at the first char following the N at the time
    of the call.
    
@@ -6567,11 +6574,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
    be returned to indicate failure. (This will NOT be a valid pointer 
    to a regnode.)
    
-   If value is null then it is assumed that we are parsing normal text
+   If valuep is null then it is assumed that we are parsing normal text
    and inserts a new EXACT node into the program containing the resolved
    string and returns a pointer to the new node. If the string is 
    zerolength a NOTHING node is emitted.
-   
+
    On success RExC_parse is set to the char following the endbrace.
    Parsing failures will generate a fatal errorvia vFAIL(...)
    
@@ -6585,7 +6592,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
    
  */
 STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) 
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
 {
     char * name;        /* start of the content of the name */
     char * endbrace;    /* endbrace following the name */
@@ -6597,8 +6604,22 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
  
     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
    
-    if (*RExC_parse != '{') {
-        vFAIL("Missing braces on \\N{}");
+    if (*RExC_parse != '{' ||
+	    (*RExC_parse == '{' && RExC_parse[1]
+	     && strchr("0123456789", RExC_parse[1])))
+    {
+	GET_RE_DEBUG_FLAGS_DECL;
+	if (valuep)
+	    /* no bare \N in a charclass */
+	    vFAIL("Missing braces on \\N{}");
+	GET_RE_DEBUG_FLAGS;
+	nextchar(pRExC_state);
+	ret = reg_node(pRExC_state, REG_ANY);
+	*flagp |= HASWIDTH|SIMPLE;
+	RExC_naughty++;
+	RExC_parse--;
+        Set_Node_Length(ret, 1); /* MJD */
+	return ret;
     }
     name = RExC_parse+1;
     endbrace = strchr(RExC_parse, '}');
@@ -7159,12 +7180,12 @@ tryagain:
 	    }
 	    break;
         case 'N': 
-            /* Handle \N{NAME} here and not below because it can be 
+            /* Handle \N and \N{NAME} here and not below because it can be
             multicharacter. join_exact() will join them up later on. 
             Also this makes sure that things like /\N{BLAH}+/ and 
             \N{BLAH} being multi char Just Happen. dmq*/
             ++RExC_parse;
-            ret= reg_namedseq(pRExC_state, NULL); 
+            ret= reg_namedseq(pRExC_state, NULL, flagp); 
             break;
 	case 'k':    /* Handle \k<NAME> and \k'NAME' */
 	parse_named_seq:
@@ -7430,6 +7451,19 @@ tryagain:
                             I32 flags = 0;
 			    STRLEN numlen = 3;
 			    ender = grok_oct(p, &numlen, &flags, NULL);
+
+			    /* An octal above 0xff is interpreted differently
+			     * depending on if the re is in utf8 or not.  If it
+			     * is in utf8, the value will be itself, otherwise
+			     * it is interpreted as modulo 0x100.  It has been
+			     * decided to discourage the use of octal above the
+			     * single-byte range.  For now, warn only when
+			     * it ends up modulo */
+			    if (SIZE_ONLY && ender >= 0x100
+				    && ! UTF && ! PL_encoding
+				    && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) {
+				vWARNdep(p, "Use of octal value above 377 is deprecated");
+			    }
 			    p += numlen;
 			}
 			else {
@@ -7804,6 +7838,22 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* 
+   We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+   so that it is possible to override the option here without having to 
+   rebuild the entire core. as we are required to do if we change regcomp.h
+   which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
 /*
    parse a class specification and produce either an ANYOF node that
    matches the pattern or if the pattern matches a single char only and
@@ -7935,7 +7985,7 @@ parseit:
                     from earlier versions, OTOH that behaviour was broken
                     as well. */
                     UV v; /* value is register so we cant & it /grrr */
-                    if (reg_namedseq(pRExC_state, &v)) {
+                    if (reg_namedseq(pRExC_state, &v, NULL)) {
                         goto parseit;
                     }
                     value= v; 
@@ -8092,18 +8142,24 @@ parseit:
 		 * A similar issue a little earlier when switching on value.
 		 * --jhi */
 		switch ((I32)namedclass) {
+		
+		case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+		case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+		case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+		case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+		case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+		case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+		case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+		case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+		case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+		case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 		case _C_C_T_(ALNUM, isALNUM(value), "Word");
-		case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
-		case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
-		case _C_C_T_(BLANK, isBLANK(value), "Blank");
-		case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
-		case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
-		case _C_C_T_(LOWER, isLOWER(value), "Lower");
-		case _C_C_T_(PRINT, isPRINT(value), "Print");
-		case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
-		case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
 		case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
-		case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+		case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+		case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif		
 		case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
 		case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
 		case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
@@ -8150,7 +8206,7 @@ parseit:
 			    ANYOF_BITMAP_SET(ret, value);
 		    }
 		    yesno = '+';
-		    what = "Digit";
+		    what = POSIX_CC_UNI_NAME("Digit");
 		    break;
 		case ANYOF_NDIGIT:
 		    if (LOC)
@@ -8163,7 +8219,7 @@ parseit:
 			    ANYOF_BITMAP_SET(ret, value);
 		    }
 		    yesno = '!';
-		    what = "Digit";
+		    what = POSIX_CC_UNI_NAME("Digit");
 		    break;		
 		case ANYOF_MAX:
 		    /* this is to handle \p and \P */