X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=e06152820c304553b6d15e8d00750513a0d9f2c2;hb=70b6a84e977a6211834eeab18dfebf4d45bd1688;hp=7d1c4746333065d3f1b8dc771c98982a6fb233c3;hpb=8fa23287293bbec887dca6d7e006e50e1a445bb2;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/regcomp.c b/regcomp.c
index 7d1c474..e061528 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2,7 +2,9 @@
  */
 
 /*
- * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
+ * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
+ *
+ *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
  */
 
 /* This file contains functions for compiling a regular expression.  See
@@ -6615,20 +6617,30 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
             | PERL_SCAN_DISALLOW_PREFIX
             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
         UV cp;
-	char string;
         len = (STRLEN)(endbrace - name - 2);
         cp = grok_hex(name + 2, &len, &fl, NULL);
         if ( len != (STRLEN)(endbrace - name - 2) ) {
             cp = 0xFFFD;
         }    
-        if (cp > 0xff)
-            RExC_utf8 = 1;
         if ( valuep ) {
+	    if (cp > 0xff) RExC_utf8 = 1;
             *valuep = cp;
             return NULL;
         }
-	string = (char)cp;
-        sv_str= newSVpvn(&string, 1);
+
+	/* Need to convert to utf8 if either: won't fit into a byte, or the re
+	 * is going to be in utf8 and the representation changes under utf8. */
+	if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
+	    U8 string[UTF8_MAXBYTES+1];
+	    U8 *tmps;
+	    RExC_utf8 = 1;
+	    tmps = uvuni_to_utf8(string, cp);
+	    sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
+	} else {    /* Otherwise, no need for utf8, can skip that step */
+	    char string;
+	    string = (char)cp;
+	    sv_str= newSVpvn(&string, 1);
+	}
     } else {
         /* fetch the charnames handler for this scope */
         HV * const table = GvHV(PL_hintgv);
@@ -6807,7 +6819,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
         Set_Node_Cur_Length(ret); /* MJD */
         RExC_parse--; 
         nextchar(pRExC_state);
-    } else {
+    } else {	/* zero length */
         ret = reg_node(pRExC_state,NOTHING);
     }
     if (!cached) {
@@ -7418,6 +7430,19 @@ tryagain:
                             I32 flags = 0;
 			    STRLEN numlen = 3;
 			    ender = grok_oct(p, &numlen, &flags, NULL);
+
+			    /* An octal above 0xff is interpreted differently
+			     * depending on if the re is in utf8 or not.  If it
+			     * is in utf8, the value will be itself, otherwise
+			     * it is interpreted as modulo 0x100.  It has been
+			     * decided to discourage the use of octal above the
+			     * single-byte range.  For now, warn only when
+			     * it ends up modulo */
+			    if (SIZE_ONLY && ender >= 0x100
+				    && ! UTF && ! PL_encoding
+				    && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) {
+				vWARNdep(p, "Use of octal value above 377 is deprecated");
+			    }
 			    p += numlen;
 			}
 			else {
@@ -7792,6 +7817,22 @@ case ANYOF_N##NAME:                                     \
     what = WORD;                                        \
     break
 
+/* 
+   We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
+   so that it is possible to override the option here without having to 
+   rebuild the entire core. as we are required to do if we change regcomp.h
+   which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
+*/
+#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
+#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#endif
+
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
+#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
+#else
+#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
+#endif
+
 /*
    parse a class specification and produce either an ANYOF node that
    matches the pattern or if the pattern matches a single char only and
@@ -8080,18 +8121,24 @@ parseit:
 		 * A similar issue a little earlier when switching on value.
 		 * --jhi */
 		switch ((I32)namedclass) {
+		
+		case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
+		case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
+		case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
+		case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
+		case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
+		case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
+		case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
+		case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
+		case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
+		case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
+#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 		case _C_C_T_(ALNUM, isALNUM(value), "Word");
-		case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
-		case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
-		case _C_C_T_(BLANK, isBLANK(value), "Blank");
-		case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
-		case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
-		case _C_C_T_(LOWER, isLOWER(value), "Lower");
-		case _C_C_T_(PRINT, isPRINT(value), "Print");
-		case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
-		case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
 		case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
-		case _C_C_T_(UPPER, isUPPER(value), "Upper");
+#else
+		case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
+		case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
+#endif		
 		case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
 		case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
 		case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
@@ -8138,7 +8185,7 @@ parseit:
 			    ANYOF_BITMAP_SET(ret, value);
 		    }
 		    yesno = '+';
-		    what = "Digit";
+		    what = POSIX_CC_UNI_NAME("Digit");
 		    break;
 		case ANYOF_NDIGIT:
 		    if (LOC)
@@ -8151,7 +8198,7 @@ parseit:
 			    ANYOF_BITMAP_SET(ret, value);
 		    }
 		    yesno = '!';
-		    what = "Digit";
+		    what = POSIX_CC_UNI_NAME("Digit");
 		    break;		
 		case ANYOF_MAX:
 		    /* this is to handle \p and \P */
@@ -8962,6 +9009,17 @@ Perl_regdump(pTHX_ const regexp *r)
 /*
 - regprop - printable representation of opcode
 */
+#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
+STMT_START { \
+        if (do_sep) {                           \
+            Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
+            if (flags & ANYOF_INVERT)           \
+                /*make sure the invert info is in each */ \
+                sv_catpvs(sv, "^");             \
+            do_sep = 0;                         \
+        }                                       \
+} STMT_END
+
 void
 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 {
@@ -9087,6 +9145,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     else if (k == ANYOF) {
 	int i, rangestart = -1;
 	const U8 flags = ANYOF_FLAGS(o);
+	int do_sep = 0;
 
 	/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
 	static const char * const anyofs[] = {
@@ -9102,8 +9161,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 	    "[:^alpha:]",
 	    "[:ascii:]",
 	    "[:^ascii:]",
-	    "[:ctrl:]",
-	    "[:^ctrl:]",
+	    "[:cntrl:]",
+	    "[:^cntrl:]",
 	    "[:graph:]",
 	    "[:^graph:]",
 	    "[:lower:]",
@@ -9129,6 +9188,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 	Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
 	if (flags & ANYOF_INVERT)
 	    sv_catpvs(sv, "^");
+	
+	/* output what the standard cp 0-255 bitmap matches */
 	for (i = 0; i <= 256; i++) {
 	    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
 		if (rangestart == -1)
@@ -9142,15 +9203,23 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 		    sv_catpvs(sv, "-");
 		    put_byte(sv, i - 1);
 		}
+		do_sep = 1;
 		rangestart = -1;
 	    }
 	}
-
+        
+        EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+        /* output any special charclass tests (used mostly under use locale) */
 	if (o->flags & ANYOF_CLASS)
 	    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
-		if (ANYOF_CLASS_TEST(o,i))
+		if (ANYOF_CLASS_TEST(o,i)) {
 		    sv_catpv(sv, anyofs[i]);
-
+		    do_sep = 1;
+		}
+        
+        EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+        
+        /* output information about the unicode matching */
 	if (flags & ANYOF_UNICODE)
 	    sv_catpvs(sv, "{unicode}");
 	else if (flags & ANYOF_UNICODE_ALL)
@@ -9163,7 +9232,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 	    if (lv) {
 		if (sw) {
 		    U8 s[UTF8_MAXBYTES_CASE+1];
-		
+
 		    for (i = 0; i <= 256; i++) { /* just the first 256 */
 			uvchr_to_utf8(s, i);