X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regexec.c;h=709eef238f4b53e23f4bd37713476d63dd30d09d;hb=a365f2ce4defc0d7fecd4e9484f8f958454c9192;hp=1eb7ff28593342f769fdf67d1b31317cdb125bab;hpb=c737faaf63999e5a68ef536d362cea408ab990b7;p=p5sagit%2Fp5-mst-13.2.git

diff --git a/regexec.c b/regexec.c
index 1eb7ff2..709eef2 100644
--- a/regexec.c
+++ b/regexec.c
@@ -285,9 +285,8 @@ S_regcppop(pTHX_ const regexp *rex)
      * requiring null fields (pat.t#187 and split.t#{13,14}
      * (as of patchlevel 7877)  will fail.  Then again,
      * this code seems to be necessary or otherwise
-     * building DynaLoader will fail:
-     * "Error: '*' not in typemap in DynaLoader.xs, line 164"
-     * --jhi */
+     * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
+     * --jhi updated by dapm */
     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 	if (i > PL_regsize)
 	    PL_regoffs[i].start = -1;
@@ -308,7 +307,7 @@ S_regcppop(pTHX_ const regexp *rex)
  - pregexec - match a regexp against a string
  */
 I32
-Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 	 char *strbeg, I32 minend, SV *screamer, U32 nosave)
 /* strend: pointer to null at end of string */
 /* strbeg: real beginning of string */
@@ -372,8 +371,8 @@ Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *stren
    deleted from the finite automaton. */
 
 char *
-Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
-		     char *strend, U32 flags, re_scream_pos_data *data)
+Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
+		     char *strend, const U32 flags, re_scream_pos_data *data)
 {
     dVAR;
     register I32 start_shift = 0;
@@ -1110,6 +1109,15 @@ REXEC_FBC_SCAN(                                       \
 if ((!reginfo || regtry(reginfo, &s))) \
     goto got_it
 
+#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
+    if (do_utf8) {  /* scan using the UTF-8 condition */       \
+	REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
+    }                                                          \
+    else {          /* scan using the non-UTF-8 condition */   \
+	REXEC_FBC_CLASS_SCAN(CoNd);                            \
+    }                                                          \
+    break  /* leaves the enclosing case; the user supplies the ';' */
+    
 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
     if (do_utf8) {                                             \
 	UtFpReLoAd;                                            \
@@ -1425,6 +1433,31 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		!isDIGIT_LC_utf8((U8*)s),
 		!isDIGIT_LC(*s)
 	    );
+	case LNBREAK:
+	    REXEC_FBC_CSCAN(
+		is_LNBREAK_utf8(s),
+		is_LNBREAK_latin1(s)
+	    );
+	case VERTWS:
+	    REXEC_FBC_CSCAN(
+		is_VERTWS_utf8(s),
+		is_VERTWS_latin1(s)
+	    );
+	case NVERTWS:
+	    REXEC_FBC_CSCAN(
+		!is_VERTWS_utf8(s),
+		!is_VERTWS_latin1(s)
+	    );
+	case HORIZWS:
+	    REXEC_FBC_CSCAN(
+		is_HORIZWS_utf8(s),
+		is_HORIZWS_latin1(s)
+	    );
+	case NHORIZWS:
+	    REXEC_FBC_CSCAN(
+		!is_HORIZWS_utf8(s),
+		!is_HORIZWS_latin1(s)
+	    );	    
 	case AHOCORASICKC:
 	case AHOCORASICK: 
 	    {
@@ -1448,8 +1481,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 		U8 **points; /* map of where we were in the input string
 		                when reading a given char. For ASCII this
 		                is unnecessary overhead as the relationship
-		                is always 1:1, but for unicode, especially
-		                case folded unicode this is not true. */
+		                is always 1:1, but for Unicode, especially
+		                case folded Unicode this is not true. */
 		U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
 		U8 *bitmap=NULL;
 
@@ -1671,7 +1704,7 @@ S_swap_match_buff (pTHX_ regexp *prog) {
  - regexec_flags - match a regexp against a string
  */
 I32
-Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend,
 	      char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
 /* strend: pointer to null at end of string */
 /* strbeg: real beginning of string */
@@ -1811,7 +1844,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
 		    if (regtry(&reginfo, &s))
 			goto got_it;
 		  after_try:
-		    if (s >= end)
+		    if (s > end)
 			goto phooey;
 		    if (prog->extflags & RXf_USE_INTUIT) {
 			s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
@@ -2233,13 +2266,12 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
      * Actually, the code in regcppop() (which Ilya may be meaning by
      * PL_reglastparen), is not needed at all by the test suite
-     * (op/regexp, op/pat, op/split), but that code is needed, oddly
-     * enough, for building DynaLoader, or otherwise this
-     * "Error: '*' not in typemap in DynaLoader.xs, line 164"
-     * will happen.  Meanwhile, this code *is* needed for the
+     * (op/regexp, op/pat, op/split), but that code is needed otherwise
+     * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
+     * Meanwhile, this code *is* needed for the
      * above-mentioned test suite tests to succeed.  The common theme
      * on those tests seems to be returning null fields from matches.
-     * --jhi */
+     * --jhi updated by dapm */
 #if 1
     if (prog->nparens) {
 	regexp_paren_pair *pp = PL_regoffs;
@@ -3207,8 +3239,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 		      * pack("U0U*", 0xDF) =~ /ss/i,
 		      * the 0xC3 0x9F are the UTF-8
 		      * byte sequence for the U+00DF. */
+
 		     if (!(do_utf8 &&
-			   toLOWER(s[0]) == 's' &&
+		           toLOWER(s[0]) == 's' &&
 			   ln >= 2 &&
 			   toLOWER(s[1]) == 's' &&
 			   (U8)l[0] == 0xC3 &&
@@ -3663,13 +3696,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 			re = reg_temp_copy((regexp *)mg->mg_obj); /*XXX:dmq*/
 		    }
 		    else {
-			STRLEN len;
-			const char * const t = SvPV_const(ret, len);
 			U32 pm_flags = 0;
 			const I32 osize = PL_regsize;
 
 			if (DO_UTF8(ret)) pm_flags |= RXf_UTF8;
-			re = CALLREGCOMP((char*)t, (char*)t + len, pm_flags);
+			re = CALLREGCOMP(ret, pm_flags);
 			if (!(SvFLAGS(ret)
 			      & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
 				| SVs_GMG)))
@@ -4188,12 +4219,6 @@ NULL
 
 	case BRANCH:	    /*  /(...|A|...)/ */
 	    scan = NEXTOPER(scan); /* scan now points to inner node */
-	    if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ)) 
-	        && !has_cutgroup)
-	    {
-	    	/* last branch; skip state push and jump direct to node */
-		continue;
-            }
 	    ST.lastparen = *PL_reglastparen;
 	    ST.next_branch = next;
 	    REGCP_SET(ST.cp);
@@ -4971,6 +4996,55 @@ NULL
             sayNO;
             /* NOTREACHED */
 #undef ST
+        case FOLDCHAR:
+            n = ARG(scan);
+            if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
+                locinput += ln;
+            } else if ( 0xDF == n && !do_utf8 && !UTF ) {
+                sayNO;
+            } else  {
+                U8 folded[UTF8_MAXBYTES_CASE+1];
+                STRLEN foldlen;
+                const char * const l = locinput;
+                char *e = PL_regeol;
+                to_uni_fold(n, folded, &foldlen);
+
+		if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
+                	       l, &e, 0,  do_utf8)) {
+                        sayNO;
+                }
+                locinput = e;
+            } 
+            nextchr = UCHARAT(locinput);  
+            break;
+        case LNBREAK:
+            if ((n=is_LNBREAK(locinput,do_utf8))) {
+                locinput += n;
+                nextchr = UCHARAT(locinput);
+            } else
+                sayNO;
+            break;
+
+#define CASE_CLASS(nAmE) /* emits 'case nAmE' and 'case N##nAmE' */ \
+        case nAmE:  /* n = bytes matched by is_##nAmE, 0 if none */ \
+            if ((n=is_##nAmE(locinput,do_utf8))) {    \
+                locinput += n;                        \
+                nextchr = UCHARAT(locinput);          \
+            } else                                    \
+                sayNO;                                \
+            break;                                    \
+        case N##nAmE: /* needs one char that is not nAmE; fails at end of input */ \
+            if (locinput >= PL_regeol || (n=is_##nAmE(locinput,do_utf8))) { \
+                sayNO;                                \
+            } else {                                  \
+                locinput += do_utf8 ? UTF8SKIP(locinput) : 1; /* 1 byte if not UTF-8 */ \
+                nextchr = UCHARAT(locinput);          \
+            }                                         \
+            break
+
+        CASE_CLASS(VERTWS);
+        CASE_CLASS(HORIZWS);
+#undef CASE_CLASS
 
 	default:
 	    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
@@ -5382,7 +5456,78 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 	    while (scan < loceol && !isDIGIT(*scan))
 		scan++;
 	}
+	break;	/* end the preceding case here rather than running on into LNBREAK */
+    case LNBREAK:	/* each match may span several bytes, so count in hardcount */
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
+		scan += c;
+		hardcount++;
+	    }
+	} else {
+	    /*
+	      LNBREAK can match two Latin-1 chars at once; looking
+	      ahead is ok because the string is null terminated, but
+	      bytes != matches, so the matches are counted in hardcount
+	    */
+	    while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
+		scan+=c;
+		hardcount++;
+	    }
+	}	
 	break;
+    case HORIZWS:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
+		scan += c;
+		hardcount++;
+	    }
+	} else {
+	    while (scan < loceol && is_HORIZWS_latin1(scan)) 
+		scan++;		
+	}	
+	break;
+    case NHORIZWS:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
+		scan += UTF8SKIP(scan);
+		hardcount++;
+	    }
+	} else {
+	    while (scan < loceol && !is_HORIZWS_latin1(scan))
+		scan++;
+
+	}	
+	break;
+    case VERTWS:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
+		scan += c;
+		hardcount++;
+	    }
+	} else {
+	    while (scan < loceol && is_VERTWS_latin1(scan)) 
+		scan++;
+
+	}	
+	break;
+    case NVERTWS:
+        if (do_utf8) {
+	    loceol = PL_regeol;
+	    while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
+		scan += UTF8SKIP(scan);
+		hardcount++;
+	    }
+	} else {
+	    while (scan < loceol && !is_VERTWS_latin1(scan)) 
+		scan++;
+          
+	}	
+	break;
+
     default:		/* Called on something of 0 width. */
 	break;		/* So match right here or not at all. */
     }