From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Sat, 10 Mar 2001 21:38:30 +0000 (+0000)
Subject: Fix for ID 20010306.008, UTF-8 and \w without 'use utf8' coredump.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=8269fa76d2972b02e844f46a88d03e7d25fb51d7;p=p5sagit%2Fp5-mst-13.2.git

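Fix for bug ID 20010306.008: matching \w against UTF-8 data without
'use utf8' in effect caused a core dump.

The UTF-8 character class tables (alnum, space, digit, mark) are no
longer preloaded in regcomp.c when the pattern is compiled under UTF;
regexec.c now loads each table on demand, via the new
LOAD_UTF8_CHARCLASS() macro, just before the first swash_fetch() that
needs it.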

p4raw-id: //depot/perl@9098
---

diff --git a/regcomp.c b/regcomp.c
index 2e5aaf3..227737c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2648,22 +2648,16 @@ tryagain:
 	    ret = reg_node(pRExC_state, CLUMP);
 	    *flagp |= HASWIDTH;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_mark)
-		is_utf8_mark((U8*)"~");		/* preload table */
 	    break;
 	case 'w':
 	    ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_alnum)
-		is_utf8_alnum((U8*)"a");	/* preload table */
 	    break;
 	case 'W':
 	    ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_alnum)
-		is_utf8_alnum((U8*)"a");	/* preload table */
 	    break;
 	case 'b':
 	    RExC_seen_zerolen++;
@@ -2671,8 +2665,6 @@ tryagain:
 	    ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
 	    *flagp |= SIMPLE;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_alnum)
-		is_utf8_alnum((U8*)"a");	/* preload table */
 	    break;
 	case 'B':
 	    RExC_seen_zerolen++;
@@ -2680,36 +2672,26 @@ tryagain:
 	    ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
 	    *flagp |= SIMPLE;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_alnum)
-		is_utf8_alnum((U8*)"a");	/* preload table */
 	    break;
 	case 's':
 	    ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_space)
-		is_utf8_space((U8*)" ");	/* preload table */
 	    break;
 	case 'S':
 	    ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_space)
-		is_utf8_space((U8*)" ");	/* preload table */
 	    break;
 	case 'd':
 	    ret = reg_node(pRExC_state, DIGIT);
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_digit)
-		is_utf8_digit((U8*)"1");	/* preload table */
 	    break;
 	case 'D':
 	    ret = reg_node(pRExC_state, NDIGIT);
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar(pRExC_state);
-	    if (UTF && !PL_utf8_digit)
-		is_utf8_digit((U8*)"1");	/* preload table */
 	    break;
 	case 'p':
 	case 'P':
diff --git a/regexec.c b/regexec.c
index 1fa26c9..a7b6411 100644
--- a/regexec.c
+++ b/regexec.c
@@ -123,8 +123,9 @@
 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
 
-static void restore_pos(pTHXo_ void *arg);
+#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END /* if PL_utf8_<a> is not yet set, call is_utf8_<a>() on the sample string b to load that table; the result is discarded */
 
+static void restore_pos(pTHXo_ void *arg);
 
 STATIC CHECKPOINT
 S_regcppush(pTHX_ I32 parenfloor)
@@ -953,6 +954,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		}
 		tmp = ((OP(c) == BOUND ?
 			isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
+		LOAD_UTF8_CHARCLASS(alnum,"a");
 		while (s < strend) {
 		    if (tmp == !(OP(c) == BOUND ?
 				 swash_fetch(PL_utf8_alnum, (U8*)s) :
@@ -995,6 +997,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		}
 		tmp = ((OP(c) == NBOUND ?
 			isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
+		LOAD_UTF8_CHARCLASS(alnum,"a");
 		while (s < strend) {
 		    if (tmp == !(OP(c) == NBOUND ?
 				 swash_fetch(PL_utf8_alnum, (U8*)s) :
@@ -1023,6 +1026,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    break;
 	case ALNUM:
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(alnum,"a");
 		while (s < strend) {
 		    if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
@@ -1080,6 +1084,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    break;
 	case NALNUM:
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(alnum,"a");
 		while (s < strend) {
 		    if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
@@ -1137,6 +1142,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    break;
 	case SPACE:
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(space," ");
 		while (s < strend) {
 		    if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
@@ -1194,6 +1200,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    break;
 	case NSPACE:
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(space," ");
 		while (s < strend) {
 		    if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
 			if (tmp && (norun || regtry(prog, s)))
@@ -1251,6 +1258,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    break;
 	case DIGIT:
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(digit,"0");
 		while (s < strend) {
 		    if (swash_fetch(PL_utf8_digit,(U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
@@ -1308,6 +1316,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    break;
 	case NDIGIT:
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(digit,"0");
 		while (s < strend) {
 		    if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
 			if (tmp && (norun || regtry(prog, s)))
@@ -2225,6 +2234,7 @@ S_regmatch(pTHX_ regnode *prog)
 	    if (!nextchr && locinput >= PL_regeol)
 		sayNO;
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(alnum,"a");
 		if (OP(scan) == NALNUM
 		    ? swash_fetch(PL_utf8_alnum, (U8*)locinput)
 		    : isALNUM_LC_utf8((U8*)locinput))
@@ -2257,6 +2267,7 @@ S_regmatch(pTHX_ regnode *prog)
 		}
 		if (OP(scan) == BOUND || OP(scan) == NBOUND) {
 		    ln = isALNUM_uni(ln);
+		    LOAD_UTF8_CHARCLASS(alnum,"a");
 		    n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
 		}
 		else {
@@ -2288,6 +2299,7 @@ S_regmatch(pTHX_ regnode *prog)
 		sayNO;
 	    if (do_utf8) {
 		if (UTF8_IS_CONTINUED(nextchr)) {
+		    LOAD_UTF8_CHARCLASS(space," ");
 		    if (!(OP(scan) == SPACE
 			  ? swash_fetch(PL_utf8_space, (U8*)locinput)
 			  : isSPACE_LC_utf8((U8*)locinput)))
@@ -2317,6 +2329,7 @@ S_regmatch(pTHX_ regnode *prog)
 	    if (!nextchr && locinput >= PL_regeol)
 		sayNO;
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(space," ");
 		if (OP(scan) == NSPACE
 		    ? swash_fetch(PL_utf8_space, (U8*)locinput)
 		    : isSPACE_LC_utf8((U8*)locinput))
@@ -2339,6 +2352,7 @@ S_regmatch(pTHX_ regnode *prog)
 	    if (!nextchr)
 		sayNO;
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(digit,"0");
 		if (!(OP(scan) == DIGIT
 		      ? swash_fetch(PL_utf8_digit, (U8*)locinput)
 		      : isDIGIT_LC_utf8((U8*)locinput)))
@@ -2361,6 +2375,7 @@ S_regmatch(pTHX_ regnode *prog)
 	    if (!nextchr && locinput >= PL_regeol)
 		sayNO;
 	    if (do_utf8) {
+		LOAD_UTF8_CHARCLASS(digit,"0");
 		if (OP(scan) == NDIGIT
 		    ? swash_fetch(PL_utf8_digit, (U8*)locinput)
 		    : isDIGIT_LC_utf8((U8*)locinput))
@@ -2377,6 +2392,7 @@ S_regmatch(pTHX_ regnode *prog)
 	    nextchr = UCHARAT(++locinput);
 	    break;
 	case CLUMP:
+	    LOAD_UTF8_CHARCLASS(mark,"~");
 	    if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput))
 		sayNO;
 	    locinput += PL_utf8skip[nextchr];
@@ -3598,6 +3614,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     case ALNUM:
 	if (do_utf8) {
 	    loceol = PL_regeol;
+	    LOAD_UTF8_CHARCLASS(alnum,"a");
 	    while (hardcount < max && scan < loceol &&
 		   swash_fetch(PL_utf8_alnum, (U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -3625,6 +3642,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     case NALNUM:
 	if (do_utf8) {
 	    loceol = PL_regeol;
+	    LOAD_UTF8_CHARCLASS(alnum,"a");
 	    while (hardcount < max && scan < loceol &&
 		   !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -3652,6 +3670,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     case SPACE:
 	if (do_utf8) {
 	    loceol = PL_regeol;
+	    LOAD_UTF8_CHARCLASS(space," ");
 	    while (hardcount < max && scan < loceol &&
 		   (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
 		scan += UTF8SKIP(scan);
@@ -3679,6 +3698,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     case NSPACE:
 	if (do_utf8) {
 	    loceol = PL_regeol;
+	    LOAD_UTF8_CHARCLASS(space," ");
 	    while (hardcount < max && scan < loceol &&
 		   !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
 		scan += UTF8SKIP(scan);
@@ -3706,6 +3726,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     case DIGIT:
 	if (do_utf8) {
 	    loceol = PL_regeol;
+	    LOAD_UTF8_CHARCLASS(digit,"0");
 	    while (hardcount < max && scan < loceol &&
 		   swash_fetch(PL_utf8_digit,(U8*)scan)) {
 		scan += UTF8SKIP(scan);
@@ -3719,6 +3740,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
     case NDIGIT:
 	if (do_utf8) {
 	    loceol = PL_regeol;
+	    LOAD_UTF8_CHARCLASS(digit,"0");
 	    while (hardcount < max && scan < loceol &&
 		   !swash_fetch(PL_utf8_digit,(U8*)scan)) {
 		scan += UTF8SKIP(scan);
diff --git a/t/op/pat.t b/t/op/pat.t
index 2d86273..a82da60 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4,7 +4,7 @@
 # the format supported by op/regexp.t.  If you want to add a test
 # that does fit that format, add it to op/re_tests, not here.
 
-print "1..580\n";
+print "1..581\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -1238,8 +1238,6 @@ print "ok 247\n";
 {
     # bug id 20001008.001
 
-    use utf8; # BUG - should not be needed, but is, otherwise core dump
-
     my $test = 248;
     my @x = ("stra\337e 138","stra\337e 138");
     for (@x) {
@@ -1537,3 +1535,13 @@ print "ok 247\n";
 	for (576..580) { print "not ok $_\n" }
     }
 }
+
+{
+    # bug id 20010306.008: \w against a UTF-8 string without 'use utf8'
+
+    $a = "a\x{1234}";
+    # The original bug report had 'no utf8' here but that was irrelevant.
+    $a =~ m/\w/; # used to core dump
+
+    print "ok 581\n"; # reaching this line means the match did not crash
+}