From: Jarkko Hietaniemi Date: Sat, 10 Mar 2001 21:38:30 +0000 (+0000) Subject: Fix for ID 20010306.008, UTF-8 and \w without 'use utf8' coredump. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=8269fa76d2972b02e844f46a88d03e7d25fb51d7;p=p5sagit%2Fp5-mst-13.2.git Fix for ID 20010306.008, UTF-8 and \w without 'use utf8' coredump. p4raw-id: //depot/perl@9098 --- diff --git a/regcomp.c b/regcomp.c index 2e5aaf3..227737c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2648,22 +2648,16 @@ tryagain: ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; nextchar(pRExC_state); - if (UTF && !PL_utf8_mark) - is_utf8_mark((U8*)"~"); /* preload table */ break; case 'w': ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_alnum) - is_utf8_alnum((U8*)"a"); /* preload table */ break; case 'W': ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_alnum) - is_utf8_alnum((U8*)"a"); /* preload table */ break; case 'b': RExC_seen_zerolen++; @@ -2671,8 +2665,6 @@ tryagain: ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_alnum) - is_utf8_alnum((U8*)"a"); /* preload table */ break; case 'B': RExC_seen_zerolen++; @@ -2680,36 +2672,26 @@ tryagain: ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_alnum) - is_utf8_alnum((U8*)"a"); /* preload table */ break; case 's': ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_space) - is_utf8_space((U8*)" "); /* preload table */ break; case 'S': ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_space) - is_utf8_space((U8*)" "); /* preload table */ break; case 'd': ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_digit) - is_utf8_digit((U8*)"1"); /* preload table */ break; case 'D': ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_digit) - is_utf8_digit((U8*)"1"); /* preload table */ break; case 'p': case 'P': diff --git a/regexec.c b/regexec.c index 1fa26c9..a7b6411 100644 --- a/regexec.c +++ b/regexec.c @@ -123,8 +123,9 @@ #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim)) -static void restore_pos(pTHXo_ void *arg); +#define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) (void)CAT2(is_utf8_, a)((U8*)b); } STMT_END +static void restore_pos(pTHXo_ void *arg); STATIC CHECKPOINT S_regcppush(pTHX_ I32 parenfloor) @@ -953,6 +954,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } tmp = ((OP(c) == BOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); + LOAD_UTF8_CHARCLASS(alnum,"a"); while (s < strend) { if (tmp == !(OP(c) == BOUND ? swash_fetch(PL_utf8_alnum, (U8*)s) : @@ -995,6 +997,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } tmp = ((OP(c) == NBOUND ? isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); + LOAD_UTF8_CHARCLASS(alnum,"a"); while (s < strend) { if (tmp == !(OP(c) == NBOUND ? swash_fetch(PL_utf8_alnum, (U8*)s) : @@ -1023,6 +1026,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta break; case ALNUM: if (do_utf8) { + LOAD_UTF8_CHARCLASS(alnum,"a"); while (s < strend) { if (swash_fetch(PL_utf8_alnum, (U8*)s)) { if (tmp && (norun || regtry(prog, s))) @@ -1080,6 +1084,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta break; case NALNUM: if (do_utf8) { + LOAD_UTF8_CHARCLASS(alnum,"a"); while (s < strend) { if (!swash_fetch(PL_utf8_alnum, (U8*)s)) { if (tmp && (norun || regtry(prog, s))) @@ -1137,6 +1142,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta break; case SPACE: if (do_utf8) { + LOAD_UTF8_CHARCLASS(space," "); while (s < strend) { if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) { if (tmp && (norun || regtry(prog, s))) @@ -1194,6 +1200,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta break; case NSPACE: if (do_utf8) { + LOAD_UTF8_CHARCLASS(space," "); while (s < strend) { if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) { if (tmp && (norun || regtry(prog, s))) @@ -1251,6 +1258,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta break; case DIGIT: if (do_utf8) { + LOAD_UTF8_CHARCLASS(digit,"0"); while (s < strend) { if (swash_fetch(PL_utf8_digit,(U8*)s)) { if (tmp && (norun || regtry(prog, s))) @@ -1308,6 +1316,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta break; case NDIGIT: if (do_utf8) { + LOAD_UTF8_CHARCLASS(digit,"0"); while (s < strend) { if (!swash_fetch(PL_utf8_digit,(U8*)s)) { if (tmp && (norun || regtry(prog, s))) @@ -2225,6 +2234,7 @@ S_regmatch(pTHX_ regnode *prog) if (!nextchr && locinput >= PL_regeol) sayNO; if (do_utf8) { + LOAD_UTF8_CHARCLASS(alnum,"a"); if (OP(scan) == NALNUM ? swash_fetch(PL_utf8_alnum, (U8*)locinput) : isALNUM_LC_utf8((U8*)locinput)) @@ -2257,6 +2267,7 @@ S_regmatch(pTHX_ regnode *prog) } if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM_uni(ln); + LOAD_UTF8_CHARCLASS(alnum,"a"); n = swash_fetch(PL_utf8_alnum, (U8*)locinput); } else { @@ -2288,6 +2299,7 @@ S_regmatch(pTHX_ regnode *prog) sayNO; if (do_utf8) { if (UTF8_IS_CONTINUED(nextchr)) { + LOAD_UTF8_CHARCLASS(space," "); if (!(OP(scan) == SPACE ? swash_fetch(PL_utf8_space, (U8*)locinput) : isSPACE_LC_utf8((U8*)locinput))) @@ -2317,6 +2329,7 @@ S_regmatch(pTHX_ regnode *prog) if (!nextchr && locinput >= PL_regeol) sayNO; if (do_utf8) { + LOAD_UTF8_CHARCLASS(space," "); if (OP(scan) == NSPACE ? swash_fetch(PL_utf8_space, (U8*)locinput) : isSPACE_LC_utf8((U8*)locinput)) @@ -2339,6 +2352,7 @@ S_regmatch(pTHX_ regnode *prog) if (!nextchr) sayNO; if (do_utf8) { + LOAD_UTF8_CHARCLASS(digit,"0"); if (!(OP(scan) == DIGIT ? swash_fetch(PL_utf8_digit, (U8*)locinput) : isDIGIT_LC_utf8((U8*)locinput))) @@ -2361,6 +2375,7 @@ S_regmatch(pTHX_ regnode *prog) if (!nextchr && locinput >= PL_regeol) sayNO; if (do_utf8) { + LOAD_UTF8_CHARCLASS(digit,"0"); if (OP(scan) == NDIGIT ? swash_fetch(PL_utf8_digit, (U8*)locinput) : isDIGIT_LC_utf8((U8*)locinput)) @@ -2377,6 +2392,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(++locinput); break; case CLUMP: + LOAD_UTF8_CHARCLASS(mark,"~"); if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark,(U8*)locinput)) sayNO; locinput += PL_utf8skip[nextchr]; @@ -3598,6 +3614,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) case ALNUM: if (do_utf8) { loceol = PL_regeol; + LOAD_UTF8_CHARCLASS(alnum,"a"); while (hardcount < max && scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) { scan += UTF8SKIP(scan); @@ -3625,6 +3642,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) case NALNUM: if (do_utf8) { loceol = PL_regeol; + LOAD_UTF8_CHARCLASS(alnum,"a"); while (hardcount < max && scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) { scan += UTF8SKIP(scan); @@ -3652,6 +3670,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) case SPACE: if (do_utf8) { loceol = PL_regeol; + LOAD_UTF8_CHARCLASS(space," "); while (hardcount < max && scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { scan += UTF8SKIP(scan); @@ -3679,6 +3698,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) case NSPACE: if (do_utf8) { loceol = PL_regeol; + LOAD_UTF8_CHARCLASS(space," "); while (hardcount < max && scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { scan += UTF8SKIP(scan); @@ -3706,6 +3726,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) case DIGIT: if (do_utf8) { loceol = PL_regeol; + LOAD_UTF8_CHARCLASS(digit,"0"); while (hardcount < max && scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) { scan += UTF8SKIP(scan); @@ -3719,6 +3740,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) case NDIGIT: if (do_utf8) { loceol = PL_regeol; + LOAD_UTF8_CHARCLASS(digit,"0"); while (hardcount < max && scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) { scan += UTF8SKIP(scan); diff --git a/t/op/pat.t b/t/op/pat.t index 2d86273..a82da60 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4,7 +4,7 @@ # the format supported by op/regexp.t. If you want to add a test # that does fit that format, add it to op/re_tests, not here. -print "1..580\n"; +print "1..581\n"; BEGIN { chdir 't' if -d 't'; @@ -1238,8 +1238,6 @@ print "ok 247\n"; { # bug id 20001008.001 - use utf8; # BUG - should not be needed, but is, otherwise core dump - my $test = 248; my @x = ("stra\337e 138","stra\337e 138"); for (@x) { @@ -1537,3 +1535,13 @@ print "ok 247\n"; for (576..580) { print "not ok $_\n" } } } + +{ + # bug id 20010306.008 + + $a = "a\x{1234}"; + # The original bug report had 'no utf8' here but that was irrelevant. + $a =~ m/\w/; # used to core dump + + print "ok 581\n"; +}