From: Jarkko Hietaniemi Date: Tue, 6 Nov 2001 15:18:41 +0000 (+0000) Subject: More UTF-8 EXACT tweaking, plus a forgotten UTF-8 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=5ff6fc6d3e84f8da3756f8b5246037f5e410021e;p=p5sagit%2Fp5-mst-13.2.git More UTF-8 EXACT tweaking, plus a forgotten UTF-8 toggle-on from the encoding pragma. p4raw-id: //depot/perl@12872 --- diff --git a/regcomp.c b/regcomp.c index cd3857e..12e0395 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1764,7 +1764,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ pm->op_pmflags = RExC_flags16; if (UTF) - r->reganch |= ROPT_UTF8; + r->reganch |= ROPT_UTF8; /* Unicode in it? */ r->regstclass = NULL; if (RExC_naughty >= 10) /* Probably an expensive pattern. */ r->reganch |= ROPT_NAUGHTY; @@ -3168,6 +3168,7 @@ tryagain: RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); } else RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); + RExC_utf8 = 1; } return(ret); diff --git a/regexec.c b/regexec.c index 60d93f7..712c4d9 100644 --- a/regexec.c +++ b/regexec.c @@ -2204,43 +2204,40 @@ S_regmatch(pTHX_ regnode *prog) s = STRING(scan); ln = STR_LEN(scan); if (do_utf8 != (UTF!=0)) { + /* The target and the pattern have differing "utf8ness". */ char *l = locinput; char *e = s + ln; STRLEN len; - if (do_utf8) + if (do_utf8) { + /* The target is utf8, the pattern is not utf8. */ while (s < e) { - UV uv; - if (l >= PL_regeol) - sayNO; - uv = NATIVE_TO_UNI(*(U8*)s); - if (UTF8_IS_START(uv)) { - len = UTF8SKIP(s); - if (memNE(s, l, len)) - sayNO; - l += len; - s += len; - } else { - if (uv != utf8_to_uvchr((U8*)l, &len)) - sayNO; - l += len; - s ++; - } + sayNO; + if (NATIVE_TO_UNI(*(U8*)s) != + utf8_to_uvchr((U8*)l, &len)) + sayNO; + l += len; + s ++; } - else + } + else { + /* The target is not utf8, the pattern is utf8. */ while (s < e) { if (l >= PL_regeol) sayNO; - if (*((U8*)l) != utf8_to_uvchr((U8*)s, &len)) + if (NATIVE_TO_UNI(*((U8*)l)) != + utf8_to_uvchr((U8*)s, &len)) sayNO; s += len; l ++; } + } locinput = l; nextchr = UCHARAT(locinput); break; } + /* The target and the pattern have the same "utf8ness". */ /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchr) sayNO;