From: Jarkko Hietaniemi Date: Mon, 31 Dec 2001 03:42:15 +0000 (+0000) Subject: Delay folding until necessary; start of handling X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=80aecb99acb05e810c6136645b97c6bc9f385ca3;p=p5sagit%2Fp5-mst-13.2.git Delay folding until necessary; start of handling folding into several characters. p4raw-id: //depot/perl@13969 --- diff --git a/regcomp.c b/regcomp.c index b061991..cac14bf 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2986,7 +2986,8 @@ tryagain: char *oldp, *s; STRLEN numlen; STRLEN ulen; - U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + STRLEN foldlen; + U8 tmpbuf[UTF8_MAXLEN_UCLC+1], *foldbuf; parse_start = RExC_parse - 1; @@ -3130,17 +3131,28 @@ tryagain: } if (RExC_flags16 & PMf_EXTENDED) p = regwhite(p, RExC_end); - if (UTF && FOLD) { - toFOLD_uni(ender, tmpbuf, &ulen); - ender = utf8_to_uvchr(tmpbuf, 0); - } + if (UTF && FOLD) + toFOLD_uni(ender, tmpbuf, &foldlen); if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen; + if (FOLD) { + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) { + ender = utf8_to_uvchr(foldbuf, &numlen); + reguni(pRExC_state, ender, s, &numlen); + s += numlen; + len += numlen; + foldbuf += numlen; + } + } + else { + reguni(pRExC_state, ender, s, &numlen); + s += numlen; + len += numlen; + } } else { len++; @@ -3149,9 +3161,23 @@ tryagain: break; } if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { - reguni(pRExC_state, ender, s, &numlen); - s += numlen; - len += numlen - 1; + if (FOLD) { + for (foldbuf = tmpbuf; + foldlen; + foldlen -= numlen) { + ender = utf8_to_uvchr(foldbuf, &numlen); + reguni(pRExC_state, ender, s, &numlen); + s += numlen; + len += numlen; + foldbuf += numlen; + } + } + else { + reguni(pRExC_state, ender, s, &numlen); + s += numlen; + len += numlen; + } + len--; } else REGC(ender, s++); diff --git a/regexec.c b/regexec.c index c932165..e67774d 100644 --- a/regexec.c +++ b/regexec.c @@ -979,38 +979,59 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta * Fortunately, not getting this right is allowed * for Unicode Regular Expression Support level 1, * only one-to-one matching is required. --jhi */ + if (c1 == c2) { while (s <= e) { c = utf8_to_uvchr((U8*)s, &len); - uvchr_to_utf8(tmpbuf, c); - to_utf8_fold(tmpbuf, foldbuf, &foldlen); - f = utf8_to_uvchr(foldbuf, 0); - - if ( ((c == c1 && ln == len) || - (f == c1 && ln == foldlen) || - !ibcmp_utf8(s, do_utf8, (I32)(strend - s), - m, UTF, (I32)ln)) + if ( c == c1 + && (ln == len || + !ibcmp_utf8(s, do_utf8, strend - s, + m, UTF, ln)) && (norun || regtry(prog, s)) ) goto got_it; + else { + uvchr_to_utf8(tmpbuf, c); + to_utf8_fold(tmpbuf, foldbuf, &foldlen); + f = utf8_to_uvchr(foldbuf, 0); + if ( f != c + && (f == c1 || f == c2) + && (ln == foldlen || + !ibcmp_utf8((char *)foldbuf, + do_utf8, foldlen, + m, UTF, ln)) + && (norun || regtry(prog, s)) ) + goto got_it; + } s += len; } } else { while (s <= e) { c = utf8_to_uvchr((U8*)s, &len); - uvchr_to_utf8(tmpbuf, c); - to_utf8_fold(tmpbuf, foldbuf, &foldlen); - f = utf8_to_uvchr(foldbuf, 0); - + if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA || c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA; - if ( (((c == c1 || c == c2) && ln == len) || - ((f == c1 || f == c2) && ln == foldlen) || - !ibcmp_utf8(s, do_utf8, (I32)(strend - s), - m, UTF, (I32)ln)) + + if ( (c == c1 || c == c2) + && (ln == len || + !ibcmp_utf8(s, do_utf8, strend - s, + m, UTF, ln)) && (norun || regtry(prog, s)) ) goto got_it; + else { + uvchr_to_utf8(tmpbuf, c); + to_utf8_fold(tmpbuf, foldbuf, &foldlen); + f = utf8_to_uvchr(foldbuf, 0); + if ( f != c + && (f == c1 || f == c2) + && (ln == foldlen || + !ibcmp_utf8((char *)foldbuf, + do_utf8, foldlen, + m, UTF, ln)) + && (norun || regtry(prog, s)) ) + goto got_it; + } s += len; } } @@ -2372,10 +2393,21 @@ S_regmatch(pTHX_ regnode *prog) sayNO; if (UTF8SKIP(s) != UTF8SKIP(l) || memNE(s, (char*)l, UTF8SKIP(s))) { - to_utf8_fold((U8*)l, tmpbuf, &ulen); - if (UTF8SKIP(s) != ulen || - memNE(s, (char*)tmpbuf, ulen)) - sayNO; + U8 lfoldbuf[UTF8_MAXLEN_FOLD+1]; + STRLEN lfoldlen; + + to_utf8_fold((U8*)l, lfoldbuf, &lfoldlen); + if (UTF8SKIP(s) != lfoldlen || + memNE(s, (char*)lfoldbuf, lfoldlen)) { + U8 sfoldbuf[UTF8_MAXLEN_FOLD+1]; + STRLEN sfoldlen; + + to_utf8_fold((U8*)s, sfoldbuf, &sfoldlen); + if (sfoldlen != lfoldlen || + memNE((char*)sfoldbuf, + (char*)lfoldbuf, lfoldlen)) + sayNO; + } } l += UTF8SKIP(l); s += UTF8SKIP(s);