Delay folding until necessary; start of handling folding into several characters.
Jarkko Hietaniemi [Mon, 31 Dec 2001 03:42:15 +0000 (03:42 +0000)]

p4raw-id: //depot/perl@13969

regcomp.c
regexec.c

index b061991..cac14bf 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2986,7 +2986,8 @@ tryagain:
            char *oldp, *s;
            STRLEN numlen;
            STRLEN ulen;
-           U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+           STRLEN foldlen;
+           U8 tmpbuf[UTF8_MAXLEN_UCLC+1], *foldbuf;
 
             parse_start = RExC_parse - 1;
 
@@ -3130,17 +3131,28 @@ tryagain:
                }
                if (RExC_flags16 & PMf_EXTENDED)
                    p = regwhite(p, RExC_end);
-               if (UTF && FOLD) {
-                   toFOLD_uni(ender, tmpbuf, &ulen);
-                   ender = utf8_to_uvchr(tmpbuf, 0);
-               }
+               if (UTF && FOLD)
+                   toFOLD_uni(ender, tmpbuf, &foldlen);
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
                        p = oldp;
                    else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                       reguni(pRExC_state, ender, s, &numlen);
-                       s += numlen;
-                       len += numlen;
+                        if (FOLD) {
+                             for (foldbuf = tmpbuf;
+                                  foldlen;
+                                  foldlen -= numlen) {
+                                  ender = utf8_to_uvchr(foldbuf, &numlen);
+                                  reguni(pRExC_state, ender, s, &numlen);
+                                  s       += numlen;
+                                  len     += numlen;
+                                  foldbuf += numlen;
+                             }
+                        }
+                        else {
+                             reguni(pRExC_state, ender, s, &numlen);
+                             s   += numlen;
+                             len += numlen;
+                        }
                    }
                    else {
                        len++;
@@ -3149,9 +3161,23 @@ tryagain:
                    break;
                }
                if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                   reguni(pRExC_state, ender, s, &numlen);
-                   s += numlen;
-                   len += numlen - 1;
+                    if (FOLD) {
+                         for (foldbuf = tmpbuf;
+                              foldlen;
+                              foldlen -= numlen) {
+                              ender = utf8_to_uvchr(foldbuf, &numlen);
+                              reguni(pRExC_state, ender, s, &numlen);
+                              s       += numlen;
+                              len     += numlen;
+                              foldbuf += numlen;
+                         }
+                    }
+                    else {
+                         reguni(pRExC_state, ender, s, &numlen);
+                         s   += numlen;
+                         len += numlen;
+                    }
+                    len--;
                }
                else
                    REGC(ender, s++);
index c932165..e67774d 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -979,38 +979,59 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
                 * Fortunately, not getting this right is allowed
                 * for Unicode Regular Expression Support level 1,
                 * only one-to-one matching is required. --jhi */
+
                if (c1 == c2) {
                    while (s <= e) {
                        c = utf8_to_uvchr((U8*)s, &len);
-                       uvchr_to_utf8(tmpbuf, c);
-                       to_utf8_fold(tmpbuf, foldbuf, &foldlen);
-                       f = utf8_to_uvchr(foldbuf, 0);
-                       
-                       if ( ((c == c1 && ln == len) ||
-                             (f == c1 && ln == foldlen) ||
-                             !ibcmp_utf8(s, do_utf8, (I32)(strend - s),
-                                         m, UTF, (I32)ln))
+                       if ( c == c1
+                            && (ln == len ||
+                                !ibcmp_utf8(s, do_utf8, strend - s,
+                                            m, UTF, ln))
                             && (norun || regtry(prog, s)) )
                            goto got_it;
+                       else {
+                            uvchr_to_utf8(tmpbuf, c);
+                            to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+                            f = utf8_to_uvchr(foldbuf, 0);
+                            if ( f != c
+                                 && (f == c1 || f == c2)
+                                 && (ln == foldlen ||
+                                     !ibcmp_utf8((char *)foldbuf,
+                                                 do_utf8, foldlen,
+                                                 m, UTF, ln))
+                                 && (norun || regtry(prog, s)) )
+                                 goto got_it;
+                       }
                        s += len;
                    }
                }
                else {
                    while (s <= e) {
                        c = utf8_to_uvchr((U8*)s, &len);
-                       uvchr_to_utf8(tmpbuf, c);
-                       to_utf8_fold(tmpbuf, foldbuf, &foldlen);
-                       f = utf8_to_uvchr(foldbuf, 0);
-                       
+
                        if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
                            c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
                            c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
-                       if ( (((c == c1 || c == c2) && ln == len) ||
-                             ((f == c1 || f == c2) && ln == foldlen) ||
-                             !ibcmp_utf8(s, do_utf8, (I32)(strend - s),
-                                         m, UTF, (I32)ln))
+
+                       if ( (c == c1 || c == c2)
+                            && (ln == len ||
+                                !ibcmp_utf8(s, do_utf8, strend - s,
+                                            m, UTF, ln))
                             && (norun || regtry(prog, s)) )
                            goto got_it;
+                       else {
+                            uvchr_to_utf8(tmpbuf, c);
+                            to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+                            f = utf8_to_uvchr(foldbuf, 0);
+                            if ( f != c
+                                 && (f == c1 || f == c2)
+                                 && (ln == foldlen ||
+                                     !ibcmp_utf8((char *)foldbuf,
+                                                 do_utf8, foldlen,
+                                                 m, UTF, ln))
+                                 && (norun || regtry(prog, s)) )
+                                 goto got_it;
+                       }
                        s += len;
                    }
                }
@@ -2372,10 +2393,21 @@ S_regmatch(pTHX_ regnode *prog)
                               sayNO;
                          if (UTF8SKIP(s) != UTF8SKIP(l) ||
                              memNE(s, (char*)l, UTF8SKIP(s))) {
-                              to_utf8_fold((U8*)l, tmpbuf, &ulen);
-                              if (UTF8SKIP(s) != ulen ||
-                                  memNE(s, (char*)tmpbuf, ulen))
-                                   sayNO;
+                              U8 lfoldbuf[UTF8_MAXLEN_FOLD+1];
+                              STRLEN lfoldlen;
+
+                              to_utf8_fold((U8*)l, lfoldbuf, &lfoldlen);
+                              if (UTF8SKIP(s) != lfoldlen ||
+                                  memNE(s, (char*)lfoldbuf, lfoldlen)) {
+                                   U8 sfoldbuf[UTF8_MAXLEN_FOLD+1];
+                                   STRLEN sfoldlen;
+
+                                   to_utf8_fold((U8*)s, sfoldbuf, &sfoldlen);
+                                   if (sfoldlen != lfoldlen ||
+                                       memNE((char*)sfoldbuf,
+                                             (char*)lfoldbuf, lfoldlen))
+                                     sayNO;
+                              }
                          }
                          l += UTF8SKIP(l);
                          s += UTF8SKIP(s);