[REPATCH] Re: [PATCH] Re: socketpair blip on unicos/mk, too

[p5sagit/p5-mst-13.2.git] / regexec.c
diff --git a/regexec.c b/regexec.c

index b7528e7..5f25888 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -393,11 +393,22 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
 #endif
 
+    if (prog->reganch & ROPT_UTF8) {
+       DEBUG_r(PerlIO_printf(Perl_debug_log,
+                             "UTF-8 regex...\n"));
+       PL_reg_flags |= RF_utf8;
+    }
+
     DEBUG_r({
-        char*s   = UTF ? sv_uni_display(dsv, sv, 60, 0) : strpos;
-        int  len = UTF ? strlen(s) : strend - strpos;
+        char *s   = PL_reg_match_utf8 ?
+                        sv_uni_display(dsv, sv, 60, 0) : strpos;
+        int   len = PL_reg_match_utf8 ?
+                        strlen(s) : strend - strpos;
         if (!PL_colorset)
              reginitcolors();
+        if (PL_reg_match_utf8)
+            DEBUG_r(PerlIO_printf(Perl_debug_log,
+                                  "UTF-8 target...\n"));
         PerlIO_printf(Perl_debug_log,
                       "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
                       PL_colors[4],PL_colors[5],PL_colors[0],
@@ -411,9 +422,6 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
              );
     });
 
-    if (prog->reganch & ROPT_UTF8)
-       PL_reg_flags |= RF_utf8;
-
     if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
        DEBUG_r(PerlIO_printf(Perl_debug_log,
                              "String too short... [re_intuit_start]\n"));
@@ -952,40 +960,89 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
            c1 = *(U8*)m;
            c2 = PL_fold_locale[c1];
          do_exactf:
-           e = strend - ln;
+           e = do_utf8 ? s + ln : strend - ln;
 
            if (norun && e < s)
                e = s;                  /* Due to minlen logic of intuit() */
 
+           /* The idea in the EXACTF* cases is to first find the
+            * first character of the EXACTF* node and then, if
+            * necessary, case-insensitively compare the full
+            * text of the node.  The c1 and c2 are the first
+            * characters (though in Unicode it gets a bit
+            * more complicated because there are more cases
+            * than just upper and lower: one is really supposed
+            * to use the so-called folding case for case-insensitive
+            * matching (called "loose matching" in Unicode)).  */
+
            if (do_utf8) {
-               STRLEN len;
-               /* The ibcmp_utf8() uses to_uni_fold() which is more
-                * correct folding for Unicode than using lowercase.
-                * However, it doesn't work quite fully since the folding
-                * is a one-to-many mapping and the regex optimizer is
-                * unaware of this, so it may throw out good matches.
-                * Fortunately, not getting this right is allowed
-                * for Unicode Regular Expression Support level 1,
-                * only one-to-one matching is required. --jhi */
-               if (c1 == c2)
+               UV c, f;
+               U8 tmpbuf [UTF8_MAXLEN+1];
+               U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+               STRLEN len, foldlen;
+               
+               if (c1 == c2) {
                    while (s <= e) {
-                       if ( utf8_to_uvchr((U8*)s, &len) == c1
-                            && (ln == 1 ||
-                                ibcmp_utf8(s, do_utf8,  strend - s,
-                                           m, UTF, ln)) )
+                       c = utf8_to_uvchr((U8*)s, &len);
+                       if ( c == c1
+                            && (ln == len ||
+                                ibcmp_utf8(s, (char **)0, 0,  do_utf8,
+                                           m, (char **)0, ln, UTF))
+                            && (norun || regtry(prog, s)) )
                            goto got_it;
+                       else {
+                            uvchr_to_utf8(tmpbuf, c);
+                            f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+                            if ( f != c
+                                 && (f == c1 || f == c2)
+                                 && (ln == foldlen ||
+                                     !ibcmp_utf8((char *) foldbuf,
+                                                 (char **)0, foldlen, do_utf8,
+                                                 m,
+                                                 (char **)0, ln,      UTF))
+                                 && (norun || regtry(prog, s)) )
+                                 goto got_it;
+                       }
                        s += len;
                    }
-               else
+               }
+               else {
                    while (s <= e) {
-                       UV c = utf8_to_uvchr((U8*)s, &len);
+                       c = utf8_to_uvchr((U8*)s, &len);
+
+                       /* Handle some of the three Greek sigmas cases.
+                         * Note that not all the possible combinations
+                         * are handled here: some of them are handled
+                         * by the standard folding rules, and
+                         * some of them (the character class or ANYOF
+                         * cases) are handled during compiletime in
+                         * regexec.c:S_regclass(). */
+                       if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
+                           c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
+                           c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
+
                        if ( (c == c1 || c == c2)
-                            && (ln == 1 ||
-                                ibcmp_utf8(s, do_utf8, strend - s,
-                                           m, UTF, ln)) )
+                            && (ln == len ||
+                                ibcmp_utf8(s, (char **)0, 0,  do_utf8,
+                                           m, (char **)0, ln, UTF))
+                            && (norun || regtry(prog, s)) )
                            goto got_it;
+                       else {
+                            uvchr_to_utf8(tmpbuf, c);
+                            f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+                            if ( f != c
+                                 && (f == c1 || f == c2)
+                                 && (ln == foldlen ||
+                                     !ibcmp_utf8((char *)foldbuf,
+                                                 (char **)0, foldlen, do_utf8,
+                                                 m,
+                                                 (char **)0, ln,      UTF))
+                                 && (norun || regtry(prog, s)) )
+                                 goto got_it;
+                       }
                        s += len;
                    }
+               }
            }
            else {
                if (c1 == c2)
@@ -2230,10 +2287,10 @@ S_regmatch(pTHX_ regnode *prog)
            s = STRING(scan);
            ln = STR_LEN(scan);
            if (do_utf8 != (UTF!=0)) {
-               /* The target and the pattern have differing "utf8ness". */
+               /* The target and the pattern have differing utf8ness. */
                char *l = locinput;
                char *e = s + ln;
-               STRLEN len;
+               STRLEN ulen;
 
                if (do_utf8) {
                    /* The target is utf8, the pattern is not utf8. */
@@ -2241,9 +2298,9 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                             sayNO;
                        if (NATIVE_TO_UNI(*(U8*)s) !=
-                           utf8_to_uvchr((U8*)l, &len))
+                           utf8_to_uvchr((U8*)l, &ulen))
                             sayNO;
-                       l += len;
+                       l += ulen;
                        s ++;
                    }
                }
@@ -2253,9 +2310,9 @@ S_regmatch(pTHX_ regnode *prog)
                        if (l >= PL_regeol)
                            sayNO;
                        if (NATIVE_TO_UNI(*((U8*)l)) !=
-                           utf8_to_uvchr((U8*)s, &len))
+                           utf8_to_uvchr((U8*)s, &ulen))
                            sayNO;
-                       s += len;
+                       s += ulen;
                        l ++;
                    }
                }
@@ -2263,7 +2320,7 @@ S_regmatch(pTHX_ regnode *prog)
                nextchr = UCHARAT(locinput);
                break;
            }
-           /* The target and the pattern have the same "utf8ness". */
+           /* The target and the pattern have the same utf8ness. */
            /* Inline the first character, for speed. */
            if (UCHARAT(s) != nextchr)
                sayNO;
@@ -2281,26 +2338,21 @@ S_regmatch(pTHX_ regnode *prog)
            s = STRING(scan);
            ln = STR_LEN(scan);
 
-           if (do_utf8) {
+           if (do_utf8 || UTF) {
+             /* Either the target or the pattern is utf8. */
                char *l = locinput;
-               char *e;
-               STRLEN ulen;
-               U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
-               e = s + ln;
-               while (s < e) {
-                   if (l >= PL_regeol)
-                       sayNO;
-                   toLOWER_utf8((U8*)l, tmpbuf, &ulen);
-                   if (memNE(s, (char*)tmpbuf, ulen))
-                       sayNO;
-                   s += UTF8SKIP(s);
-                   l += ulen;
-               }
-               locinput = l;
+               char *e = PL_regeol;
+
+               if (ibcmp_utf8(s, 0,  ln, do_utf8,
+                              l, &e, 0,  UTF))
+                    sayNO;
+               locinput = e;
                nextchr = UCHARAT(locinput);
                break;
            }
 
+           /* Neither the target nor the pattern is utf8. */
+
            /* Inline the first character, for speed. */
            if (UCHARAT(s) != nextchr &&
                UCHARAT(s) != ((OP(scan) == EXACTF)
@@ -2317,11 +2369,13 @@ S_regmatch(pTHX_ regnode *prog)
            break;
        case ANYOF:
            if (do_utf8) {
-               if (!reginclass(scan, (U8*)locinput, do_utf8))
+               STRLEN inclasslen = PL_regeol - locinput;
+
+               if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8))
                    sayNO;
                if (locinput >= PL_regeol)
                    sayNO;
-               locinput += PL_utf8skip[nextchr];
+               locinput += inclasslen;
                nextchr = UCHARAT(locinput);
            }
            else {
@@ -2522,16 +2576,21 @@ S_regmatch(pTHX_ regnode *prog)
            nextchr = UCHARAT(++locinput);
            break;
        case CLUMP:
-           LOAD_UTF8_CHARCLASS(mark,"~");
-           if (locinput >= PL_regeol ||
-               swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
-               sayNO;
-           locinput += PL_utf8skip[nextchr];
-           while (locinput < PL_regeol &&
-                  swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
-               locinput += UTF8SKIP(locinput);
-           if (locinput > PL_regeol)
+           if (locinput >= PL_regeol)
                sayNO;
+           if  (do_utf8) {
+               LOAD_UTF8_CHARCLASS(mark,"~");
+               if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
+                   sayNO;
+               locinput += PL_utf8skip[nextchr];
+               while (locinput < PL_regeol &&
+                      swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
+                   locinput += UTF8SKIP(locinput);
+               if (locinput > PL_regeol)
+                   sayNO;
+           } 
+           else
+              locinput++;
            nextchr = UCHARAT(locinput);
            break;
        case REFFL:
@@ -4050,10 +4109,11 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
 */
 
 SV *
-Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
+Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
 {
-    SV *sw = NULL;
-    SV *si = NULL;
+    SV *sw  = NULL;
+    SV *si  = NULL;
+    SV *alt = NULL;
 
     if (PL_regdata && PL_regdata->count) {
        U32 n = ARG(node);
@@ -4061,10 +4121,11 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
        if (PL_regdata->what[n] == 's') {
            SV *rv = (SV*)PL_regdata->data[n];
            AV *av = (AV*)SvRV((SV*)rv);
-           SV **a;
+           SV **a, **b;
        
-           si = *av_fetch(av, 0, FALSE);
-           a  =  av_fetch(av, 1, FALSE);
+           si  = *av_fetch(av, 0, FALSE);
+           a   =  av_fetch(av, 1, FALSE);
+           b   =  av_fetch(av, 2, FALSE);
        
            if (a)
                sw = *a;
@@ -4072,11 +4133,15 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
                sw = swash_init("utf8", "", si, 1, 0);
                (void)av_store(av, 1, sw);
            }
+           if (b)
+               alt = *b;
        }
     }
        
-    if (initsvp)
-       *initsvp = si;
+    if (listsvp)
+       *listsvp = si;
+    if (altsvp)
+       *altsvp  = alt;
 
     return sw;
 }
@@ -4086,16 +4151,20 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
  */
 
 STATIC bool
-S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
+S_reginclasslen(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
 {
     char flags = ANYOF_FLAGS(n);
     bool match = FALSE;
     UV c;
     STRLEN len = 0;
+    STRLEN plen;
 
     c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
 
+    plen = lenp ? *lenp : UNISKIP(c);
     if (do_utf8 || (flags & ANYOF_UNICODE)) {
+        if (lenp)
+           *lenp = 0;
        if (do_utf8 && !ANYOF_RUNTIME(n)) {
            if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
                match = TRUE;
@@ -4103,24 +4172,46 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
        if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
            match = TRUE;
        if (!match) {
-           SV *sw = regclass_swash(n, TRUE, 0);
+           AV *av;
+           SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
        
            if (sw) {
                if (swash_fetch(sw, p, do_utf8))
                    match = TRUE;
                else if (flags & ANYOF_FOLD) {
-                   STRLEN ulen;
                    U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+                   STRLEN tmplen;
 
-                   to_utf8_fold(p, tmpbuf, &ulen);
-                   if (swash_fetch(sw, tmpbuf, do_utf8))
-                       match = TRUE;
-                   to_utf8_upper(p, tmpbuf, &ulen);
-                   if (swash_fetch(sw, tmpbuf, do_utf8))
-                       match = TRUE;
+                   if (!match && lenp && av) {
+                       I32 i;
+                     
+                       for (i = 0; i <= av_len(av); i++) {
+                           SV* sv = *av_fetch(av, i, FALSE);
+                           STRLEN len;
+                           char *s = SvPV(sv, len);
+                       
+                           if (len <= plen && memEQ(s, p, len)) {
+                               *lenp = len;
+                               match = TRUE;
+                               break;
+                           }
+                       }
+                   }
+                   if (!match) {
+                       to_utf8_fold(p, tmpbuf, &tmplen);
+                       if (swash_fetch(sw, tmpbuf, do_utf8))
+                           match = TRUE;
+                   }
+                   if (!match) {
+                       to_utf8_upper(p, tmpbuf, &tmplen);
+                       if (swash_fetch(sw, tmpbuf, do_utf8))
+                           match = TRUE;
+                   }
                }
            }
        }
+       if (match && lenp && *lenp == 0)
+           *lenp = UNISKIP(c);
     }
     if (!match && c < 256) {
        if (ANYOF_BITMAP_TEST(n, c))
@@ -4181,6 +4272,12 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
     return (flags & ANYOF_INVERT) ? !match : match;
 }
 
+STATIC bool
+S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
+{
+    return S_reginclasslen(aTHX_ n, p, 0, do_utf8);
+}
+
 STATIC U8 *
 S_reghop(pTHX_ U8 *s, I32 off)
 {