The test requires perlio.
[p5sagit/p5-mst-13.2.git] / regcomp.c
index 53d8947..b453116 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -69,7 +69,7 @@
  *
  ****    Alterations to Henry's code are...
  ****
- ****    Copyright (c) 1991-2001, Larry Wall
+ ****    Copyright (c) 1991-2002, Larry Wall
  ****
  ****    You may distribute under the terms of either the GNU General Public
  ****    License or the Artistic License, as specified in the README file.
@@ -736,6 +736,50 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                    n = nnext;
                }
            }
+
+           if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
+/*
+  Two problematic code points in Unicode casefolding of EXACT nodes:
+
+   U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+   U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+
+   which casefold to
+
+   Unicode                     UTF-8
+
+   U+03B9 U+0308 U+0301                0xCE 0xB9 0xCC 0x88 0xCC 0x81
+   U+03C5 U+0308 U+0301                0xCF 0x85 0xCC 0x88 0xCC 0x81
+
+   This means that in case-insensitive matching (or "loose matching",
+   as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
+   length of the above casefolded versions) can match a target string
+   of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
+   This would rather mess up the minimum length computation.
+
+   What we'll do is to look for the tail four bytes, and then peek
+   at the preceding two bytes to see whether we need to decrease
+   the minimum length by four (six minus two).
+
+   Thanks to the design of UTF-8, there cannot be false matches:
+   A sequence of valid UTF-8 bytes cannot be a subsequence of
+   another valid sequence of UTF-8 bytes.
+
+*/
+                char *s0 = STRING(scan), *s, *t;
+                char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
+                char *t0 = "\xcc\x88\xcc\x81";
+                char *t1 = t0 + 3;
+                
+                for (s = s0 + 2;
+                     s < s2 && (t = ninstr(s, s1, t0, t1));
+                     s = t + 4) {
+                     if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
+                         ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+                          min -= 4;
+                }
+           }
+
 #ifdef DEBUGGING
            /* Allow dumping */
            n = scan + NODE_SZ_STR(scan);
@@ -1690,17 +1734,15 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     if (exp == NULL)
        FAIL("NULL regexp argument");
 
-    /* XXXX This looks very suspicious... */
-    if (pm->op_pmdynflags & PMdf_CMP_UTF8)
-        RExC_utf8 = 1;
-    else
-        RExC_utf8 = 0;
+    RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
 
     RExC_precomp = exp;
-    DEBUG_r(if (!PL_colorset) reginitcolors());
-    DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
-                     PL_colors[4],PL_colors[5],PL_colors[0],
-                     (int)(xend - exp), RExC_precomp, PL_colors[1]));
+    DEBUG_r({
+        if (!PL_colorset) reginitcolors();
+        PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+                      PL_colors[4],PL_colors[5],PL_colors[0],
+                      (int)(xend - exp), RExC_precomp, PL_colors[1]);
+    });
     RExC_flags16 = pm->op_pmflags;
     RExC_sawback = 0;
 
@@ -1850,7 +1892,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
            first = NEXTOPER(first);
            goto again;
        }
-       else if ((OP(first) == STAR &&
+       else if (!sawopen && (OP(first) == STAR &&
            PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
            !(r->reganch & ROPT_ANCH) )
        {
@@ -2126,6 +2168,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
                /* FALL THROUGH*/
            case '?':           /* (??...) */
                logical = 1;
+               if (*RExC_parse != '{')
+                   goto unknown;
                paren = *RExC_parse++;
                /* FALL THROUGH */
            case '{':           /* (?{...}) */
@@ -2363,9 +2407,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
     }
     else if (paren != '?')             /* Not Conditional */
        ret = br;
-    if (flags&HASWIDTH)
-       *flagp |= HASWIDTH;
-    *flagp |= flags&SPSTART;
+    *flagp |= flags & (SPSTART | HASWIDTH);
     lastbr = br;
     while (*RExC_parse == '|') {
        if (!SIZE_ONLY && RExC_extralen) {
@@ -2987,8 +3029,8 @@ tryagain:
            register char *p;
            char *oldp, *s;
            STRLEN numlen;
-           STRLEN ulen;
-           U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+           STRLEN foldlen;
+           U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
 
             parse_start = RExC_parse - 1;
 
@@ -3019,6 +3061,8 @@ tryagain:
                case '\\':
                    switch (*++p) {
                    case 'A':
+                   case 'C':
+                   case 'X':
                    case 'G':
                    case 'Z':
                    case 'z':
@@ -3131,16 +3175,38 @@ tryagain:
                if (RExC_flags16 & PMf_EXTENDED)
                    p = regwhite(p, RExC_end);
                if (UTF && FOLD) {
-                   toLOWER_uni(ender, tmpbuf, &ulen);
-                   ender = utf8_to_uvchr(tmpbuf, 0);
+                   /* Prime the casefolded buffer. */
+                   ender = toFOLD_uni(ender, tmpbuf, &foldlen);
                }
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
                        p = oldp;
-                   else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                       reguni(pRExC_state, ender, s, &numlen);
-                       s += numlen;
-                       len += numlen;
+                   else if (UTF) {
+                        if (FOLD) {
+                             /* Emit all the Unicode characters. */
+                             for (foldbuf = tmpbuf;
+                                  foldlen;
+                                  foldlen -= numlen) {
+                                  ender = utf8_to_uvchr(foldbuf, &numlen);
+                                  if (numlen > 0) {
+                                       reguni(pRExC_state, ender, s, &numlen);
+                                       s       += numlen;
+                                       len     += numlen;
+                                       foldbuf += numlen;
+                                       if (numlen >= foldlen)
+                                            break;
+                                  }
+                                  else
+                                       break; /* "Can't happen." */
+                             }
+                        }
+                        else {
+                             reguni(pRExC_state, ender, s, &numlen);
+                             if (numlen > 0) {
+                                  s   += numlen;
+                                  len += numlen;
+                             }
+                        }
                    }
                    else {
                        len++;
@@ -3148,10 +3214,33 @@ tryagain:
                    }
                    break;
                }
-               if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
-                   reguni(pRExC_state, ender, s, &numlen);
-                   s += numlen;
-                   len += numlen - 1;
+               if (UTF) {
+                    if (FOLD) {
+                         /* Emit all the Unicode characters. */
+                         for (foldbuf = tmpbuf;
+                              foldlen;
+                              foldlen -= numlen) {
+                              ender = utf8_to_uvchr(foldbuf, &numlen);
+                              if (numlen > 0) {
+                                   reguni(pRExC_state, ender, s, &numlen);
+                                   len     += numlen;
+                                   s       += numlen;
+                                   foldbuf += numlen;
+                                   if (numlen >= foldlen)
+                                        break;
+                              }
+                              else
+                                   break;
+                         }
+                    }
+                    else {
+                         reguni(pRExC_state, ender, s, &numlen);
+                         if (numlen > 0) {
+                              s   += numlen;
+                              len += numlen;
+                         }
+                    }
+                    len--;
                }
                else
                    REGC(ender, s++);
@@ -3180,20 +3269,28 @@ tryagain:
        break;
     }
 
-    if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) {
+    /* If the encoding pragma is in effect recode the text of
+     * any EXACT-kind nodes. */
+    if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
         STRLEN oldlen = STR_LEN(ret);
         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
-        char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
-        STRLEN newlen = SvCUR(sv);
-        if (!SIZE_ONLY) {
-             DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
-                                   (int)oldlen, STRING(ret), (int)newlen, s));
-             Copy(s, STRING(ret), newlen, char);
-             STR_LEN(ret) += newlen - oldlen;
-             RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
-        } else
-             RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
-        RExC_utf8 = 1;
+
+        if (RExC_utf8)
+             SvUTF8_on(sv);
+        if (sv_utf8_downgrade(sv, TRUE)) {
+             char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
+             STRLEN newlen = SvCUR(sv);
+        
+             if (!SIZE_ONLY) {
+                  DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+                                        (int)oldlen, STRING(ret),
+                                        (int)newlen, s));
+                  Copy(s, STRING(ret), newlen, char);
+                  STR_LEN(ret) += newlen - oldlen;
+                  RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+             } else
+                  RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+        }
     }
 
     return(ret);
@@ -3353,8 +3450,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
 STATIC void
 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 {
-    if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
-       POSIXCC(UCHARAT(RExC_parse))) {
+    if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
        char *s = RExC_parse;
        char  c = *s++;
 
@@ -3390,7 +3486,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
     SV *listsv = Nullsv;
     register char *e;
     UV n;
-    bool optimize_invert = TRUE;
+    bool optimize_invert   = TRUE;
+    AV* unicode_alternate  = 0;
 
     ret = reganode(pRExC_state, ANYOF, 0);
 
@@ -3418,11 +3515,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 
     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
 
-    if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
+    if (!SIZE_ONLY && POSIXCC(nextvalue))
        checkposixcc(pRExC_state);
 
-    if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-')
-       goto charclassloop;             /* allow 1st char to be ] or - */
+    /* allow 1st char to be ] (allowing it to be - is dealt with later) */
+    if (UCHARAT(RExC_parse) == ']')
+       goto charclassloop;
 
     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
 
@@ -3967,17 +4065,76 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                }
                else
 #endif
-                   for (i = prevvalue; i <= ceilvalue; i++)
-                       ANYOF_BITMAP_SET(ret, i);
+                     for (i = prevvalue; i <= ceilvalue; i++)
+                         ANYOF_BITMAP_SET(ret, i);
          }
-         if (value > 255) {
+         if (value > 255 || UTF) {
+               UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
+               UV natvalue      = NATIVE_TO_UNI(value);
+
                ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
-               if (prevvalue < value)
+               if (prevnatvalue < natvalue) { /* what about > ? */
                    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
-                                  (UV)prevvalue, (UV)value);
-               else if (prevvalue == value)
-                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
-                                  (UV)value);
+                                  prevnatvalue, natvalue);
+               }
+               else if (prevnatvalue == natvalue) {
+                   Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
+                   if (FOLD) {
+                        U8 tmpbuf [UTF8_MAXLEN+1];
+                        U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+                        STRLEN foldlen;
+                        UV f;
+
+                        uvchr_to_utf8(tmpbuf, natvalue);
+                        to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+                        f = UNI_TO_NATIVE(utf8_to_uvchr(foldbuf, 0));
+
+                        /* If folding and foldable and a single
+                         * character, insert also the folded version
+                         * to the charclass. */
+                        if (f != value) {
+                             if (foldlen == UNISKIP(f))
+                                 Perl_sv_catpvf(aTHX_ listsv,
+                                                "%04"UVxf"\n", f);
+                             else {
+                                 /* Any multicharacter foldings
+                                  * require the following transform:
+                                  * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
+                                  * where E folds into "pq" and F folds
+                                  * into "rst", all other characters
+                                  * fold to single characters.  We save
+                                  * away these multicharacter foldings,
+                                  * to be later saved as part of the
+                                  * additional "s" data. */
+                                 SV *sv;
+
+                                 if (!unicode_alternate)
+                                     unicode_alternate = newAV();
+                                 sv = newSVpvn((char*)foldbuf, foldlen);
+                                 SvUTF8_on(sv);
+                                 av_push(unicode_alternate, sv);
+                             }
+                        }
+
+                        /* If folding and the value is one of the Greek
+                         * sigmas insert a few more sigmas to make the
+                         * folding rules of the sigmas to work right.
+                         * Note that not all the possible combinations
+                         * are handled here: some of them are handled
+                         * by the standard folding rules, and some of
+                         * them (literal or EXACTF cases) are handled
+                         * during runtime in regexec.c:S_find_byclass(). */
+                        if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+                        }
+                        else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
+                             Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+                                            (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+                   }
+               }
            }
         }
 
@@ -4021,8 +4178,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
        AV *av = newAV();
        SV *rv;
 
+       /* The 0th element stores the character class description
+        * in its textual form: used later (regexec.c:Perl_regclass_swatch())
+        * to initialize the appropriate swash (which gets stored in
+        * the 1st element), and also useful for dumping the regnode.
+        * The 2nd element stores the multicharacter foldings,
+        * used later (regexec.c:s_reginclasslen()). */
        av_store(av, 0, listsv);
        av_store(av, 1, NULL);
+       av_store(av, 2, (SV*)unicode_alternate);
        rv = newRV_noinc((SV*)av);
        n = add_data(pRExC_state, 1, "s");
        RExC_rx->data->data[n] = (void*)rv;
@@ -4456,9 +4620,13 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 
     if (k == EXACT) {
         SV *dsv = sv_2mortal(newSVpvn("", 0));
-       bool do_utf8 = DO_UTF8(sv);
+       /* Using is_utf8_string() is a crude hack but it may
+        * be the best for now since we have no flag "this EXACTish
+        * node was UTF-8" --jhi */
+       bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
        char *s    = do_utf8 ?
-         pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) :
+         pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
+                        UNI_DISPLAY_REGEX) :
          STRING(o);
        int len = do_utf8 ?
          strlen(s) :
@@ -4552,7 +4720,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 
        {
            SV *lv;
-           SV *sw = regclass_swash(o, FALSE, &lv);
+           SV *sw = regclass_swash(o, FALSE, &lv, 0);
        
            if (lv) {
                if (sw) {
@@ -4641,16 +4809,25 @@ Perl_re_intuit_string(pTHX_ regexp *prog)
 void
 Perl_pregfree(pTHX_ struct regexp *r)
 {
-    DEBUG_r(if (!PL_colorset) reginitcolors());
+#ifdef DEBUGGING
+    SV *dsv = PERL_DEBUG_PAD_ZERO(0);
+#endif
 
     if (!r || (--r->refcnt > 0))
        return;
-    DEBUG_r(PerlIO_printf(Perl_debug_log,
-                     "%sFreeing REx:%s `%s%.60s%s%s'\n",
-                     PL_colors[4],PL_colors[5],PL_colors[0],
-                     r->precomp,
-                     PL_colors[1],
-                     (strlen(r->precomp) > 60 ? "..." : "")));
+    DEBUG_r({
+         char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
+                                 UNI_DISPLAY_REGEX);
+        int len = SvCUR(dsv);
+        if (!PL_colorset)
+             reginitcolors();
+        PerlIO_printf(Perl_debug_log,
+                      "%sFreeing REx:%s `%s%*.*s%s%s'\n",
+                      PL_colors[4],PL_colors[5],PL_colors[0],
+                      len, len, s,
+                      PL_colors[1],
+                      len > 60 ? "..." : "");
+    });
 
     if (r->precomp)
        Safefree(r->precomp);
@@ -4706,7 +4883,7 @@ Perl_pregfree(pTHX_ struct regexp *r)
                new_comppad = NULL;
                break;
            case 'n':
-               break;
+               break;
            default:
                Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
            }