integrate mainline to get tr.t
[p5sagit/p5-mst-13.2.git] / regcomp.c
index 9ab0d30..ca0fd09 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -38,6 +38,7 @@
 /* *These* symbols are masked to allow static link. */
 #  define Perl_pregfree my_regfree
 #  define Perl_regnext my_regnext
+#  define Perl_save_re_context my_save_re_context
 #endif 
 
 /*SUPPRESS 112*/
@@ -251,6 +252,7 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
     regnode *scan = *scanp, *next;
     I32 delta = 0;
     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
+    int is_inf_internal = 0;           /* The studied chunk is infinite */
     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
     scan_data_t data_fake;
     
@@ -365,7 +367,7 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
                    if (max1 < minnext + deltanext)
                        max1 = minnext + deltanext;
                    if (deltanext == I32_MAX)
-                       is_inf = 1;
+                       is_inf = is_inf_internal = 1;
                    scan = next;
                    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
                        pars++;
@@ -460,7 +462,7 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
                min++;
                /* Fall through. */
            case STAR:
-               is_inf = 1; 
+               is_inf = is_inf_internal = 1; 
                scan = regnext(scan);
                if (flags & SCF_DO_SUBSTR) {
                    scan_commit(data);
@@ -489,13 +491,15 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
                                        ? (flags & ~SCF_DO_SUBSTR) : flags);
                if (!scan)              /* It was not CURLYX, but CURLY. */
                    scan = next;
-               if (PL_dowarn && (minnext + deltanext == 0) 
+               if (ckWARN(WARN_UNSAFE) && (minnext + deltanext == 0) 
                    && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
-                   && maxcount <= 10000) /* Complement check for big count */
-                   warn("Strange *+?{} on zero-length expression");
+                   && maxcount <= REG_INFTY/3) /* Complement check for big count */
+                   warner(WARN_UNSAFE, "Strange *+?{} on zero-length expression");
                min += minnext * mincount;
-               is_inf |= (maxcount == REG_INFTY && (minnext + deltanext) > 0
-                          || deltanext == I32_MAX);
+               is_inf_internal |= (maxcount == REG_INFTY 
+                                   && (minnext + deltanext) > 0
+                                  || deltanext == I32_MAX);
+               is_inf |= is_inf_internal;
                delta += (minnext + deltanext) * maxcount - minnext * mincount;
 
                /* Try powerful optimization CURLYX => CURLYN. */
@@ -636,6 +640,7 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
                        }
                        data->longest = &(data->longest_float);
                    }
+                   SvREFCNT_dec(last_str);
                }
                if (data && (fl & SF_HAS_EVAL))
                    data->flags |= SF_HAS_EVAL;
@@ -651,7 +656,7 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
                    scan_commit(data);
                    data->longest = &(data->longest_float);
                }
-               is_inf = 1;
+               is_inf = is_inf_internal = 1;
                break;
            }
        }
@@ -704,13 +709,20 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
                if (data)
                    data->flags |= SF_HAS_EVAL;
        }
+       else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded */
+               if (flags & SCF_DO_SUBSTR) {
+                   scan_commit(data);
+                   data->longest = &(data->longest_float);
+               }
+               is_inf = is_inf_internal = 1;
+       }
        /* Else: zero-length, ignore. */
        scan = regnext(scan);
     }
 
   finish:
     *scanp = scan;
-    *deltap = is_inf ? I32_MAX : delta;
+    *deltap = is_inf_internal ? I32_MAX : delta;
     if (flags & SCF_DO_SUBSTR && is_inf) 
        data->pos_delta = I32_MAX - data->pos_min;
     if (is_par > U8_MAX)
@@ -780,14 +792,38 @@ pregcomp(char *exp, char *xend, PMOP *pm)
     if (exp == NULL)
        FAIL("NULL regexp argument");
 
-    if (PL_curcop == &compiling ? (PL_hints & HINT_UTF8) : IN_UTF8)
+    if (PL_curcop == &PL_compiling ? (PL_hints & HINT_UTF8) : IN_UTF8)
        PL_reg_flags |= RF_utf8;
     else
        PL_reg_flags = 0;
 
     PL_regprecomp = savepvn(exp, xend - exp);
-    DEBUG_r(PerlIO_printf(Perl_debug_log, "compiling RE `%*s'\n",
-                         xend - exp, PL_regprecomp));
+    DEBUG_r(
+       if (!PL_colorset) {
+           int i = 0;
+           char *s = PerlEnv_getenv("PERL_RE_COLORS");
+           
+           if (s) {
+               PL_colors[0] = s = savepv(s);
+               while (++i < 6) {
+                   s = strchr(s, '\t');
+                   if (s) {
+                       *s = '\0';
+                       PL_colors[i] = ++s;
+                   }
+                   else
+                       PL_colors[i] = "";
+               }
+           } else {
+               while (i < 6) 
+                   PL_colors[i++] = "";
+           }
+           PL_colorset = 1;
+       }
+       );
+    DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling%s RE `%s%*s%s'\n",
+                         PL_colors[4],PL_colors[5],PL_colors[0],
+                         xend - exp, PL_regprecomp, PL_colors[1]));
     PL_regflags = pm->op_pmflags;
     PL_regsawback = 0;
 
@@ -811,32 +847,6 @@ pregcomp(char *exp, char *xend, PMOP *pm)
     }
     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %d ", PL_regsize));
 
-    DEBUG_r(
-       if (!PL_colorset) {
-           int i = 0;
-           char *s = PerlEnv_getenv("TERMCAP_COLORS");
-           
-           PL_colorset = 1;
-           if (s) {
-               PL_colors[0] = s = savepv(s);
-               while (++i < 4) {
-                   s = strchr(s, '\t');
-                   if (!s) 
-                       FAIL("Not enough TABs in TERMCAP_COLORS");
-                   *s = '\0';
-                   PL_colors[i] = ++s;
-               }
-           }
-           else {
-               while (i < 4) 
-                   PL_colors[i++] = "";
-           }
-           /* Reset colors: */
-           PerlIO_printf(Perl_debug_log, "%s%s%s%s", 
-                         PL_colors[0],PL_colors[1],PL_colors[2],PL_colors[3]);
-       }
-       );
-
     /* Small enough for pointer-storage convention?
        If extralen==0, this means that we will not need long jumps. */
     if (PL_regsize >= 0x10000L && PL_extralen)
@@ -971,9 +981,10 @@ pregcomp(char *exp, char *xend, PMOP *pm)
                && (!(data.flags & SF_FL_BEFORE_MEOL)
                    || (PL_regflags & PMf_MULTILINE)))) {
            if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
-               && data.offset_fixed == data.offset_float_min)
-               goto remove;            /* Like in (a)+. */
-           
+               && data.offset_fixed == data.offset_float_min
+               && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
+                   goto remove_float;          /* As in (a)+. */
+
            r->float_substr = data.longest_float;
            r->float_min_offset = data.offset_float_min;
            r->float_max_offset = data.offset_float_max;
@@ -985,7 +996,7 @@ pregcomp(char *exp, char *xend, PMOP *pm)
                SvTAIL_on(r->float_substr);
        }
        else {
-         remove:
+         remove_float:
            r->float_substr = Nullsv;
            SvREFCNT_dec(data.longest_float);
            longest_float_length = 0;
@@ -1076,6 +1087,7 @@ reg(I32 paren, I32 *flagp)
        if (*PL_regcomp_parse == '?') {
            U16 posflags = 0, negflags = 0;
            U16 *flagsp = &posflags;
+           int logical = 0;
 
            PL_regcomp_parse++;
            paren = *PL_regcomp_parse++;
@@ -1106,6 +1118,10 @@ reg(I32 paren, I32 *flagp)
                nextchar();
                *flagp = TRYAGAIN;
                return NULL;
+           case 'p':
+               logical = 1;
+               paren = *PL_regcomp_parse++;
+               /* FALL THROUGH */
            case '{':
            {
                dTHR;
@@ -1144,8 +1160,9 @@ reg(I32 paren, I32 *flagp)
                    PL_regcomp_rx->data->data[n+2] = (void*)sop;
                    SvREFCNT_dec(sv);
                }
-               else {          /* First pass */
-                   if (PL_reginterp_cnt < ++PL_seen_evals && PL_curcop != &compiling)
+               else {                                          /* First pass */
+                   if (PL_reginterp_cnt < ++PL_seen_evals
+                       && PL_curcop != &PL_compiling)
                        /* No compiled RE interpolated, has runtime
                           components ===> unsafe.  */
                        FAIL("Eval-group not allowed at runtime, use re 'eval'");
@@ -1154,6 +1171,13 @@ reg(I32 paren, I32 *flagp)
                }
                
                nextchar();
+               if (logical) {
+                   ret = reg_node(LOGICAL);
+                   if (!SIZE_ONLY)
+                       ret->flags = 2;
+                   regtail(ret, reganode(EVAL, n));
+                   return ret;
+               }
                return reganode(EVAL, n);
            }
            case '(':
@@ -1165,6 +1189,8 @@ reg(I32 paren, I32 *flagp)
                        I32 flag;
                        
                        ret = reg_node(LOGICAL);
+                       if (!SIZE_ONLY)
+                           ret->flags = 1;
                        regtail(ret, reg(1, &flag));
                        goto insert_if;
                    } 
@@ -1185,10 +1211,14 @@ reg(I32 paren, I32 *flagp)
                    else
                        regtail(br, reganode(LONGJMP, 0));
                    c = *nextchar();
+                   if (flags&HASWIDTH)
+                       *flagp |= HASWIDTH;
                    if (c == '|') {
                        lastbr = reganode(IFTHEN, 0); /* Fake one for optimizer. */
                        regbranch(&flags, 1);
                        regtail(ret, lastbr);
+                       if (flags&HASWIDTH)
+                           *flagp |= HASWIDTH;
                        c = *nextchar();
                    }
                    else
@@ -1557,8 +1587,8 @@ regpiece(I32 *flagp)
        goto do_curly;
     }
   nest_check:
-    if (PL_dowarn && !SIZE_ONLY && !(flags&HASWIDTH) && max > 10000) {
-       warn("%.*s matches null string many times",
+    if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
+       warner(WARN_UNSAFE, "%.*s matches null string many times",
            PL_regcomp_parse - origparse, origparse);
     }
 
@@ -1636,6 +1666,9 @@ tryagain:
     case '[':
        PL_regcomp_parse++;
        ret = (UTF ? regclassutf8() : regclass());
+       if (*PL_regcomp_parse != ']')
+           FAIL("unmatched [] in regexp");
+       nextchar();
        *flagp |= HASWIDTH|SIMPLE;
        break;
     case '(':
@@ -1703,7 +1736,7 @@ tryagain:
            *flagp |= HASWIDTH;
            nextchar();
            if (UTF && !PL_utf8_mark)
-               is_utf8_mark("~");      /* preload table */
+               is_utf8_mark((U8*)"~");         /* preload table */
            break;
        case 'w':
            ret = reg_node(
@@ -1713,7 +1746,7 @@ tryagain:
            *flagp |= HASWIDTH|SIMPLE;
            nextchar();
            if (UTF && !PL_utf8_alnum)
-               is_utf8_alnum("a");     /* preload table */
+               is_utf8_alnum((U8*)"a");        /* preload table */
            break;
        case 'W':
            ret = reg_node(
@@ -1723,7 +1756,7 @@ tryagain:
            *flagp |= HASWIDTH|SIMPLE;
            nextchar();
            if (UTF && !PL_utf8_alnum)
-               is_utf8_alnum("a");     /* preload table */
+               is_utf8_alnum((U8*)"a");        /* preload table */
            break;
        case 'b':
            PL_seen_zerolen++;
@@ -1734,7 +1767,7 @@ tryagain:
            *flagp |= SIMPLE;
            nextchar();
            if (UTF && !PL_utf8_alnum)
-               is_utf8_alnum("a");     /* preload table */
+               is_utf8_alnum((U8*)"a");        /* preload table */
            break;
        case 'B':
            PL_seen_zerolen++;
@@ -1745,7 +1778,7 @@ tryagain:
            *flagp |= SIMPLE;
            nextchar();
            if (UTF && !PL_utf8_alnum)
-               is_utf8_alnum("a");     /* preload table */
+               is_utf8_alnum((U8*)"a");        /* preload table */
            break;
        case 's':
            ret = reg_node(
@@ -1755,7 +1788,7 @@ tryagain:
            *flagp |= HASWIDTH|SIMPLE;
            nextchar();
            if (UTF && !PL_utf8_space)
-               is_utf8_space(" ");     /* preload table */
+               is_utf8_space((U8*)" ");        /* preload table */
            break;
        case 'S':
            ret = reg_node(
@@ -1765,21 +1798,44 @@ tryagain:
            *flagp |= HASWIDTH|SIMPLE;
            nextchar();
            if (UTF && !PL_utf8_space)
-               is_utf8_space(" ");     /* preload table */
+               is_utf8_space((U8*)" ");        /* preload table */
            break;
        case 'd':
            ret = reg_node(UTF ? DIGITUTF8 : DIGIT);
            *flagp |= HASWIDTH|SIMPLE;
            nextchar();
            if (UTF && !PL_utf8_digit)
-               is_utf8_digit("1");     /* preload table */
+               is_utf8_digit((U8*)"1");        /* preload table */
            break;
        case 'D':
            ret = reg_node(UTF ? NDIGITUTF8 : NDIGIT);
            *flagp |= HASWIDTH|SIMPLE;
            nextchar();
            if (UTF && !PL_utf8_digit)
-               is_utf8_digit("1");     /* preload table */
+               is_utf8_digit((U8*)"1");        /* preload table */
+           break;
+       case 'p':
+       case 'P':
+           {   /* a lovely hack--pretend we saw [\pX] instead */
+               char* oldregxend = PL_regxend;
+
+               if (PL_regcomp_parse[1] == '{') {
+                   PL_regxend = strchr(PL_regcomp_parse, '}');
+                   if (!PL_regxend)
+                       FAIL("Missing right brace on \\p{}");
+                   PL_regxend++;
+               }
+               else
+                   PL_regxend = PL_regcomp_parse + 2;
+               PL_regcomp_parse--;
+
+               ret = regclassutf8();
+
+               PL_regxend = oldregxend;
+               PL_regcomp_parse--;
+               nextchar();
+               *flagp |= HASWIDTH|SIMPLE;
+           }
            break;
        case 'n':
        case 'r':
@@ -1876,6 +1932,8 @@ tryagain:
                    case 'S':
                    case 'd':
                    case 'D':
+                   case 'p':
+                   case 'P':
                        --p;
                        goto loopdone;
                    case 'n':
@@ -1952,7 +2010,7 @@ tryagain:
                default:
                  normal_default:
                    if ((*p & 0xc0) == 0xc0 && UTF) {
-                       ender = utf8_to_uv(p, &numlen);
+                       ender = utf8_to_uv((U8*)p, &numlen);
                        p += numlen;
                    }
                    else
@@ -2086,8 +2144,9 @@ regclass(void)
                     * (POSIX Extended Character Classes, that is)
                     * The text between e.g. [: and :] would start
                     * at posixccs + 1 and stop at regcomp_parse - 2. */
-                   if (dowarn && !SIZE_ONLY)
-                       warn("Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc);
+                   if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY)
+                       warner(WARN_UNSAFE,
+                           "Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc);
                    PL_regcomp_parse++; /* skip over the ending ] */
                }
            }
@@ -2210,14 +2269,28 @@ regclass(void)
            }
        }
        if (!SIZE_ONLY) {
-           for ( ; lastvalue <= value; lastvalue++)
-               ANYOF_SET(opnd, lastvalue);
-       }
+#ifndef ASCIIish
+           if ((isLOWER(lastvalue) && isLOWER(value)) ||
+               (isUPPER(lastvalue) && isUPPER(value)))
+           {
+               I32 i;
+               if (isLOWER(lastvalue)) {
+                   for (i = lastvalue; i <= value; i++)
+                       if (isLOWER(i))
+                           ANYOF_SET(opnd, i);
+               } else {
+                   for (i = lastvalue; i <= value; i++)
+                       if (isUPPER(i))
+                           ANYOF_SET(opnd, i);
+               }
+           }
+           else
+#endif
+               for ( ; lastvalue <= value; lastvalue++)
+                   ANYOF_SET(opnd, lastvalue);
+        }
        lastvalue = value;
     }
-    if (*PL_regcomp_parse != ']')
-       FAIL("unmatched [] in regexp");
-    nextchar();
     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
     if (!SIZE_ONLY && (*opnd & (0xFF ^ ANYOF_INVERT)) == ANYOF_FOLD) {
        for (value = 0; value < 256; ++value) {
@@ -2270,7 +2343,7 @@ regclassutf8(void)
 
     while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
        skipcond:
-       value = utf8_to_uv(PL_regcomp_parse, &numlen);
+       value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
        PL_regcomp_parse += numlen;
 
        if (value == '[' && PL_regcomp_parse + 1 < PL_regxend &&
@@ -2291,15 +2364,16 @@ regclassutf8(void)
                     * (POSIX Extended Character Classes, that is)
                     * The text between e.g. [: and :] would start
                     * at posixccs + 1 and stop at regcomp_parse - 2. */
-                   if (dowarn && !SIZE_ONLY)
-                       warn("Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc);
+                   if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY)
+                       warner(WARN_UNSAFE,
+                           "Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc);
                    PL_regcomp_parse++; /* skip over the ending ] */
                }
            }
        }
 
        if (value == '\\') {
-           value = utf8_to_uv(PL_regcomp_parse, &numlen);
+           value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
            PL_regcomp_parse += numlen;
            switch (value) {
            case 'w':
@@ -2329,7 +2403,7 @@ regclassutf8(void)
                        flags |= ANYOF_SPACEL;
                    sv_catpvf(listsv, "+utf8::IsSpace\n");
                    if (!PL_utf8_space)
-                       is_utf8_space(" ");
+                       is_utf8_space((U8*)" ");
                }
                lastvalue = 123456;
                continue;
@@ -2340,7 +2414,7 @@ regclassutf8(void)
                    sv_catpvf(listsv,
                        "!utf8::IsSpace\n");
                    if (!PL_utf8_space)
-                       is_utf8_space(" ");
+                       is_utf8_space((U8*)" ");
                }
                lastvalue = 123456;
                continue;
@@ -2444,9 +2518,6 @@ regclassutf8(void)
                sv_catpvf(listsv, "%04x\n", value);
        }
     }
-    if (*PL_regcomp_parse != ']')
-       FAIL("unmatched [] in regexp");
-    nextchar();
 
     ret = reganode(ANYOFUTF8, 0);
 
@@ -2550,11 +2621,11 @@ reguni(UV uv, char* s, I32* lenp)
 {
     dTHR;
     if (SIZE_ONLY) {
-       char tmpbuf[10];
+       U8 tmpbuf[10];
        *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf;
     }
     else
-       *lenp = uv_to_utf8(s, uv) - s;
+       *lenp = uv_to_utf8((U8*)s, uv) - (U8*)s;
 
 }
 
@@ -2996,7 +3067,7 @@ regprop(SV *sv, regnode *o)
        sv_catpvf(sv, "GROUPP%d", ARG(o));
        break;
     case LOGICAL:
-       p = "LOGICAL";
+       sv_catpvf(sv, "LOGICAL[%d]", o->flags);
        break;
     case SUSPEND:
        p = "SUSPEND";