"This is going to be slow." This change switches OpenBSD locale
[p5sagit/p5-mst-13.2.git] / regcomp.c
index 3cc1295..15f1feb 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -113,7 +113,8 @@ typedef struct RExC_state_t {
     I32                sawback;                /* Did we see \1, ...? */
     U32                seen;
     I32                size;                   /* Code size. */
-    I32                npar;                   /* () count. */
+    I32                npar;                   /* Capture buffer count, (OPEN). */
+    I32                cpar;                   /* Capture buffer count, (CLOSE). */
     I32                nestroot;               /* root parens we are in - used by accept */
     I32                extralen;
     I32                seen_zerolen;
@@ -153,6 +154,7 @@ typedef struct RExC_state_t {
 #define RExC_seen      (pRExC_state->seen)
 #define RExC_size      (pRExC_state->size)
 #define RExC_npar      (pRExC_state->npar)
+#define RExC_cpar      (pRExC_state->cpar)
 #define RExC_nestroot   (pRExC_state->nestroot)
 #define RExC_extralen  (pRExC_state->extralen)
 #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
@@ -549,7 +551,7 @@ static const scan_data_t zero_scan_data =
 #define DEBUG_STUDYDATA(data,depth)                                  \
 DEBUG_OPTIMISE_MORE_r(if(data){                                           \
     PerlIO_printf(Perl_debug_log,                                    \
-        "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf           \
+        "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf           \
         " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ",           \
         (int)(depth)*2, "",                                          \
         (IV)((data)->pos_min),                                       \
@@ -1892,21 +1894,30 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                 }
                 if ( count == 1 ) {
                     SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
-                    const char *ch = SvPV_nolen_const( *tmp );
-                    DEBUG_OPTIMISE_r(
+                    char *ch = SvPV_nolen( *tmp );
+                    DEBUG_OPTIMISE_r({
+                        SV *sv=sv_newmortal();
                         PerlIO_printf( Perl_debug_log,
                            "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
                             (int)depth * 2 + 2, "",
-                            (UV)state, (UV)idx, ch)
-                    );
+                            (UV)state, (UV)idx, 
+                            pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 
+                               PL_colors[0], PL_colors[1],
+                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
+                               PERL_PV_ESCAPE_FIRSTCHAR 
+                            )
+                        );
+                    });
                     if ( state==1 ) {
                         OP( convert ) = nodetype;
                         str=STRING(convert);
                         STR_LEN(convert)=0;
                     }
-                    *str++=*ch;
-                    STR_LEN(convert)++;
-
+                    while (*ch) {
+                        *str++ = *ch++;
+                        STR_LEN(convert)++;
+                    }
+                    
                } else {
 #ifdef DEBUGGING           
                    if (state>1)
@@ -1923,11 +1934,21 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                 trie->maxlen -= (state - 1);
                 DEBUG_r({
                     regnode *fix = convert;
+                    U32 word = trie->wordcount;
                     mjd_nodelen++;
                     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
                     while( ++fix < n ) {
                         Set_Node_Offset_Length(fix, 0, 0);
                     }
+                    while (word--) {
+                        SV ** const tmp = av_fetch( trie->words, word, 0 );
+                        if (tmp) {
+                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
+                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
+                            else
+                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
+                        }
+                    }    
                 });
                 if (trie->maxlen) {
                     convert = n;
@@ -2117,14 +2138,14 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
 #endif
 
 #define DEBUG_PEEP(str,scan,depth) \
-    DEBUG_OPTIMISE_r({ \
+    DEBUG_OPTIMISE_r({if (scan){ \
        SV * const mysv=sv_newmortal(); \
        regnode *Next = regnext(scan); \
        regprop(RExC_rx, mysv, scan); \
        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
        Next ? (REG_NODE_NUM(Next)) : 0 ); \
-   });
+   }});
 
 
 
@@ -2287,6 +2308,17 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
     Newx(and_withp,1,struct regnode_charclass_class); \
     SAVEFREEPV(and_withp)
 
+/* this is a chain of data about sub patterns we are processing that
+   need to be handled seperately/specially in study_chunk. Its so
+   we can simulate recursion without losing state.  */
+struct scan_frame;
+typedef struct scan_frame {
+    regnode *last;  /* last node to process in this frame */
+    regnode *next;  /* next node to process when last is reached */
+    struct scan_frame *prev; /*previous frame*/
+    I32 stop; /* what stopparen do we use */
+} scan_frame;
+
 STATIC I32
 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         I32 *minlenp, I32 *deltap,
@@ -2315,7 +2347,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
     SV *re_trie_maxbuff = NULL;
     regnode *first_non_open = scan;
     I32 stopmin = I32_MAX;
+    scan_frame *frame = NULL;
+
     GET_RE_DEBUG_FLAGS_DECL;
+
 #ifdef DEBUGGING
     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
 #endif
@@ -2326,7 +2361,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
     }
 
 
-    while (scan && OP(scan) != END && scan < last) {
+  fake_study_recurse:
+    while ( scan && OP(scan) != END && scan < last ){
        /* Peephole optimizer: */
        DEBUG_STUDYDATA(data,depth);
        DEBUG_PEEP("Peep",scan,depth);
@@ -2360,12 +2396,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
        /* The principal pseudo-switch.  Cannot be a switch, since we
           look into several different things.  */
        if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
-                  || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
+                  || OP(scan) == IFTHEN) {
            next = regnext(scan);
            code = OP(scan);
            /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
        
-           if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
+           if (OP(next) == code || code == IFTHEN) {
                /* NOTE - There is similar code to this block below for handling
                   TRIE nodes on a re-study.  If you change stuff here check there
                   too. */
@@ -2373,7 +2409,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                struct regnode_charclass_class accum;
                regnode * const startbranch=scan;
                
-               if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
+               if (flags & SCF_DO_SUBSTR)
                    scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
                if (flags & SCF_DO_STCLASS)
                    cl_init_zero(pRExC_state, &accum);
@@ -2429,8 +2465,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    }
                    if (flags & SCF_DO_STCLASS)
                        cl_or(pRExC_state, &accum, &this_class);
-                   if (code == SUSPEND)
-                       break;
                }
                if (code == IFTHEN && num < 2) /* Empty ELSE branch */
                    min1 = 0;
@@ -2669,6 +2703,63 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            } else                      /* single branch is optimized. */
                scan = NEXTOPER(scan);
            continue;
+       } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
+           scan_frame *newframe = NULL;
+           I32 paren;
+           regnode *start;
+           regnode *end;
+
+           if (OP(scan) != SUSPEND) {
+           /* set the pointer */
+               if (OP(scan) == GOSUB) {
+                   paren = ARG(scan);
+                   RExC_recurse[ARG2L(scan)] = scan;
+                    start = RExC_open_parens[paren-1];
+                    end   = RExC_close_parens[paren-1];
+                } else {
+                    paren = 0;
+                    start = RExC_rx->program + 1;
+                    end   = RExC_opend;
+                }
+                if (!recursed) {
+                    Newxz(recursed, (((RExC_npar)>>3) +1), U8);
+                    SAVEFREEPV(recursed);
+                }
+                if (!PAREN_TEST(recursed,paren+1)) {
+                   PAREN_SET(recursed,paren+1);
+                    Newx(newframe,1,scan_frame);
+                } else {
+                    if (flags & SCF_DO_SUBSTR) {
+                        scan_commit(pRExC_state,data,minlenp);
+                        data->longest = &(data->longest_float);
+                    }
+                    is_inf = is_inf_internal = 1;
+                    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
+                        cl_anything(pRExC_state, data->start_class);
+                    flags &= ~SCF_DO_STCLASS;
+               }
+            } else {
+               Newx(newframe,1,scan_frame);
+               paren = stopparen;
+               start = scan+2;
+               end = regnext(scan);
+           }
+           if (newframe) {
+                assert(start);
+                assert(end);
+               SAVEFREEPV(newframe);
+               newframe->next = regnext(scan);
+               newframe->last = last;
+               newframe->stop = stopparen;
+               newframe->prev = frame;
+
+               frame = newframe;
+               scan =  start;
+               stopparen = paren;
+               last = end;
+
+               continue;
+           }
        }
        else if (OP(scan) == EXACT) {
            I32 l = STR_LEN(scan);
@@ -3538,61 +3629,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            if (data)
                *(data->last_closep) = ARG(scan);
        }
-       else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
-           /* set the pointer */
-           I32 paren;
-           regnode *start;
-           regnode *end;
-           if (OP(scan) == GOSUB) {
-               paren = ARG(scan);
-               RExC_recurse[ARG2L(scan)] = scan;
-                start = RExC_open_parens[paren-1];
-                end   = RExC_close_parens[paren-1];
-            } else {
-                paren = 0;
-                start = RExC_rx->program + 1;
-                end   = RExC_opend;
-            }
-            assert(start);
-            assert(end);
-           if (!recursed) {
-               Newxz(recursed, (((RExC_npar)>>3) +1), U8);
-               SAVEFREEPV(recursed);
-           }
-           if (!PAREN_TEST(recursed,paren+1)) {
-               I32 deltanext = 0;
-               PAREN_SET(recursed,paren+1);
-
-               DEBUG_PEEP("goto",start,depth);
-               min += study_chunk(
-                       pRExC_state,
-                       &start,
-                       minlenp,
-                       &deltanext,
-                       end+1,
-                       data,
-                       paren,
-                       recursed,
-                       and_withp,
-                       flags,depth+1);
-               delta+=deltanext;
-               if (deltanext == I32_MAX) {
-                   is_inf = is_inf_internal = 1;
-                   delta=deltanext;
-               }
-               DEBUG_PEEP("rtrn",end,depth);
-               PAREN_UNSET(recursed,paren+1);
-           } else {
-               if (flags & SCF_DO_SUBSTR) {
-                   scan_commit(pRExC_state,data,minlenp);
-                   data->longest = &(data->longest_float);
-               }
-               is_inf = is_inf_internal = 1;
-               if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
-                   cl_anything(pRExC_state, data->start_class);
-               flags &= ~SCF_DO_STCLASS;
-           }
-       }
        else if (OP(scan) == EVAL) {
                if (data)
                    data->flags |= SF_HAS_EVAL;
@@ -3763,8 +3799,17 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
        /* Else: zero-length, ignore. */
        scan = regnext(scan);
     }
+    if (frame) {
+        last = frame->last;
+        scan = frame->next;
+        stopparen = frame->stop;
+        frame = frame->prev;
+        goto fake_study_recurse;
+    }
 
   finish:
+    assert(!frame);
+
     *scanp = scan;
     *deltap = is_inf_internal ? I32_MAX : delta;
     if (flags & SCF_DO_SUBSTR && is_inf)
@@ -3943,6 +3988,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
+    RExC_cpar = 1;
     RExC_nestroot = 0;
     RExC_size = 0L;
     RExC_emit = &PL_regdummy;
@@ -4013,6 +4059,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     r->substrs = 0;                    /* Useful during FAIL. */
     r->startp = 0;                     /* Useful during FAIL. */
     r->endp = 0;                       
+    r->swap = NULL; 
     r->paren_names = 0;
     
     if (RExC_seen & REG_SEEN_RECURSE) {
@@ -4040,6 +4087,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
+    RExC_cpar = 1;
     RExC_emit_start = r->program;
     RExC_emit = r->program;
 #ifdef DEBUGGING
@@ -4184,7 +4232,8 @@ reStudy:
            first = NEXTOPER(first);
            goto again;
        }
-       else if (!sawopen && (OP(first) == STAR &&
+       else if ((!sawopen || !RExC_sawback) &&
+           (OP(first) == STAR &&
            PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
            !(r->reganch & ROPT_ANCH) )
        {
@@ -4482,8 +4531,8 @@ reStudy:
     }
     Newxz(r->startp, RExC_npar, I32);
     Newxz(r->endp, RExC_npar, I32);
-    
-    DEBUG_r( RX_DEBUG_on(r) );
+    /* assume we don't need to swap parens around before we match */
+
     DEBUG_DUMP_r({
         PerlIO_printf(Perl_debug_log,"Final program:\n");
         regdump(r);
@@ -4915,17 +4964,54 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 }
                 goto gen_recurse_regop;
                 /* NOT REACHED */
+            case '+':
+                if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+                    RExC_parse++;
+                    vFAIL("Illegal pattern");
+                }
+                goto parse_recursion;
+                /* NOT REACHED*/
+            case '-': /* (?-1) */
+                if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+                    RExC_parse--; /* rewind to let it be handled later */
+                    goto parse_flags;
+                } 
+                /*FALLTHROUGH */
             case '1': case '2': case '3': case '4': /* (?1) */
            case '5': case '6': case '7': case '8': case '9':
                RExC_parse--;
+              parse_recursion:
                num = atoi(RExC_parse);
                parse_start = RExC_parse - 1; /* MJD */
+               if (*RExC_parse == '-')
+                   RExC_parse++;
                while (isDIGIT(*RExC_parse))
                        RExC_parse++;
                if (*RExC_parse!=')') 
                    vFAIL("Expecting close bracket");
                        
               gen_recurse_regop:
+                if ( paren == '-' ) {
+                    /*
+                    Diagram of capture buffer numbering.
+                    Top line is the normal capture buffer numbers
+                    Botton line is the negative indexing as from
+                    the X (the (?-2))
+
+                    +   1 2    3 4 5 X          6 7
+                       /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
+                    -   5 4    3 2 1 X          x x
+
+                    */
+                    num = RExC_npar + num;
+                    if (num < 1)  {
+                        RExC_parse++;
+                        vFAIL("Reference to nonexistent group");
+                    }
+                } else if ( paren == '+' ) {
+                    num = RExC_npar + num - 1;
+                }
+
                 ret = reganode(pRExC_state, GOSUB, num);
                 if (!SIZE_ONLY) {
                    if (num > (I32)RExC_rx->nparens) {
@@ -5290,6 +5376,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            ender = reg_node(pRExC_state, TAIL);
            break;
        case 1:
+           RExC_cpar++;
            ender = reganode(pRExC_state, CLOSE, parno);
            if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
                DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
@@ -6234,11 +6321,20 @@ tryagain:
        case 'c':
        case '0':
            goto defchar;
+       case 'R': 
        case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            {
-               const I32 num = atoi(RExC_parse);
-
+               I32 num;
+               bool isrel=(*RExC_parse=='R');
+               if (isrel)
+                   RExC_parse++;
+               num = atoi(RExC_parse);
+                if (isrel) {
+                    num = RExC_cpar - num;
+                    if (num < 1)
+                        vFAIL("Reference to nonexistent or unclosed group");
+                }
                if (num > 9 && num >= RExC_npar)
                    goto defchar;
                else {
@@ -6246,8 +6342,16 @@ tryagain:
                    while (isDIGIT(*RExC_parse))
                        RExC_parse++;
 
-                   if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
-                       vFAIL("Reference to nonexistent group");
+                   if (!SIZE_ONLY) {
+                       if (num > (I32)RExC_rx->nparens)
+                           vFAIL("Reference to nonexistent group");
+                       /* People make this error all the time apparently.
+                          So we cant fail on it, even though we should 
+                       
+                       else if (num >= RExC_cpar)
+                           vFAIL("Reference to unclosed group will always match");
+                       */
+                   }
                    RExC_sawback = 1;
                    ret = reganode(pRExC_state,
                                   (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
@@ -6336,6 +6440,7 @@ tryagain:
                    case 'p':
                    case 'P':
                     case 'N':
+                    case 'R':
                        --p;
                        goto loopdone;
                    case 'n':
@@ -8057,6 +8162,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     GET_RE_DEBUG_FLAGS_DECL;
 
     sv_setpvn(sv, "", 0);
+
     if (OP(o) > REGNODE_MAX)           /* regnode.type is unsigned */
        /* It would be nice to FAIL() here, but this may be called from
           regexec.c, and it would be hard to supply pRExC_state. */
@@ -8338,8 +8444,8 @@ Perl_pregfree(pTHX_ struct regexp *r)
     DEBUG_COMPILE_r({
        if (!PL_colorset)
            reginitcolors();
-       if (RX_DEBUG(r)){
-            SV *dsv= sv_newmortal();
+       {
+           SV *dsv= sv_newmortal();
             RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
                 dsv, r->precomp, r->prelen, 60);
             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", 
@@ -8448,12 +8554,10 @@ Perl_pregfree(pTHX_ struct regexp *r)
                         if (trie->nextword)
                             Safefree(trie->nextword);
 #ifdef DEBUGGING
-                        if (RX_DEBUG(r)) {
-                            if (trie->words)
-                                SvREFCNT_dec((SV*)trie->words);
-                            if (trie->revcharmap)
-                                SvREFCNT_dec((SV*)trie->revcharmap);
-                        }
+                        if (trie->words)
+                            SvREFCNT_dec((SV*)trie->words);
+                        if (trie->revcharmap)
+                            SvREFCNT_dec((SV*)trie->revcharmap);
 #endif
                         Safefree(r->data->data[n]); /* do this last!!!! */
                    }
@@ -8468,6 +8572,11 @@ Perl_pregfree(pTHX_ struct regexp *r)
     }
     Safefree(r->startp);
     Safefree(r->endp);
+    if (r->swap) {
+        Safefree(r->swap->startp);
+        Safefree(r->swap->endp);
+        Safefree(r->swap);
+    }
     Safefree(r);
 }
 
@@ -8510,6 +8619,14 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
     Copy(r->startp, ret->startp, npar, I32);
     Newx(ret->endp, npar, I32);
     Copy(r->startp, ret->startp, npar, I32);
+    if(r->swap) {
+        Newx(ret->swap, 1, regexp_paren_ofs);
+        /* no need to copy these */
+        Newx(ret->swap->startp, npar, I32);
+        Newx(ret->swap->endp, npar, I32);
+    } else {
+        ret->swap = NULL;
+    }
 
     Newx(ret->substrs, 1, struct reg_substr_data);
     for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
@@ -8885,7 +9002,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
 
        NODE_ALIGN(node);
        op = OP(node);
-       if (op == CLOSE)
+       if (op == CLOSE || op == WHILEM)
            indent--;
        next = regnext((regnode *)node);
        
@@ -9004,8 +9121,6 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
        }
        if (op == CURLYX || op == OPEN)
            indent++;
-       else if (op == WHILEM)
-           indent--;
     }
     CLEAR_OPTSTART;
 #ifdef DEBUG_DUMPUNTIL