Further tweaks to make it easier to create regexp engine plug ins.
[p5sagit/p5-mst-13.2.git] / regcomp.c
index 3d1211a..1611eb4 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -156,7 +156,6 @@ typedef struct RExC_state_t {
 #define RExC_seen      (pRExC_state->seen)
 #define RExC_size      (pRExC_state->size)
 #define RExC_npar      (pRExC_state->npar)
-#define RExC_cpar      (pRExC_state->cpar)
 #define RExC_nestroot   (pRExC_state->nestroot)
 #define RExC_extralen  (pRExC_state->extralen)
 #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
@@ -557,17 +556,18 @@ static const scan_data_t zero_scan_data =
 #define EXPERIMENTAL_INPLACESCAN
 #endif
 
-#define DEBUG_STUDYDATA(data,depth)                                  \
-DEBUG_OPTIMISE_MORE_r(if(data){                                           \
+#define DEBUG_STUDYDATA(str,data,depth)                              \
+DEBUG_OPTIMISE_MORE_r(if(data){                                      \
     PerlIO_printf(Perl_debug_log,                                    \
-        "%*s"/* Len:%"IVdf"/%"IVdf" */"Pos:%"IVdf"/%"IVdf           \
-        " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ",           \
+        "%*s" str "Pos:%"IVdf"/%"IVdf                                \
+        " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
         (int)(depth)*2, "",                                          \
         (IV)((data)->pos_min),                                       \
         (IV)((data)->pos_delta),                                     \
-        (IV)((data)->flags),                                         \
+        (UV)((data)->flags),                                         \
         (IV)((data)->whilem_c),                                      \
-        (IV)((data)->last_closep ? *((data)->last_closep) : -1)      \
+        (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
+        is_inf ? "INF " : ""                                         \
     );                                                               \
     if ((data)->last_found)                                          \
         PerlIO_printf(Perl_debug_log,                                \
@@ -597,7 +597,7 @@ static void clear_re(pTHX_ void *r);
    floating substrings if needed. */
 
 STATIC void
-S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
+S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 {
     const STRLEN l = CHR_SVLEN(data->last_found);
     const STRLEN old_l = CHR_SVLEN(*data->longest);
@@ -615,12 +615,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min
            data->minlen_fixed=minlenp; 
            data->lookbehind_fixed=0;
        }
-       else {
+       else { /* *data->longest == data->longest_float */
            data->offset_float_min = l ? data->last_start_min : data->pos_min;
            data->offset_float_max = (l
                                      ? data->last_start_max
                                      : data->pos_min + data->pos_delta);
-           if ((U32)data->offset_float_max > (U32)I32_MAX)
+           if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
                data->offset_float_max = I32_MAX;
            if (data->flags & SF_BEFORE_EOL)
                data->flags
@@ -642,7 +642,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min
     }
     data->last_end = -1;
     data->flags &= ~SF_BEFORE_EOL;
-    DEBUG_STUDYDATA(data,0);
+    DEBUG_STUDYDATA("cl_anything: ",data,0);
 }
 
 /* Can match anything (initialization) */
@@ -798,9 +798,9 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
 
 #ifdef DEBUGGING
 /*
-   dump_trie(trie,widecharmap)
-   dump_trie_interim_list(trie,widecharmap,next_alloc)
-   dump_trie_interim_table(trie,widecharmap,next_alloc)
+   dump_trie(trie,widecharmap,revcharmap)
+   dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
+   dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 
    These routines dump out a trie in a somewhat readable format.
    The _interim_ variants are used for debugging the interim
@@ -818,7 +818,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
 */
  
 STATIC void
-S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, U32 depth)
+S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
+           AV *revcharmap, U32 depth)
 {
     U32 state;
     SV *sv=sv_newmortal();
@@ -831,7 +832,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, U32 depth)
         "Match","Base","Ofs" );
 
     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
-       SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
+       SV ** const tmp = av_fetch( revcharmap, state, 0);
         if ( tmp ) {
             PerlIO_printf( Perl_debug_log, "%*s", 
                 colwidth,
@@ -900,7 +901,8 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, U32 depth)
 */
 STATIC void
 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
-                        HV *widecharmap, U32 next_alloc, U32 depth)
+                        HV *widecharmap, AV *revcharmap, U32 next_alloc,
+                        U32 depth)
 {
     U32 state;
     SV *sv=sv_newmortal();
@@ -924,7 +926,7 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
             );
         }
         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
-           SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
+           SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
            if ( tmp ) {
                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
                     colwidth,
@@ -953,7 +955,8 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
 */
 STATIC void
 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
-                         HV *widecharmap, U32 next_alloc, U32 depth)
+                         HV *widecharmap, AV *revcharmap, U32 next_alloc,
+                         U32 depth)
 {
     U32 state;
     U16 charid;
@@ -969,7 +972,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
 
     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
-       SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
+       SV ** const tmp = av_fetch( revcharmap, charid, 0);
         if ( tmp ) {
             PerlIO_printf( Perl_debug_log, "%*s", 
                 colwidth,
@@ -1136,7 +1139,7 @@ is the recommended Unicode-aware way of saying
        SV *tmp = newSVpvs("");                                            \
        if (UTF) SvUTF8_on(tmp);                                           \
        Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc );                       \
-       av_push( TRIE_REVCHARMAP(trie), tmp );                             \
+       av_push( revcharmap, tmp );                                        \
     } STMT_END
 
 #define TRIE_READ_CHAR STMT_START {                                           \
@@ -1196,7 +1199,7 @@ is the recommended Unicode-aware way of saying
         else                                                    \
             tmp = newSVpvn( "", 0 );                            \
         if ( UTF ) SvUTF8_on( tmp );                            \
-        av_push( trie->words, tmp );                            \
+        av_push( trie_words, tmp );                             \
     });                                                         \
                                                                 \
     curword++;                                                  \
@@ -1249,6 +1252,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     /* first pass, loop through and scan words */
     reg_trie_data *trie;
     HV *widecharmap = NULL;
+    AV *revcharmap = newAV();
     regnode *cur;
     const U32 uniflags = UTF8_ALLOW_DEFAULT;
     STRLEN len = 0;
@@ -1267,15 +1271,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                          )
                      );
 
-    const U32 data_slot = add_data( pRExC_state, 2, "tu" );
-    SV *re_trie_maxbuff;
-#ifndef DEBUGGING
-    /* these are only used during construction but are useful during
-     * debugging so we store them in the struct when debugging.
+#ifdef DEBUGGING
+    const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
+    AV *trie_words = NULL;
+    /* along with revcharmap, this is only used during construction but both are
+     * useful during debugging so we store them in the struct when debugging.
      */
+#else
+    const U32 data_slot = add_data( pRExC_state, 2, "tu" );
     STRLEN trie_charcount=0;
-    AV *trie_revcharmap;
 #endif
+    SV *re_trie_maxbuff;
     GET_RE_DEBUG_FLAGS_DECL;
 #ifndef DEBUGGING
     PERL_UNUSED_ARG(depth);
@@ -1290,9 +1296,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     if (!(UTF && folder))
        trie->bitmap = PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
     DEBUG_r({
-        trie->words = newAV();
+        trie_words = newAV();
     });
-    TRIE_REVCHARMAP(trie) = newAV();
 
     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
     if (!SvIOK(re_trie_maxbuff)) {
@@ -1513,8 +1518,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                                              * sizeof(reg_trie_state) );
 
         /* and now dump it out before we compress it */
-        DEBUG_TRIE_COMPILE_MORE_r(
-            dump_trie_interim_list(trie,widecharmap,next_alloc,depth+1)
+        DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
+                                                        revcharmap, next_alloc,
+                                                        depth+1)
         );
 
         trie->trans
@@ -1687,9 +1693,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         } /* end second pass */
 
         /* and now dump it out before we compress it */
-        DEBUG_TRIE_COMPILE_MORE_r(
-            dump_trie_interim_table(trie,widecharmap,next_alloc,depth+1)
-        );
+        DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
+                                                         revcharmap,
+                                                         next_alloc, depth+1));
 
         {
         /*
@@ -1818,9 +1824,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                                         * sizeof(reg_trie_trans) );
 
     /* and now dump out the compressed format */
-    DEBUG_TRIE_COMPILE_r(
-        dump_trie(trie,widecharmap,depth+1)
-    );
+    DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
 
     {   /* Modify the program and insert the new TRIE node*/ 
         U8 nodetype =(U8)(flags & 0xFF);
@@ -1882,7 +1886,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
                     {
                         if ( ++count > 1 ) {
-                            SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
+                            SV **tmp = av_fetch( revcharmap, ofs, 0);
                            const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
                             if ( state == 1 ) break;
                             if ( count == 2 ) {
@@ -1893,7 +1897,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                                         (int)depth * 2 + 2, "",
                                         (UV)state));
                                if (idx >= 0) {
-                                   SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
+                                   SV ** const tmp = av_fetch( revcharmap, idx, 0);
                                    const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
 
                                     TRIE_BITMAP_SET(trie,*ch);
@@ -1913,7 +1917,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                    }
                 }
                 if ( count == 1 ) {
-                    SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
+                    SV **tmp = av_fetch( revcharmap, idx, 0);
                     char *ch = SvPV_nolen( *tmp );
                     DEBUG_OPTIMISE_r({
                         SV *sv=sv_newmortal();
@@ -1961,7 +1965,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                         Set_Node_Offset_Length(fix, 0, 0);
                     }
                     while (word--) {
-                        SV ** const tmp = av_fetch( trie->words, word, 0 );
+                        SV ** const tmp = av_fetch( trie_words, word, 0 );
                         if (tmp) {
                             if ( STR_LEN(convert) <= SvCUR(*tmp) )
                                 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
@@ -2030,8 +2034,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         });
     } /* end node insert */
     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
-#ifndef DEBUGGING
-    SvREFCNT_dec(TRIE_REVCHARMAP(trie));
+#ifdef DEBUGGING
+    RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
+    RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
+#else
+    SvREFCNT_dec(revcharmap);
 #endif
     return trie->jump 
            ? MADE_JUMP_TRIE 
@@ -2341,6 +2348,9 @@ typedef struct scan_frame {
     I32 stop; /* what stopparen do we use */
 } scan_frame;
 
+
+#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
+
 STATIC I32
 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         I32 *minlenp, I32 *deltap,
@@ -2386,7 +2396,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
   fake_study_recurse:
     while ( scan && OP(scan) != END && scan < last ){
        /* Peephole optimizer: */
-       DEBUG_STUDYDATA(data,depth);
+       DEBUG_STUDYDATA("Peep:", data,depth);
        DEBUG_PEEP("Peep",scan,depth);
         JOIN_EXACT(scan,&min,0);
 
@@ -2432,7 +2442,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                regnode * const startbranch=scan;
                
                if (flags & SCF_DO_SUBSTR)
-                   scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
+                   SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
                if (flags & SCF_DO_STCLASS)
                    cl_init_zero(pRExC_state, &accum);
 
@@ -2754,7 +2764,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     Newx(newframe,1,scan_frame);
                 } else {
                     if (flags & SCF_DO_SUBSTR) {
-                        scan_commit(pRExC_state,data,minlenp);
+                        SCAN_COMMIT(pRExC_state,data,minlenp);
                         data->longest = &(data->longest_float);
                     }
                     is_inf = is_inf_internal = 1;
@@ -2856,7 +2866,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            /* Search for fixed substrings supports EXACT only. */
            if (flags & SCF_DO_SUBSTR) {
                assert(data);
-               scan_commit(pRExC_state, data, minlenp);
+               SCAN_COMMIT(pRExC_state, data, minlenp);
            }
            if (UTF) {
                const U8 * const s = (U8 *)STRING(scan);
@@ -2935,7 +2945,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                is_inf = is_inf_internal = 1;
                scan = regnext(scan);
                if (flags & SCF_DO_SUBSTR) {
-                   scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
+                   SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
                    data->longest = &(data->longest_float);
                }
                goto optimize_curly_tail;
@@ -2958,7 +2968,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                next_is_eval = (OP(scan) == EVAL);
              do_curly:
                if (flags & SCF_DO_SUBSTR) {
-                   if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
+                   if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
                    pos_before = data->pos_min;
                }
                if (data) {
@@ -3229,7 +3239,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (mincount != maxcount) {
                         /* Cannot extend fixed substrings found inside
                            the group.  */
-                       scan_commit(pRExC_state,data,minlenp);
+                       SCAN_COMMIT(pRExC_state,data,minlenp);
                        if (mincount && last_str) {
                            SV * const sv = data->last_found;
                            MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
@@ -3261,7 +3271,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                continue;
            default:                    /* REF and CLUMP only? */
                if (flags & SCF_DO_SUBSTR) {
-                   scan_commit(pRExC_state,data,minlenp);      /* Cannot expect anything... */
+                   SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
                    data->longest = &(data->longest_float);
                }
                is_inf = is_inf_internal = 1;
@@ -3275,7 +3285,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            int value = 0;
 
            if (flags & SCF_DO_SUBSTR) {
-               scan_commit(pRExC_state,data,minlenp);
+               SCAN_COMMIT(pRExC_state,data,minlenp);
                data->pos_min++;
            }
            min++;
@@ -3564,7 +3574,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
                         f |= SCF_DO_SUBSTR;
                         if (scan->flags) 
-                            scan_commit(pRExC_state, &data_fake,minlenp);
+                            SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
                         data_fake.last_found=newSVsv(data->last_found);
                     }
                 }
@@ -3615,7 +3625,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
                         if (RExC_rx->minlen<*minnextp)
                             RExC_rx->minlen=*minnextp;
-                        scan_commit(pRExC_state, &data_fake, minnextp);
+                        SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
                         SvREFCNT_dec(data_fake.last_found);
                         
                         if ( data_fake.minlen_fixed != minlenp ) 
@@ -3661,7 +3671,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
        }
        else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
            if (flags & SCF_DO_SUBSTR) {
-               scan_commit(pRExC_state,data,minlenp);
+               SCAN_COMMIT(pRExC_state,data,minlenp);
                flags &= ~SCF_DO_SUBSTR;
            }
            if (data && OP(scan)==ACCEPT) {
@@ -3673,7 +3683,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
        else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
        {
                if (flags & SCF_DO_SUBSTR) {
-                   scan_commit(pRExC_state,data,minlenp);
+                   SCAN_COMMIT(pRExC_state,data,minlenp);
                    data->longest = &(data->longest_float);
                }
                is_inf = is_inf_internal = 1;
@@ -3707,7 +3717,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             struct regnode_charclass_class accum;
 
             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
-                scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
+                SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
             if (flags & SCF_DO_STCLASS)
                 cl_init_zero(pRExC_state, &accum);
                 
@@ -3824,7 +3834,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            delta += (trie->maxlen - trie->minlen);
            flags &= ~SCF_DO_STCLASS; /* xxx */
             if (flags & SCF_DO_SUBSTR) {
-               scan_commit(pRExC_state,data,minlenp);  /* Cannot expect anything... */
+               SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
                data->pos_min += trie->minlen;
                data->pos_delta += (trie->maxlen - trie->minlen);
                if (trie->maxlen != trie->minlen)
@@ -3848,6 +3858,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
   finish:
     assert(!frame);
+    DEBUG_STUDYDATA("pre-fin:",data,depth);
 
     *scanp = scan;
     *deltap = is_inf_internal ? I32_MAX : delta;
@@ -3868,7 +3879,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
     if (flags & SCF_TRIE_RESTUDY)
         data->flags |=         SCF_TRIE_RESTUDY;
     
-    DEBUG_STUDYDATA(data,depth);
+    DEBUG_STUDYDATA("post-fin:",data,depth);
     
     return min < stopmin ? min : stopmin;
 }
@@ -3890,6 +3901,7 @@ S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
     return count;
 }
 
+/*XXX: todo make this not included in a non debugging perl */
 #ifndef PERL_IN_XSUB_RE
 void
 Perl_reginitcolors(pTHX)
@@ -3953,23 +3965,18 @@ Perl_reginitcolors(pTHX)
 extern const struct regexp_engine my_reg_engine;
 #define RE_ENGINE_PTR &my_reg_engine
 #endif
-/* these make a few things look better, to avoid indentation */
-#define BEGIN_BLOCK {
-#define END_BLOCK }
+
+#ifndef PERL_IN_XSUB_RE 
 regexp *
 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
 {
     dVAR;
-    GET_RE_DEBUG_FLAGS_DECL;
-    DEBUG_r(if (!PL_colorset) reginitcolors());
-#ifndef PERL_IN_XSUB_RE
-    BEGIN_BLOCK
+    HV * const table = GvHV(PL_hintgv);
     /* Dispatch a request to compile a regexp to correct 
        regexp engine. */
-    HV * const table = GvHV(PL_hintgv);
     if (table) {
         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
+        GET_RE_DEBUG_FLAGS_DECL;
         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
             DEBUG_COMPILE_r({
@@ -3979,9 +3986,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
             return CALLREGCOMP_ENG(eng, exp, xend, pm);
         } 
     }
-    END_BLOCK
+    return Perl_re_compile(aTHX_ exp, xend, pm);
+}
 #endif
-    BEGIN_BLOCK    
+
+regexp *
+Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
+{
+    dVAR;
     register regexp *r;
     register regexp_internal *ri;
     regnode *scan;
@@ -3997,6 +4009,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     int restudied= 0;
     RExC_state_t copyRExC_state;
 #endif    
+    GET_RE_DEBUG_FLAGS_DECL;
+    DEBUG_r(if (!PL_colorset) reginitcolors());
+        
     if (exp == NULL)
        FAIL("NULL regexp argument");
 
@@ -4024,7 +4039,6 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
-    RExC_cpar = 1;
     RExC_nestroot = 0;
     RExC_size = 0L;
     RExC_emit = &PL_regdummy;
@@ -4120,7 +4134,6 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
-    RExC_cpar = 1;
     RExC_emit_start = ri->program;
     RExC_emit = ri->program;
 #ifdef DEBUGGING
@@ -4129,7 +4142,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     ri->program[RExC_size].type = 255;
 #endif
     /* Store the count of eval-groups for security checks: */
-    RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
+    RExC_rx->seen_evals = RExC_seen_evals;
     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
     if (reg(pRExC_state, 0, &flags,1) == NULL)
        return(NULL);
@@ -4335,9 +4348,10 @@ reStudy:
        if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
             && data.last_start_min == 0 && data.last_end > 0
             && !RExC_seen_zerolen
+            && !(RExC_seen & REG_SEEN_VERBARG)
             && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
            r->extflags |= RXf_CHECK_ALL;
-       scan_commit(pRExC_state, &data,&minlen);
+       scan_commit(pRExC_state, &data,&minlen,0);
        SvREFCNT_dec(data.last_found);
 
         /* Note that code very similar to this but for anchored string 
@@ -4583,11 +4597,9 @@ reStudy:
         PerlIO_printf(Perl_debug_log, "\n");
     });
     return(r);
-    END_BLOCK    
 }
 
 #undef CORE_ONLY_BLOCK
-#undef END_BLOCK
 #undef RE_ENGINE_PTR
 
 #ifndef PERL_IN_XSUB_RE
@@ -5410,7 +5422,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            ender = reg_node(pRExC_state, TAIL);
            break;
        case 1:
-           RExC_cpar++;
            ender = reganode(pRExC_state, CLOSE, parno);
            if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
                DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
@@ -6355,36 +6366,45 @@ tryagain:
        case 'c':
        case '0':
            goto defchar;
-       case 'R': 
+       case 'g': 
        case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            {
                I32 num;
-               bool isrel=(*RExC_parse=='R');
-               if (isrel)
+               bool isg = *RExC_parse == 'g';
+               bool isrel = 0; 
+               bool hasbrace = 0;
+               if (isg) {
                    RExC_parse++;
+                   if (*RExC_parse == '{') {
+                       RExC_parse++;
+                       hasbrace = 1;
+                   }
+                   if (*RExC_parse == '-') {
+                       RExC_parse++;
+                       isrel = 1;
+                   }
+               }   
                num = atoi(RExC_parse);
                 if (isrel) {
-                    num = RExC_cpar - num;
+                    num = RExC_npar - num;
                     if (num < 1)
                         vFAIL("Reference to nonexistent or unclosed group");
                 }
-               if (num > 9 && num >= RExC_npar)
+               if (!isg && num > 9 && num >= RExC_npar)
                    goto defchar;
                else {
                    char * const parse_start = RExC_parse - 1; /* MJD */
                    while (isDIGIT(*RExC_parse))
                        RExC_parse++;
-
+                    if (hasbrace) {
+                        if (*RExC_parse != '}') 
+                            vFAIL("Unterminated \\g{...} pattern");
+                        RExC_parse++;
+                    }    
                    if (!SIZE_ONLY) {
                        if (num > (I32)RExC_rx->nparens)
                            vFAIL("Reference to nonexistent group");
-                       /* People make this error all the time apparently.
-                          So we cant fail on it, even though we should 
-                       
-                       else if (num >= RExC_cpar)
-                           vFAIL("Reference to unclosed group will always match");
-                       */
                    }
                    RExC_sawback = 1;
                    ret = reganode(pRExC_state,
@@ -6461,6 +6481,7 @@ tryagain:
                    case 'C':
                    case 'X':
                    case 'G':
+                   case 'g':
                    case 'Z':
                    case 'z':
                    case 'w':
@@ -6475,6 +6496,7 @@ tryagain:
                    case 'P':
                     case 'N':
                     case 'R':
+                    case 'k':
                        --p;
                        goto loopdone;
                    case 'n':
@@ -8463,36 +8485,32 @@ Perl_re_intuit_string(pTHX_ regexp *prog)
 }
 
 /* 
-   pregfree - free a regexp
+   pregfree() 
+   
+   handles refcounting and freeing the perl core regexp structure. When 
+   it is necessary to actually free the structure the first thing it 
+   does is call the 'free' method of the regexp_engine associated with 
+   the regexp, allowing the handling of the void *pprivate; member 
+   first. (This routine is not overridable by extensions, which is why 
+   the extension's free is called first.)
    
-   See regdupe below if you change anything here. 
+   See regdupe and regdupe_internal if you change anything here. 
 */
-
+#ifndef PERL_IN_XSUB_RE
 void
 Perl_pregfree(pTHX_ struct regexp *r)
 {
     dVAR;
-    RXi_GET_DECL(r,ri);
     GET_RE_DEBUG_FLAGS_DECL;
 
     if (!r || (--r->refcnt > 0))
        return;
-    DEBUG_COMPILE_r({
-       if (!PL_colorset)
-           reginitcolors();
-       {
-           SV *dsv= sv_newmortal();
-            RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
-                dsv, r->precomp, r->prelen, 60);
-            PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", 
-                PL_colors[4],PL_colors[5],s);
-        }
-    });
-
+       
+    CALLREGFREE_PVT(r); /* free the private data */
+    
     /* gcov results gave these as non-null 100% of the time, so there's no
        optimisation in checking them before calling Safefree  */
     Safefree(r->precomp);
-    Safefree(ri->offsets);             /* 20010421 MJD */
     RX_MATCH_COPY_FREE(r);
 #ifdef PERL_OLD_COPY_ON_WRITE
     if (r->saved_copy)
@@ -8511,6 +8529,45 @@ Perl_pregfree(pTHX_ struct regexp *r)
     }
     if (r->paren_names)
             SvREFCNT_dec(r->paren_names);
+    
+    Safefree(r->startp);
+    Safefree(r->endp);
+    Safefree(r);
+}
+#endif
+
+/* regfree_internal() 
+
+   Free the private data in a regexp. This is overloadable by 
+   extensions. Perl takes care of the regexp structure in pregfree(), 
+   this covers the *pprivate pointer which technically perl doesn't 
+   know about, however of course we have to handle the 
+   regexp_internal structure when no extension is in use. 
+   
+   Note this is called before freeing anything in the regexp 
+   structure. 
+ */
+void
+Perl_regfree_internal(pTHX_ struct regexp *r)
+{
+    dVAR;
+    RXi_GET_DECL(r,ri);
+    GET_RE_DEBUG_FLAGS_DECL;
+    
+    DEBUG_COMPILE_r({
+       if (!PL_colorset)
+           reginitcolors();
+       {
+           SV *dsv= sv_newmortal();
+            RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
+                dsv, r->precomp, r->prelen, 60);
+            PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", 
+                PL_colors[4],PL_colors[5],s);
+        }
+    });
+
+    Safefree(ri->offsets);             /* 20010421 MJD */
     if (ri->data) {
        int n = ri->data->count;
        PAD* new_comppad = NULL;
@@ -8587,12 +8644,6 @@ Perl_pregfree(pTHX_ struct regexp *r)
                             PerlMemShared_free(trie->jump);
                         if (trie->nextword)
                             PerlMemShared_free(trie->nextword);
-#ifdef DEBUGGING
-                        if (trie->words)
-                            SvREFCNT_dec((SV*)trie->words);
-                        if (trie->revcharmap)
-                            SvREFCNT_dec((SV*)trie->revcharmap);
-#endif
                         /* do this last!!!! */
                         PerlMemShared_free(ri->data->data[n]);
                    }
@@ -8605,15 +8656,12 @@ Perl_pregfree(pTHX_ struct regexp *r)
        Safefree(ri->data->what);
        Safefree(ri->data);
     }
-    Safefree(r->startp);
-    Safefree(r->endp);
     if (ri->swap) {
         Safefree(ri->swap->startp);
         Safefree(ri->swap->endp);
         Safefree(ri->swap);
     }
     Safefree(ri);
-    Safefree(r);
 }
 
 #define sv_dup_inc(s,t)        SvREFCNT_inc(sv_dup(s,t))
@@ -8628,37 +8676,111 @@ Perl_pregfree(pTHX_ struct regexp *r)
    given regexp structure. It is a no-op when not under USE_ITHREADS. 
    (Originally this *was* re_dup() for change history see sv.c)
    
-   See pregfree() above if you change anything here. 
+   After all of the core data stored in struct regexp is duplicated,
+   the regexp_engine.dupe method is used to copy any private data
+   stored in the *pprivate pointer. This allows extensions to handle
+   any duplication they need to do.
+
+   See pregfree() and regfree_internal() if you change anything here. 
 */
 #if defined(USE_ITHREADS)
+#ifndef PERL_IN_XSUB_RE
 regexp *
-Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
+Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
 {
     dVAR;
     regexp *ret;
-    regexp_internal *reti;
-    int i, len, npar;
+    int i, npar;
     struct reg_substr_datum *s;
-    RXi_GET_DECL(r,ri);
-    
+
     if (!r)
        return (REGEXP *)NULL;
 
     if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
        return ret;
 
-    len = ri->offsets[0];
+    
     npar = r->nparens+1;
-
     Newxz(ret, 1, regexp);
-    Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
-    RXi_SET(ret,reti);
-    Copy(ri->program, reti->program, len+1, regnode);
-
     Newx(ret->startp, npar, I32);
     Copy(r->startp, ret->startp, npar, I32);
     Newx(ret->endp, npar, I32);
-    Copy(r->startp, ret->startp, npar, I32);
+    Copy(r->endp, ret->endp, npar, I32);
+
+    if (ret->substrs) {
+        Newx(ret->substrs, 1, struct reg_substr_data);
+        for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
+            s->min_offset = r->substrs->data[i].min_offset;
+            s->max_offset = r->substrs->data[i].max_offset;
+            s->end_shift  = r->substrs->data[i].end_shift;
+            s->substr     = sv_dup_inc(r->substrs->data[i].substr, param);
+            s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
+        }
+    } else 
+        ret->substrs = NULL;    
+
+    ret->precomp        = SAVEPVN(r->precomp, r->prelen);
+    ret->refcnt         = r->refcnt;
+    ret->minlen         = r->minlen;
+    ret->minlenret      = r->minlenret;
+    ret->prelen         = r->prelen;
+    ret->nparens        = r->nparens;
+    ret->lastparen      = r->lastparen;
+    ret->lastcloseparen = r->lastcloseparen;
+    ret->intflags       = r->intflags;
+    ret->extflags       = r->extflags;
+
+    ret->sublen         = r->sublen;
+
+    ret->engine         = r->engine;
+    
+    ret->paren_names    = hv_dup_inc(r->paren_names, param);
+
+    if (RX_MATCH_COPIED(ret))
+       ret->subbeg  = SAVEPVN(r->subbeg, r->sublen);
+    else
+       ret->subbeg = NULL;
+#ifdef PERL_OLD_COPY_ON_WRITE
+    ret->saved_copy = NULL;
+#endif
+    
+    ret->pprivate = r->pprivate;
+    if (ret->pprivate) 
+        RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
+    
+    ptr_table_store(PL_ptr_table, r, ret);
+    return ret;
+}
+#endif /* PERL_IN_XSUB_RE */
+
+/*
+   regdupe_internal()
+   
+   This is the internal complement to re_dup() which is used to copy
+   the structure pointed to by the *pprivate pointer in the regexp.
+   This is the core version of the extension-overridable cloning hook.
+   The regexp structure being duplicated will be copied by perl prior
+   to this and will be provided as the regexp *r argument, however
+   with the /old/ structure's pprivate pointer value. Thus this routine
+   may override any copying normally done by perl.
+   
+   It returns a pointer to the new regexp_internal structure.
+*/
+
+void *
+Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
+{
+    dVAR;
+    regexp_internal *reti;
+    int len, npar;
+    RXi_GET_DECL(r,ri);
+    
+    npar = r->nparens+1;
+    len = ri->offsets[0];
+    
+    Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
+    Copy(ri->program, reti->program, len+1, regnode);
+    
     if(ri->swap) {
         Newx(reti->swap, 1, regexp_paren_ofs);
         /* no need to copy these */
@@ -8668,14 +8790,6 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
         reti->swap = NULL;
     }
 
-    Newx(ret->substrs, 1, struct reg_substr_data);
-    for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
-       s->min_offset = r->substrs->data[i].min_offset;
-       s->max_offset = r->substrs->data[i].max_offset;
-       s->end_shift  = r->substrs->data[i].end_shift;
-       s->substr     = sv_dup_inc(r->substrs->data[i].substr, param);
-       s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
-    }
 
     reti->regstclass = NULL;
     if (ri->data) {
@@ -8740,36 +8854,11 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
 
     Newx(reti->offsets, 2*len+1, U32);
     Copy(ri->offsets, reti->offsets, 2*len+1, U32);
-
-    ret->precomp        = SAVEPVN(r->precomp, r->prelen);
-    ret->refcnt         = r->refcnt;
-    ret->minlen         = r->minlen;
-    ret->minlenret      = r->minlenret;
-    ret->prelen         = r->prelen;
-    ret->nparens        = r->nparens;
-    ret->lastparen      = r->lastparen;
-    ret->lastcloseparen = r->lastcloseparen;
-    ret->intflags       = r->intflags;
-    ret->extflags       = r->extflags;
-
-    ret->sublen         = r->sublen;
-
-    ret->engine         = r->engine;
     
-    ret->paren_names    = hv_dup_inc(r->paren_names, param);
-
-    if (RX_MATCH_COPIED(ret))
-       ret->subbeg  = SAVEPVN(r->subbeg, r->sublen);
-    else
-       ret->subbeg = NULL;
-#ifdef PERL_OLD_COPY_ON_WRITE
-    ret->saved_copy = NULL;
-#endif
-
-    ptr_table_store(PL_ptr_table, r, ret);
-    return ret;
+    return (void*)reti;
 }
-#endif    
+
+#endif    /* USE_ITHREADS */
 
 /* 
    reg_stringify() 
@@ -8782,29 +8871,28 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
    resulting string
    
    If flags is nonnull and the returned string contains UTF8 then 
-   (flags & 1) will be true.
+   (*flags & 1) will be true.
    
    If haseval is nonnull then it is used to return whether the pattern 
    contains evals.
    
    Normally called via macro: 
    
-        CALLREG_STRINGIFY(mg,0,0);
+        CALLREG_STRINGIFY(mg,&len,&utf8);
         
    And internally with
    
-        CALLREG_AS_STR(mg,lp,flags,haseval)        
+        CALLREG_AS_STR(mg,&lp,&flags,&haseval)        
     
    See sv_2pv_flags() in sv.c for an example of internal usage.
     
  */
-
+#ifndef PERL_IN_XSUB_RE
 char *
 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
     dVAR;
     const regexp * const re = (regexp *)mg->mg_obj;
-    RXi_GET_DECL(re,ri);
-    
+
     if (!mg->mg_ptr) {
        const char *fptr = "msix";
        char reflags[6];
@@ -8867,7 +8955,7 @@ Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
        mg->mg_ptr[mg->mg_len] = 0;
     }
     if (haseval) 
-        *haseval = ri->program[0].next_off;
+        *haseval = re->seen_evals;
     if (flags)    
        *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
     
@@ -8876,8 +8964,6 @@ Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
     return mg->mg_ptr;
 }
 
-
-#ifndef PERL_IN_XSUB_RE
 /*
  - regnext - dig the "next" pointer out of a node
  */
@@ -9093,11 +9179,14 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
                NULL;
            const reg_trie_data * const trie =
                (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
+#ifdef DEBUGGING
+           AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
+#endif
            const regnode *nextbranch= NULL;
            I32 word_idx;
             sv_setpvn(sv, "", 0);
            for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
-               SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
+               SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
                
                 PerlIO_printf(Perl_debug_log, "%*s%s ",
                    (int)(2*(indent+3)), "",