patch to support computed regular subexpressions
Ilya Zakharevich [Thu, 20 Aug 1998 15:19:50 +0000 (11:19 -0400)]
Message-Id: <199808201919.PAA04692@monk.mps.ohio-state.edu>
Subject: [5.005_5* PATCH] Postponed RE - now!

p4raw-id: //depot/perl@1813

embedvar.h
objXSUB.h
pod/perlre.pod
regcomp.c
regexec.c
t/op/pat.t
t/op/re_tests
thrdvar.h
toke.c

index e1c9e9a..8c1b786 100644 (file)
 #define PL_ofslen              (PL_curinterp->Tofslen)
 #define PL_op                  (PL_curinterp->Top)
 #define PL_opsave              (PL_curinterp->Topsave)
+#define PL_reg_call_cc         (PL_curinterp->Treg_call_cc)
 #define PL_reg_eval_set                (PL_curinterp->Treg_eval_set)
 #define PL_reg_flags           (PL_curinterp->Treg_flags)
+#define PL_reg_re              (PL_curinterp->Treg_re)
 #define PL_reg_start_tmp       (PL_curinterp->Treg_start_tmp)
 #define PL_reg_start_tmpl      (PL_curinterp->Treg_start_tmpl)
 #define PL_regbol              (PL_curinterp->Tregbol)
 #define PL_Tofslen             PL_ofslen
 #define PL_Top                 PL_op
 #define PL_Topsave             PL_opsave
+#define PL_Treg_call_cc                PL_reg_call_cc
 #define PL_Treg_eval_set       PL_reg_eval_set
 #define PL_Treg_flags          PL_reg_flags
+#define PL_Treg_re             PL_reg_re
 #define PL_Treg_start_tmp      PL_reg_start_tmp
 #define PL_Treg_start_tmpl     PL_reg_start_tmpl
 #define PL_Tregbol             PL_regbol
 #define PL_ofslen              (thr->Tofslen)
 #define PL_op                  (thr->Top)
 #define PL_opsave              (thr->Topsave)
+#define PL_reg_call_cc         (thr->Treg_call_cc)
 #define PL_reg_eval_set                (thr->Treg_eval_set)
 #define PL_reg_flags           (thr->Treg_flags)
+#define PL_reg_re              (thr->Treg_re)
 #define PL_reg_start_tmp       (thr->Treg_start_tmp)
 #define PL_reg_start_tmpl      (thr->Treg_start_tmpl)
 #define PL_regbol              (thr->Tregbol)
index eee1178..1e6bc80 100644 (file)
--- a/objXSUB.h
+++ b/objXSUB.h
 #define PL_regbol                              pPerl->PL_regbol                
 #undef  PL_regcc               
 #define PL_regcc                               pPerl->PL_regcc         
+#undef  PL_reg_call_cc
+#define PL_reg_call_cc                         pPerl->PL_reg_call_cc
+#undef  PL_reg_re
+#define PL_reg_re                              pPerl->PL_reg_re
 #undef  PL_regcode             
 #define PL_regcode                             pPerl->PL_regcode               
 #undef  PL_regcomp_parse       
index f696525..6ecb7ad 100644 (file)
@@ -399,6 +399,28 @@ checks, thus to allow $re in the above snippet to contain C<(?{})>
 I<with tainting enabled>, one needs both C<use re 'eval'> and untaint
 the $re.
 
+=item C<(?p{ code })>
+
+I<Very experimental> "postponed" regular subexpression.  C<code> is evaluated
+at runtime, at the moment this subexpression may match.  The result of
+evaluation is considered as a regular expression, and matched as if it
+were inserted instead of this construct.
+
+C<code> is not interpolated.  Currently the rules to
+determine where the C<code> ends are somewhat convoluted.
+
+The following regular expression matches matching parenthesized group:
+
+  $re = qr{
+            \(
+            (?:
+               (?> [^()]+ )    # Non-parens without backtracking
+             |
+               (?p{ $re })     # Group with matching parens
+            )*
+            \)
+         }x;
+
 =item C<(?E<gt>pattern)>
 
 An "independent" subexpression.  Matches the substring that a
index 8db8b8a..0782232 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -709,6 +709,13 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32
                if (data)
                    data->flags |= SF_HAS_EVAL;
        }
+       else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded */
+               if (flags & SCF_DO_SUBSTR) {
+                   scan_commit(data);
+                   data->longest = &(data->longest_float);
+               }
+               is_inf = is_inf_internal = 1;
+       }
        /* Else: zero-length, ignore. */
        scan = regnext(scan);
     }
@@ -1082,6 +1089,7 @@ reg(I32 paren, I32 *flagp)
        if (*PL_regcomp_parse == '?') {
            U16 posflags = 0, negflags = 0;
            U16 *flagsp = &posflags;
+           int logical = 0;
 
            PL_regcomp_parse++;
            paren = *PL_regcomp_parse++;
@@ -1112,6 +1120,10 @@ reg(I32 paren, I32 *flagp)
                nextchar();
                *flagp = TRYAGAIN;
                return NULL;
+           case 'p':
+               logical = 1;
+               paren = *PL_regcomp_parse++;
+               /* FALL THROUGH */
            case '{':
            {
                dTHR;
@@ -1160,6 +1172,13 @@ reg(I32 paren, I32 *flagp)
                }
                
                nextchar();
+               if (logical) {
+                   ret = reg_node(LOGICAL);
+                   if (!SIZE_ONLY)
+                       ret->flags = 2;
+                   regtail(ret, reganode(EVAL, n));
+                   return ret;
+               }
                return reganode(EVAL, n);
            }
            case '(':
@@ -1171,6 +1190,8 @@ reg(I32 paren, I32 *flagp)
                        I32 flag;
                        
                        ret = reg_node(LOGICAL);
+                       if (!SIZE_ONLY)
+                           ret->flags = 1;
                        regtail(ret, reg(1, &flag));
                        goto insert_if;
                    } 
@@ -3041,7 +3062,7 @@ regprop(SV *sv, regnode *o)
        sv_catpvf(sv, "GROUPP%d", ARG(o));
        break;
     case LOGICAL:
-       p = "LOGICAL";
+       sv_catpvf(sv, "LOGICAL[%d]", o->flags);
        break;
     case SUSPEND:
        p = "SUSPEND";
index 603120f..0627e2b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -73,6 +73,8 @@
  */
 #include "EXTERN.h"
 #include "perl.h"
+typedef MAGIC *my_magic;
+
 #include "regcomp.h"
 
 #define RF_tainted     1               /* tainted information used? */
@@ -201,6 +203,25 @@ regcppop(void)
     return input;
 }
 
+STATIC char *
+regcp_set_to(I32 ss)
+{
+    I32 tmp = PL_savestack_ix;
+
+    PL_savestack_ix = ss;
+    regcppop();
+    PL_savestack_ix = tmp;
+}
+
+typedef struct re_cc_state
+{
+    I32 ss;
+    regnode *node;
+    struct re_cc_state *prev;
+    CURCUR *cc;
+    regexp *re;
+} re_cc_state;
+
 #define regcpblow(cp) LEAVE_SCOPE(cp)
 
 /*
@@ -222,6 +243,18 @@ pregexec(register regexp *prog, char *stringarg, register char *strend,
        regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, 
                      nosave ? 0 : REXEC_COPY_STR);
 }
+
+STATIC void
+cache_re(regexp *prog)
+{
+    PL_regprecomp = prog->precomp;             /* Needed for FAIL. */
+#ifdef DEBUGGING
+    PL_regprogram = prog->program;
+#endif
+    PL_regnpar = prog->nparens;
+    PL_regdata = prog->data;    
+    PL_reg_re = prog;    
+}
   
 /*
  - regexec_flags - match a regexp against a string
@@ -254,10 +287,9 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
     cc.oldcc = 0;
     PL_regcc = &cc;
 
-    PL_regprecomp = prog->precomp;             /* Needed for error messages. */
+    cache_re(prog);
 #ifdef DEBUGGING
     PL_regnarrate = PL_debug & 512;
-    PL_regprogram = prog->program;
 #endif
 
     /* Be paranoid... */
@@ -282,7 +314,6 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
        FAIL("corrupted regexp program");
     }
 
-    PL_regnpar = prog->nparens;
     PL_reg_flags = 0;
     PL_reg_eval_set = 0;
 
@@ -299,6 +330,9 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
     /* see how far we have to get to not match where we matched before */
     PL_regtill = startpos+minend;
 
+    /* We start without call_cc context.  */
+    PL_reg_call_cc = 0;
+
     /* If there is a "must appear" string, look for it. */
     s = startpos;
     if (!(flags & REXEC_CHECKED) 
@@ -360,8 +394,6 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
                      (strend - startpos > 60 ? "..." : ""))
        );
 
-    PL_regdata = prog->data;
-
     /* Simplest case:  anchored match need be tried only once. */
     /*  [unless only anchor is BOL and multiline is set] */
     if (prog->reganch & ROPT_ANCH) {
@@ -1552,15 +1584,91 @@ regmatch(regnode *prog)
            ret = POPs;
            PUTBACK;
            
+           PL_op = oop;
+           PL_curpad = ocurpad;
+           PL_curcop = ocurcop;
            if (logical) {
-               logical = 0;
+               if (logical == 2) {     /* Postponed subexpression. */
+                   regexp *re;
+                   my_magic mg = Null(my_magic);
+                   re_cc_state state;
+                   CURCUR cctmp;
+                   CHECKPOINT cp, lastcp;
+
+                   if(SvROK(ret) || SvRMAGICAL(ret)) {
+                       SV *sv = SvROK(ret) ? SvRV(ret) : ret;
+
+                       if(SvMAGICAL(sv))
+                           mg = mg_find(sv, 'r');
+                   }
+                   if (mg) {
+                       re = (regexp *)mg->mg_obj;
+                       ReREFCNT_inc(re);
+                   }
+                   else {
+                       STRLEN len;
+                       char *t = SvPV(ret, len);
+                       PMOP pm;
+                       char *oprecomp = PL_regprecomp;
+                       I32 osize = PL_regsize;
+                       I32 onpar = PL_regnpar;
+
+                       pm.op_pmflags = 0;
+                       re = CALLREGCOMP(t, t + len, &pm);
+                       if (!(SvFLAGS(ret) 
+                             & (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
+                           sv_magic(ret,(SV*)ReREFCNT_inc(re),'r',0,0);
+                       PL_regprecomp = oprecomp;
+                       PL_regsize = osize;
+                       PL_regnpar = onpar;
+                   }
+                   DEBUG_r(
+                       PerlIO_printf(Perl_debug_log, 
+                                     "Entering embedded `%s%.60s%s%s'\n",
+                                     PL_colors[0],
+                                     re->precomp,
+                                     PL_colors[1],
+                                     (strlen(re->precomp) > 60 ? "..." : ""))
+                       );
+                   state.node = next;
+                   state.prev = PL_reg_call_cc;
+                   state.cc = PL_regcc;
+                   state.re = PL_reg_re;
+
+                   cctmp.cur = 0;
+                   cctmp.oldcc = 0;
+                   PL_regcc = &cctmp;
+                   
+                   cp = regcppush(0);  /* Save *all* the positions. */
+                   REGCP_SET;
+                   cache_re(re);
+                   state.ss = PL_savestack_ix;
+                   *PL_reglastparen = 0;
+                   PL_reg_call_cc = &state;
+                   PL_reginput = locinput;
+                   if (regmatch(re->program + 1)) {
+                       ReREFCNT_dec(re);
+                       regcpblow(cp);
+                       sayYES;
+                   }
+                   DEBUG_r(
+                       PerlIO_printf(Perl_debug_log,
+                                     "%*s  failed...\n",
+                                     REPORT_CODE_OFF+PL_regindent*2, "")
+                       );
+                   ReREFCNT_dec(re);
+                   REGCP_UNWIND;
+                   regcppop();
+                   PL_reg_call_cc = state.prev;
+                   PL_regcc = state.cc;
+                   PL_reg_re = state.re;
+                   sayNO;
+               }
                sw = SvTRUE(ret);
+               logical = 0;
            }
            else
                sv_setsv(save_scalar(PL_replgv), ret);
-           PL_op = oop;
-           PL_curpad = ocurpad;
-           PL_curcop = ocurcop;
            break;
        }
        case OPEN:
@@ -1590,7 +1698,7 @@ regmatch(regnode *prog)
            }
            break;
        case LOGICAL:
-           logical = 1;
+           logical = scan->flags;
            break;
        case CURLYX: {
                CURCUR cc;
@@ -2086,6 +2194,40 @@ regmatch(regnode *prog)
            sayNO;
            break;
        case END:
+           if (PL_reg_call_cc) {
+               re_cc_state *cur_call_cc = PL_reg_call_cc;
+               CURCUR *cctmp = PL_regcc;
+               regexp *re = PL_reg_re;
+               CHECKPOINT cp, lastcp;
+               
+               cp = regcppush(0);      /* Save *all* the positions. */
+               REGCP_SET;
+               regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
+                                                   the caller. */
+               PL_reginput = locinput; /* Make position available to
+                                          the callcc. */
+               cache_re(PL_reg_call_cc->re);
+               PL_regcc = PL_reg_call_cc->cc;
+               PL_reg_call_cc = PL_reg_call_cc->prev;
+               if (regmatch(cur_call_cc->node)) {
+                   PL_reg_call_cc = cur_call_cc;
+                   regcpblow(cp);
+                   sayYES;
+               }
+               REGCP_UNWIND;
+               regcppop();
+               PL_reg_call_cc = cur_call_cc;
+               PL_regcc = cctmp;
+               PL_reg_re = re;
+               cache_re(re);
+
+               DEBUG_r(
+                   PerlIO_printf(Perl_debug_log,
+                                 "%*s  continuation failed...\n",
+                                 REPORT_CODE_OFF+PL_regindent*2, "")
+                   );
+               sayNO;
+           }
            if (locinput < PL_regtill)
                sayNO;                  /* Cannot match: too short. */
            /* Fall through */
index aec5f31..f588734 100755 (executable)
@@ -4,7 +4,7 @@
 # the format supported by op/regexp.t.  If you want to add a test
 # that does fit that format, add it to op/re_tests, not here.
 
-print "1..158\n";
+print "1..161\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -363,6 +363,7 @@ sub matchit {
    /xg;
 }
 
+@ans = ();
 push @ans, $res while $res = matchit;
 
 print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne "1 1 1";
@@ -375,6 +376,26 @@ print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne $expect;
 print "ok $test\n";
 $test++;
 
+my $matched;
+$matched = qr/\((?:(?>[^()]+)|(?p{$matched}))*\)/;
+
+@ans = @ans1 = ();
+push(@ans, $res), push(@ans1, $&) while $res = m/$matched/g;
+
+print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne "1 1 1";
+print "ok $test\n";
+$test++;
+
+print "# ans1='@ans1'\n# expect='$expect'\nnot " if "@ans1" ne $expect;
+print "ok $test\n";
+$test++;
+
+@ans = m/$matched/g;
+
+print "# ans='@ans'\n# expect='$expect'\nnot " if "@ans" ne $expect;
+print "ok $test\n";
+$test++;
+
 @ans = ('a/b' =~ m%(.*/)?(.*)%);       # Stack may be bad
 print "not " if "@ans" ne 'a/ b';
 print "ok $test\n";
index b6f654f..d1b1cec 100644 (file)
@@ -485,3 +485,4 @@ b\Z a\nb    y       -       -
 b\z    a\nb    y       -       -
 (^|x)(c)       ca      y       $2      c
 a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz   x       n       -       -
+a(?{$a=2;$b=3;($b)=$a})b       yabz    y       $b      2
index 4ca3ccb..c247dc4 100644 (file)
--- a/thrdvar.h
+++ b/thrdvar.h
@@ -153,6 +153,8 @@ PERLVAR(Tregnarrate,        I32)            /* from regexec.c */
 PERLVAR(Tregprogram,   regnode *)      /* from regexec.c */
 PERLVARI(Tregindent,   int,        0)  /* from regexec.c */
 PERLVAR(Tregcc,                CURCUR *)       /* from regexec.c */
+PERLVAR(Treg_call_cc,  struct re_cc_state *)   /* from regexec.c */
+PERLVAR(Treg_re,       regexp *)       /* from regexec.c */
 
 PERLVARI(Tregcompp,    regcomp_t, FUNC_NAME_TO_PTR(pregcomp))
                                        /* Pointer to RE compiler */
diff --git a/toke.c b/toke.c
index 719867b..2583a42 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -952,14 +952,16 @@ scan_const(char *start)
 
        /* if we get here, we're not doing a transliteration */
 
-       /* skip for regexp comments /(?#comment)/ */
+       /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
+          except for the last char, which will be done separately. */
        else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
            if (s[2] == '#') {
                while (s < send && *s != ')')
                    *d++ = *s++;
-           } else if (s[2] == '{') {   /* This should march regcomp.c */
+           } else if (s[2] == '{'
+                      || s[2] == 'p' && s[3] == '{') { /* This should march regcomp.c */
                I32 count = 1;
-               char *regparse = s + 3;
+               char *regparse = s + (s[2] == '{' ? 3 : 4);
                char c;
 
                while (count && (c = *regparse)) {
@@ -971,11 +973,9 @@ scan_const(char *start)
                        count--;
                    regparse++;
                }
-               if (*regparse == ')')
-                   regparse++;
-               else
+               if (*regparse != ')')
                    yyerror("Sequence (?{...}) not terminated or not {}-balanced");
-               while (s < regparse && *s != ')')
+               while (s < regparse)
                    *d++ = *s++;
            }
        }