Re: New file: t/op/regexp_email.t
Yves Orton [Wed, 28 Feb 2007 17:45:33 +0000 (18:45 +0100)]
Message-ID: <9b18b3110702280845p7860ca08taf1aead39a178aa4@mail.gmail.com>

p4raw-id: //depot/perl@30436

MANIFEST
ext/re/re.pm
regcomp.h
regexec.c
t/op/regexp_email.t [new file with mode: 0644]

index 02db239..b70220b 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -3616,6 +3616,7 @@ t/op/regexp_qr_embed.t            See if regular expressions work with embedded qr//
 t/op/regexp_qr.t               See if regular expressions work as qr//
 t/op/regexp.t                  See if regular expressions work
 t/op/regexp_trielist.t         See if regular expressions work with trie optimisation
+t/op/regexp_email.t            See if regex recursion works by parsing email addresses
 t/op/regmesg.t                 See if one can get regular expression errors
 t/op/repeat.t                  See if x operator works
 t/op/re_tests                  Regular expressions for regexp.t
index 4f8d410..c33ca3c 100644 (file)
@@ -67,8 +67,9 @@ my %flags = (
     STATE           => 0x080000,
     OPTIMISEM       => 0x100000,
     STACK           => 0x280000,
+    BUFFERS         => 0x400000,
 );
-$flags{ALL} = -1;
+$flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS});
 $flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
 $flags{Extra} = $flags{EXECUTE} | $flags{COMPILE};
 $flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
@@ -323,6 +324,11 @@ Enable debugging of start point optimisations.
 
 Turns on all "extra" debugging options.
 
+=item BUFFERS
+
+Enable debugging the capture buffer storage during match. Warning,
+this can potentially produce extremely large output.
+
 =item TRIEM
 
 Enable enhanced TRIE debugging. Enhances both TRIEE
@@ -373,7 +379,7 @@ These are useful shortcuts to save on the typing.
 
 =item ALL
 
-Enable all compile and execute options at once.
+Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS
 
 =item All
 
index b07a63f..72f415a 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -695,6 +695,7 @@ re.pm, especially to the documentation.
 #define RE_DEBUG_EXTRA_OFFDEBUG    0x040000
 #define RE_DEBUG_EXTRA_STATE       0x080000
 #define RE_DEBUG_EXTRA_OPTIMISE    0x100000
+#define RE_DEBUG_EXTRA_BUFFERS     0x400000
 /* combined */
 #define RE_DEBUG_EXTRA_STACK       0x280000
 
@@ -732,6 +733,9 @@ re.pm, especially to the documentation.
     if (re_debug_flags & RE_DEBUG_EXTRA_STATE) x )
 #define DEBUG_STACK_r(x) DEBUG_r( \
     if (re_debug_flags & RE_DEBUG_EXTRA_STACK) x )
+#define DEBUG_BUFFERS_r(x) DEBUG_r( \
+    if (re_debug_flags & RE_DEBUG_EXTRA_BUFFERS) x )
+
 #define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \
     if ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \
          (re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)) ) x )
index 1ae9842..d1f6b8e 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -193,7 +193,7 @@ S_regcppush(pTHX_ I32 parenfloor)
        SSPUSHINT(PL_regstartp[p]);
        SSPUSHPTR(PL_reg_start_tmp[p]);
        SSPUSHINT(p);
-       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+       DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
          "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
                      (UV)p, (IV)PL_regstartp[p],
                      (IV)(PL_reg_start_tmp[p] - PL_bostr),
@@ -263,7 +263,7 @@ S_regcppop(pTHX_ const regexp *rex)
        tmps = SSPOPINT;
        if (paren <= *PL_reglastparen)
            PL_regendp[paren] = tmps;
-       DEBUG_EXECUTE_r(
+       DEBUG_BUFFERS_r(
            PerlIO_printf(Perl_debug_log,
                          "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
                          (UV)paren, (IV)PL_regstartp[paren],
@@ -272,7 +272,7 @@ S_regcppop(pTHX_ const regexp *rex)
                          (paren > *PL_reglastparen ? "(no)" : ""));
        );
     }
-    DEBUG_EXECUTE_r(
+    DEBUG_BUFFERS_r(
        if (*PL_reglastparen + 1 <= rex->nparens) {
            PerlIO_printf(Perl_debug_log,
                          "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
@@ -3568,8 +3568,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             regnode *startpoint;
 
        case GOSTART:
-       case GOSUB: /*    /(...(?1))/      */
-            if (cur_eval && cur_eval->locinput==locinput) {
+       case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
+           if (cur_eval && cur_eval->locinput==locinput) {
                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan)) 
                     Perl_croak(aTHX_ "Infinite recursion in regex");
                 if ( ++nochange_depth > max_nochange_depth )
@@ -3742,7 +3742,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            cur_curlyx = ST.prev_curlyx;
            /* XXXX This is too dramatic a measure... */
            PL_reg_maxiter = 0;
-            if ( nochange_depth > 0 );
+            if ( nochange_depth )
                nochange_depth--;
            sayYES;
 
@@ -3760,7 +3760,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            cur_curlyx = ST.prev_curlyx;
            /* XXXX This is too dramatic a measure... */
            PL_reg_maxiter = 0;
-           if ( nochange_depth > 0 );
+           if ( nochange_depth )
                nochange_depth--;
            sayNO_SILENT;
 #undef ST
@@ -4755,8 +4755,6 @@ NULL
            if (cur_eval) {
                /* we've just finished A in /(??{A})B/; now continue with B */
                I32 tmpix;
-
-
                st->u.eval.toggle_reg_flags
                            = cur_eval->u.eval.toggle_reg_flags;
                PL_reg_flags ^= st->u.eval.toggle_reg_flags; 
@@ -4782,9 +4780,10 @@ NULL
                DEBUG_EXECUTE_r(
                    PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
                                      REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
-                if ( nochange_depth > 0 );
-                   nochange_depth++;
-               PUSH_YES_STATE_GOTO(EVAL_AB,
+                if ( nochange_depth )
+                   nochange_depth--;
+
+                PUSH_YES_STATE_GOTO(EVAL_AB,
                        st->u.eval.prev_eval->u.eval.B); /* match B */
            }
 
diff --git a/t/op/regexp_email.t b/t/op/regexp_email.t
new file mode 100644 (file)
index 0000000..c53dd82
--- /dev/null
@@ -0,0 +1,94 @@
+#!./perl
+#
+# Tests to make sure the regexp engine doesn't run into limits too soon.
+#
+
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+}
+
+print "1..13\n";
+
+my $email = qr {
+    (?(DEFINE)
+      (?<address>         (?&mailbox) | (?&group))
+      (?<mailbox>         (?&name_addr) | (?&addr_spec))
+      (?<name_addr>       (?&display_name)? (?&angle_addr))
+      (?<angle_addr>      (?&CFWS)? < (?&addr_spec) > (?&CFWS)?)
+      (?<group>           (?&display_name) : (?:(?&mailbox_list) | (?&CFWS))? ;
+                                             (?&CFWS)?)
+      (?<display_name>    (?&phrase))
+      (?<mailbox_list>    (?&mailbox) (?: , (?&mailbox))*)
+
+      (?<addr_spec>       (?&local_part) \@ (?&domain))
+      (?<local_part>      (?&dot_atom) | (?&quoted_string))
+      (?<domain>          (?&dot_atom) | (?&domain_literal))
+      (?<domain_literal>  (?&CFWS)? \[ (?: (?&FWS)? (?&dcontent))* (?&FWS)?
+                                    \] (?&CFWS)?)
+      (?<dcontent>        (?&dtext) | (?&quoted_pair))
+      (?<dtext>           (?&NO_WS_CTL) | [\x21-\x5a\x5e-\x7e])
+
+      (?<atext>           (?&ALPHA) | (?&DIGIT) | [!#\$%&'*+-/=?^_`{|}~])
+      (?<atom>            (?&CFWS)? (?&atext)+ (?&CFWS)?)
+      (?<dot_atom>        (?&CFWS)? (?&dot_atom_text) (?&CFWS)?)
+      (?<dot_atom_text>   (?&atext)+ (?: \. (?&atext)+)*)
+
+      (?<text>            [\x01-\x09\x0b\x0c\x0e-\x7f])
+      (?<quoted_pair>     \\ (?&text))
+
+      (?<qtext>           (?&NO_WS_CTL) | [\x21\x23-\x5b\x5d-\x7e])
+      (?<qcontent>        (?&qtext) | (?&quoted_pair))
+      (?<quoted_string>   (?&CFWS)? (?&DQUOTE) (?:(?&FWS)? (?&qcontent))*
+                           (?&FWS)? (?&DQUOTE) (?&CFWS)?)
+
+      (?<word>            (?&atom) | (?&quoted_string))
+      (?<phrase>          (?&word)+)
+
+      # Folding white space
+      (?<FWS>             (?: (?&WSP)* (?&CRLF))? (?&WSP)+)
+      (?<ctext>           (?&NO_WS_CTL) | [\x21-\x27\x2a-\x5b\x5d-\x7e])
+      (?<ccontent>        (?&ctext) | (?&quoted_pair) | (?&comment))
+      (?<comment>         \( (?: (?&FWS)? (?&ccontent))* (?&FWS)? \) )
+      (?<CFWS>            (?: (?&FWS)? (?&comment))*
+                          (?: (?:(?&FWS)? (?&comment)) | (?&FWS)))
+
+      # No whitespace control
+      (?<NO_WS_CTL>       [\x01-\x08\x0b\x0c\x0e-\x1f\x7f])
+
+      (?<ALPHA>           [A-Za-z])
+      (?<DIGIT>           [0-9])
+      (?<CRLF>            \x0d \x0a)
+      (?<DQUOTE>          ")
+      (?<WSP>             [\x20\x09])
+    )
+
+    (?&address)
+}x;
+
+my $count = 0;
+
+$| = 1;
+while (<DATA>) {
+    chomp;
+    next if /^#/;
+    print /^$email$/ ? "ok " : "not ok ", ++ $count, "\n";
+}
+
+#
+# Acme::MetaSyntactic ++
+#
+__DATA__
+Jeff_Tracy@thunderbirds.org
+"Lady Penelope"@thunderbirds.org
+"The\ Hood"@thunderbirds.org
+fred @ flintstones.net
+barney (rubble) @ flintstones.org
+bammbamm (bam! bam! (bam! bam! (bam!)) bam!) @ flintstones.org
+Michelangelo@[127.0.0.1]
+Donatello @ [127.0.0.1]
+Raphael (He as well) @ [127.0.0.1]
+"Leonardo" @ [127.0.0.1]
+Barbapapa <barbapapa @ barbapapa.net>
+"Barba Mama" <barbamama @ [127.0.0.1]>
+Barbalala (lalalalalalalala) <barbalala (Yes, her!) @ (barba) barbapapa.net>