scalars used in postponed subexpressions aren't first class regexps,
[p5sagit/p5-mst-13.2.git] / regexec.c
index aea0ad6..be159ed 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -285,9 +285,8 @@ S_regcppop(pTHX_ const regexp *rex)
      * requiring null fields (pat.t#187 and split.t#{13,14}
      * (as of patchlevel 7877)  will fail.  Then again,
      * this code seems to be necessary or otherwise
-     * building DynaLoader will fail:
-     * "Error: '*' not in typemap in DynaLoader.xs, line 164"
-     * --jhi */
+     * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
+     * --jhi updated by dapm */
     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
        if (i > PL_regsize)
            PL_regoffs[i].start = -1;
@@ -308,7 +307,7 @@ S_regcppop(pTHX_ const regexp *rex)
  - pregexec - match a regexp against a string
  */
 I32
-Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
         char *strbeg, I32 minend, SV *screamer, U32 nosave)
 /* strend: pointer to null at end of string */
 /* strbeg: real beginning of string */
@@ -372,8 +371,8 @@ Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *stren
    deleted from the finite automaton. */
 
 char *
-Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
-                    char *strend, U32 flags, re_scream_pos_data *data)
+Perl_re_intuit_start(pTHX_ REGEXP * const prog, SV *sv, char *strpos,
+                    char *strend, const U32 flags, re_scream_pos_data *data)
 {
     dVAR;
     register I32 start_shift = 0;
@@ -990,7 +989,11 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
     return NULL;
 }
 
-
+#define DECL_TRIE_TYPE(scan) \
+    const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
+                   trie_type = (scan->flags != EXACT) \
+                             ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
+                              : (do_utf8 ? trie_utf8 : trie_plain)
 
 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
@@ -1008,6 +1011,19 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
            uscan = foldbuf + UNISKIP( uvc );                               \
        }                                                                   \
        break;                                                              \
+    case trie_latin_utf8_fold:                                              \
+       if ( foldlen>0 ) {                                                  \
+           uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
+           foldlen -= len;                                                 \
+           uscan += len;                                                   \
+           len=0;                                                          \
+       } else {                                                            \
+           len = 1;                                                        \
+           uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
+           foldlen -= UNISKIP( uvc );                                      \
+           uscan = foldbuf + UNISKIP( uvc );                               \
+       }                                                                   \
+       break;                                                              \
     case trie_utf8:                                                         \
        uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
        break;                                                              \
@@ -1030,12 +1046,14 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
     }                                                                       \
 } STMT_END
 
-#define REXEC_FBC_EXACTISH_CHECK(CoNd)                  \
+#define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
+{                                                      \
+    char *my_strend= (char *)strend;                   \
     if ( (CoNd)                                        \
         && (ln == len ||                              \
-            ibcmp_utf8(s, NULL, 0,  do_utf8,          \
+            !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
                        m, NULL, ln, (bool)UTF))       \
-        && (!reginfo || regtry(reginfo, &s)) )         \
+        && (!reginfo || regtry(reginfo, &s)) )        \
        goto got_it;                                   \
     else {                                             \
         U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
@@ -1043,15 +1061,14 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
         f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
         if ( f != c                                   \
              && (f == c1 || f == c2)                  \
-             && (ln == foldlen ||                     \
-                 !ibcmp_utf8((char *) foldbuf,        \
-                             NULL, foldlen, do_utf8,  \
-                             m,                       \
-                             NULL, ln, (bool)UTF))    \
-             && (!reginfo || regtry(reginfo, &s)) )    \
+             && (ln == len ||                         \
+               !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
+                             m, NULL, ln, (bool)UTF)) \
+             && (!reginfo || regtry(reginfo, &s)) )   \
              goto got_it;                             \
     }                                                  \
-    s += len
+}                                                      \
+s += len
 
 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
 STMT_START {                                              \
@@ -1210,15 +1227,28 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                U8 *sm = (U8 *) m;
                U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
                U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
-               const U32 uniflags = UTF8_ALLOW_DEFAULT;
-
-               to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
-               to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
-
+               /* used by commented-out code below */
+               /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
+               
+                /* XXX: Since the node will be case folded at compile
+                   time this logic is a little odd, although im not 
+                   sure that its actually wrong. --dmq */
+                   
+               c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
+               c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
+
+               /* XXX: This is kinda strange. to_utf8_XYZ returns the 
+                   codepoint of the first character in the converted
+                   form, yet originally we did the extra step. 
+                   No tests fail by commenting this code out however
+                   so Ive left it out. -- dmq.
+                   
                c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 
                                    0, uniflags);
                c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
                                    0, uniflags);
+                */
+                
                lnc = 0;
                while (sm < ((U8 *) m + ln)) {
                    lnc++;
@@ -1253,24 +1283,33 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             * matching (called "loose matching" in Unicode).
             * ibcmp_utf8() will do just that. */
 
-           if (do_utf8) {
+           if (do_utf8 || UTF) {
                UV c, f;
                U8 tmpbuf [UTF8_MAXBYTES+1];
-               STRLEN len, foldlen;
+               STRLEN len = 1;
+               STRLEN foldlen;
                const U32 uniflags = UTF8_ALLOW_DEFAULT;
                if (c1 == c2) {
                    /* Upper and lower of 1st char are equal -
                     * probably not a "letter". */
                    while (s <= e) {
-                       c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
+                       if (do_utf8) {
+                           c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
                                           uniflags);
+                        } else {
+                            c = *((U8*)s);
+                        }                                        
                        REXEC_FBC_EXACTISH_CHECK(c == c1);
                    }
                }
                else {
                    while (s <= e) {
-                     c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
+                       if (do_utf8) {
+                           c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
                                           uniflags);
+                        } else {
+                            c = *((U8*)s);
+                        }
 
                        /* Handle some of the three Greek sigmas cases.
                         * Note that not all the possible combinations
@@ -1288,6 +1327,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                }
            }
            else {
+               /* Neither pattern nor string are UTF8 */
                if (c1 == c2)
                    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
                else
@@ -1462,10 +1502,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
        case AHOCORASICKC:
        case AHOCORASICK: 
            {
-               const enum { trie_plain, trie_utf8, trie_utf8_fold }
-                   trie_type = do_utf8 ?
-                         (c->flags == EXACT ? trie_utf8 : trie_utf8_fold)
-                       : trie_plain;
+               DECL_TRIE_TYPE(c);
                 /* what trie are we using right now */
                reg_ac_data *aho
                    = (reg_ac_data*)progi->data->data[ ARG( c ) ];
@@ -1482,8 +1519,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                U8 **points; /* map of where we were in the input string
                                when reading a given char. For ASCII this
                                is unnecessary overhead as the relationship
-                               is always 1:1, but for unicode, especially
-                               case folded unicode this is not true. */
+                               is always 1:1, but for Unicode, especially
+                               case folded Unicode this is not true. */
                U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
                U8 *bitmap=NULL;
 
@@ -1705,7 +1742,7 @@ S_swap_match_buff (pTHX_ regexp *prog) {
  - regexec_flags - match a regexp against a string
  */
 I32
-Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
+Perl_regexec_flags(pTHX_ REGEXP * const prog, char *stringarg, register char *strend,
              char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
 /* strend: pointer to null at end of string */
 /* strbeg: real beginning of string */
@@ -1845,7 +1882,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
                    if (regtry(&reginfo, &s))
                        goto got_it;
                  after_try:
-                   if (s >= end)
+                   if (s > end)
                        goto phooey;
                    if (prog->extflags & RXf_USE_INTUIT) {
                        s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
@@ -2267,13 +2304,12 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
      * Actually, the code in regcppop() (which Ilya may be meaning by
      * PL_reglastparen), is not needed at all by the test suite
-     * (op/regexp, op/pat, op/split), but that code is needed, oddly
-     * enough, for building DynaLoader, or otherwise this
-     * "Error: '*' not in typemap in DynaLoader.xs, line 164"
-     * will happen.  Meanwhile, this code *is* needed for the
+     * (op/regexp, op/pat, op/split), but that code is needed otherwise
+     * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
+     * Meanwhile, this code *is* needed for the
      * above-mentioned test suite tests to succeed.  The common theme
      * on those tests seems to be returning null fields from matches.
-     * --jhi */
+     * --jhi updated by dapm */
 #if 1
     if (prog->nparens) {
        regexp_paren_pair *pp = PL_regoffs;
@@ -2874,10 +2910,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
        case TRIE:
            {
                 /* what type of TRIE am I? (utf8 makes this contextual) */
-               const enum { trie_plain, trie_utf8, trie_utf8_fold }
-                   trie_type = do_utf8 ?
-                         (scan->flags == EXACT ? trie_utf8 : trie_utf8_fold)
-                       : trie_plain;
+                DECL_TRIE_TYPE(scan);
 
                 /* what trie are we using right now */
                reg_trie_data * const trie
@@ -2939,7 +2972,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                    if ( got_wordnum ) {
                        if ( ! ST.accepted ) {
                            ENTER;
-                           SAVETMPS;
+                           /* SAVETMPS; */ /* XXX is this necessary? dmq */
                            bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
                            sv_accept_buff=newSV(bufflen *
                                            sizeof(reg_trie_accepted) - 1);
@@ -3150,18 +3183,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                PL_reginput = (char *)ST.accept_buff[ best ].endpos;
                if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
                    scan = ST.B;
-                   /* NOTREACHED */
                } else {
                    scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
-                   /* NOTREACHED */
-                }
-                if (has_cutgroup) {
-                    PUSH_YES_STATE_GOTO(TRIE_next, scan);    
-                    /* NOTREACHED */
-                } else {
-                    PUSH_STATE_GOTO(TRIE_next, scan);
-                    /* NOTREACHED */
                 }
+                PUSH_YES_STATE_GOTO(TRIE_next, scan);    
                 /* NOTREACHED */
            }
            /* NOTREACHED */
@@ -3688,10 +3713,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                    if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
                        mg = mg_find(sv, PERL_MAGIC_qr);
                    else if (SvSMAGICAL(ret)) {
-                       if (SvGMAGICAL(ret))
+                       if (SvGMAGICAL(ret)) {
+                           /* I don't believe that there is ever qr magic
+                              here.  */
+                           assert(!mg_find(ret, PERL_MAGIC_qr));
                            sv_unmagic(ret, PERL_MAGIC_qr);
-                       else
+                       }
+                       else {
                            mg = mg_find(ret, PERL_MAGIC_qr);
+                           /* testing suggests mg only ends up non-NULL for
+                              scalars who were upgraded and compiled in the
+                              else block below. In turn, this is only
+                              triggered in the "postponed utf8 string" tests
+                              in t/op/pat.t  */
+                       }
                    }
 
                    if (mg) {
@@ -3705,9 +3740,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                        re = CALLREGCOMP(ret, pm_flags);
                        if (!(SvFLAGS(ret)
                              & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
-                               | SVs_GMG)))
+                                | SVs_GMG))) {
+                           /* This isn't a first class regexp. Instead, it's
+                              caching a regexp onto an existing, Perl visible
+                              scalar.  */
                            sv_magic(ret,(SV*)ReREFCNT_inc(re),
                                        PERL_MAGIC_qr,0,0);
+                       }
                        PL_regsize = osize;
                    }
                }
@@ -4221,12 +4260,6 @@ NULL
 
        case BRANCH:        /*  /(...|A|...)/ */
            scan = NEXTOPER(scan); /* scan now points to inner node */
-           if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ)) 
-               && !has_cutgroup)
-           {
-               /* last branch; skip state push and jump direct to node */
-               continue;
-            }
            ST.lastparen = *PL_reglastparen;
            ST.next_branch = next;
            REGCP_SET(ST.cp);
@@ -5006,7 +5039,7 @@ NULL
 #undef ST
         case FOLDCHAR:
             n = ARG(scan);
-            if ( n == what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
+            if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
                 locinput += ln;
             } else if ( 0xDF == n && !do_utf8 && !UTF ) {
                 sayNO;
@@ -5017,7 +5050,7 @@ NULL
                 char *e = PL_regeol;
                 to_uni_fold(n, folded, &foldlen);
 
-                if (ibcmp_utf8(folded, 0,  foldlen, 1,
+               if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
                               l, &e, 0,  do_utf8)) {
                         sayNO;
                 }
@@ -5421,8 +5454,8 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
        } else {
            while (scan < loceol && !isSPACE(*scan))
                scan++;
-           break;
        }
+       break;
     case NSPACEL:
        PL_reg_flags |= RF_tainted;
        if (do_utf8) {
@@ -5588,8 +5621,8 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool
             * documentation of these array elements. */
 
            si = *ary;
-           a  = SvROK(ary[1]) ? &ary[1] : 0;
-           b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
+           a  = SvROK(ary[1]) ? &ary[1] : NULL;
+           b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
 
            if (a)
                sw = *a;