perl 4.0 patch 14: patch #11, continued
[p5sagit/p5-mst-13.2.git] / regexec.c
index 2246454..bb63eda 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -7,11 +7,18 @@
  * blame Henry for some of the lack of readability.
  */
 
-/* $Header: regexec.c,v 3.0 89/10/18 15:22:53 lwall Locked $
+/* $RCSfile: regexec.c,v $$Revision: 4.0.1.2 $$Date: 91/06/07 11:50:33 $
  *
  * $Log:       regexec.c,v $
- * Revision 3.0  89/10/18  15:22:53  lwall
- * 3.0 baseline
+ * Revision 4.0.1.2  91/06/07  11:50:33  lwall
+ * patch4: new copyright notice
+ * patch4: // wouldn't use previous pattern if it started with a null character
+ * 
+ * Revision 4.0.1.1  91/04/12  09:07:39  lwall
+ * patch1: regexec only allocated space for 9 subexpresssions
+ * 
+ * Revision 4.0  91/03/20  01:39:16  lwall
+ * 4.0 baseline.
  * 
  */
 
  *
  ****    Alterations to Henry's code are...
  ****
- ****    Copyright (c) 1989, Larry Wall
+ ****    Copyright (c) 1991, Larry Wall
  ****
- ****    You may distribute under the terms of the GNU General Public License
- ****    as specified in the README file that comes with the perl 3.0 kit.
+ ****    You may distribute under the terms of either the GNU General Public
+ ****    License or the Artistic License, as specified in the README file.
  *
  * Beware that some of this code is subtly aware of the way operator
  * precedence is structured in regular expressions.  Serious changes in
 int regnarrate = 0;
 #endif
 
+#define isALNUM(c) (isascii(c) && (isalpha(c) || isdigit(c) || c == '_'))
+#define isSPACE(c) (isascii(c) && isspace(c))
+#define isDIGIT(c) (isascii(c) && isdigit(c))
+#define isUPPER(c) (isascii(c) && isupper(c))
+
 /*
  * regexec and friends
  */
@@ -67,6 +79,7 @@ int regnarrate = 0;
  */
 static char *regprecomp;
 static char *reginput;         /* String-input pointer. */
+static char regprev;           /* char before regbol, \n if none */
 static char *regbol;           /* Beginning of input, for ^ check. */
 static char *regeol;           /* End of input, for $ check. */
 static char **regstartp;       /* Pointer to startp array. */
@@ -74,8 +87,9 @@ static char **regendp;                /* Ditto for endp. */
 static char *reglastparen;     /* Similarly for lastparen. */
 static char *regtill;
 
-static char *regmystartp[10];  /* For remembering backreferences. */
-static char *regmyendp[10];
+static int regmyp_size = 0;
+static char **regmystartp = Null(char**);
+static char **regmyendp   = Null(char**);
 
 /*
  * Forwards.
@@ -106,7 +120,6 @@ int safebase;       /* no need to remember string in subbase */
        register int tmp;
        int minlen = 0;         /* must match at least this many chars */
        int dontbother = 0;     /* how many characters not to try at end */
-       int beginning = (string == strbeg);     /* is ^ valid at stringarg? */
 
        /* Be paranoid... */
        if (prog == NULL || string == NULL) {
@@ -114,6 +127,13 @@ int safebase;      /* no need to remember string in subbase */
                return(0);
        }
 
+       if (string == strbeg)   /* is ^ valid at stringarg? */
+           regprev = '\n';
+       else {
+           regprev = stringarg[-1];
+           if (!multiline && regprev == '\n')
+               regprev = '\0';         /* force ^ to NOT match */
+       }
        regprecomp = prog->precomp;
        /* Check validity of program. */
        if (UCHARAT(prog->program) != MAGIC) {
@@ -128,14 +148,16 @@ int safebase;     /* no need to remember string in subbase */
                string = c;
                strend = string + i;
                for (s = string; s < strend; s++)
-                       if (isupper(*s))
+                       if (isUPPER(*s))
                                *s = tolower(*s);
        }
 
        /* If there is a "must appear" string, look for it. */
        s = string;
-       if (prog->regmust != Nullstr) {
-               if (beginning && screamer) {
+       if (prog->regmust != Nullstr &&
+           (!(prog->reganch & ROPT_ANCH)
+            || (multiline && prog->regback >= 0)) ) {
+               if (stringarg == strbeg && screamer) {
                        if (screamfirst[prog->regmust->str_rare] >= 0)
                                s = screaminstr(screamer,prog->regmust);
                        else
@@ -168,10 +190,7 @@ int safebase;      /* no need to remember string in subbase */
        }
 
        /* Mark beginning of line for ^ . */
-       if (beginning)
-               regbol = string;
-       else
-               regbol = NULL;
+       regbol = string;
 
        /* Mark end of line for $ (and such) */
        regeol = strend;
@@ -179,9 +198,27 @@ int safebase;      /* no need to remember string in subbase */
        /* see how far we have to get to not match where we matched before */
        regtill = string+minend;
 
+       /* Allocate our backreference arrays */
+       if ( regmyp_size < prog->nparens + 1 ) {
+           /* Allocate or enlarge the arrays */
+           regmyp_size = prog->nparens + 1;
+           if ( regmyp_size < 10 ) regmyp_size = 10;   /* minimum */
+           if ( regmystartp ) {
+               /* reallocate larger */
+               Renew(regmystartp,regmyp_size,char*);
+               Renew(regmyendp,  regmyp_size,char*);
+           }
+           else {
+               /* Initial allocation */
+               New(1102,regmystartp,regmyp_size,char*);
+               New(1102,regmyendp,  regmyp_size,char*);
+           }
+       
+       }
+
        /* Simplest case:  anchored match need be tried only once. */
        /*  [unless multiline is set] */
-       if (prog->reganch) {
+       if (prog->reganch & ROPT_ANCH) {
                if (regtry(prog, string))
                        goto got_it;
                else if (multiline) {
@@ -191,9 +228,9 @@ int safebase;       /* no need to remember string in subbase */
                        /* for multiline we only have to try after newlines */
                        if (s > string)
                            s--;
-                       for (; s < strend; s++) {
-                           if (*s == '\n') {
-                               if (++s < strend && regtry(prog, s))
+                       while (s < strend) {
+                           if (*s++ == '\n') {
+                               if (s < strend && regtry(prog, s))
                                    goto got_it;
                            }
                        }
@@ -203,8 +240,22 @@ int safebase;      /* no need to remember string in subbase */
 
        /* Messy cases:  unanchored match. */
        if (prog->regstart) {
-               /* We know what string it must start with. */
-               if (prog->regstart->str_pok == 3) {
+               if (prog->reganch & ROPT_SKIP) {  /* we have /x+whatever/ */
+                   /* it must be a one character string */
+                   i = prog->regstart->str_ptr[0];
+                   while (s < strend) {
+                           if (*s == i) {
+                                   if (regtry(prog, s))
+                                           goto got_it;
+                                   s++;
+                                   while (s < strend && *s == i)
+                                       s++;
+                           }
+                           s++;
+                   }
+               }
+               else if (prog->regstart->str_pok == 3) {
+                   /* We know what string it must start with. */
 #ifndef lint
                    while ((s = fbminstr((unsigned char*)s,
                      (unsigned char*)strend, prog->regstart)) != NULL)
@@ -229,18 +280,26 @@ int safebase;     /* no need to remember string in subbase */
                goto phooey;
        }
        if (c = prog->regstclass) {
+               int doevery = (prog->reganch & ROPT_SKIP) == 0;
+
                if (minlen)
                    dontbother = minlen - 1;
                strend -= dontbother;   /* don't bother with what can't match */
+               tmp = 1;
                /* We know what class it must start with. */
                switch (OP(c)) {
-               case ANYOF: case ANYBUT:
+               case ANYOF:
                    c = OPERAND(c);
                    while (s < strend) {
-                           i = *s;
-                           if (!(c[i >> 3] & (1 << (i&7))))
-                                   if (regtry(prog, s))
+                           i = UCHARAT(s);
+                           if (!(c[i >> 3] & (1 << (i&7)))) {
+                                   if (tmp && regtry(prog, s))
                                            goto got_it;
+                                   else
+                                           tmp = doevery;
+                           }
+                           else
+                                   tmp = 1;
                            s++;
                    }
                    break;
@@ -249,20 +308,20 @@ int safebase;     /* no need to remember string in subbase */
                        dontbother++,strend--;
                    if (s != string) {
                        i = s[-1];
-                       tmp = (isalpha(i) || isdigit(i) || i == '_');
+                       tmp = isALNUM(i);
                    }
                    else
-                       tmp = 0;        /* assume not alphanumeric */
+                       tmp = isALNUM(regprev); /* assume not alphanumeric */
                    while (s < strend) {
                            i = *s;
-                           if (tmp != (isalpha(i) || isdigit(i) || i == '_')) {
+                           if (tmp != isALNUM(i)) {
                                    tmp = !tmp;
                                    if (regtry(prog, s))
                                            goto got_it;
                            }
                            s++;
                    }
-                   if (tmp && regtry(prog,s))
+                   if ((minlen || tmp) && regtry(prog,s))
                            goto got_it;
                    break;
                case NBOUND:
@@ -270,75 +329,106 @@ int safebase;    /* no need to remember string in subbase */
                        dontbother++,strend--;
                    if (s != string) {
                        i = s[-1];
-                       tmp = (isalpha(i) || isdigit(i) || i == '_');
+                       tmp = isALNUM(i);
                    }
                    else
-                       tmp = 0;        /* assume not alphanumeric */
+                       tmp = isALNUM(regprev); /* assume not alphanumeric */
                    while (s < strend) {
                            i = *s;
-                           if (tmp != (isalpha(i) || isdigit(i) || i == '_'))
+                           if (tmp != isALNUM(i))
                                    tmp = !tmp;
                            else if (regtry(prog, s))
                                    goto got_it;
                            s++;
                    }
-                   if (!tmp && regtry(prog,s))
+                   if ((minlen || !tmp) && regtry(prog,s))
                            goto got_it;
                    break;
                case ALNUM:
                    while (s < strend) {
                            i = *s;
-                           if (isalpha(i) || isdigit(i) || i == '_')
-                                   if (regtry(prog, s))
+                           if (isALNUM(i)) {
+                                   if (tmp && regtry(prog, s))
                                            goto got_it;
+                                   else
+                                           tmp = doevery;
+                           }
+                           else
+                                   tmp = 1;
                            s++;
                    }
                    break;
                case NALNUM:
                    while (s < strend) {
                            i = *s;
-                           if (!isalpha(i) && !isdigit(i) && i != '_')
-                                   if (regtry(prog, s))
+                           if (!isALNUM(i)) {
+                                   if (tmp && regtry(prog, s))
                                            goto got_it;
+                                   else
+                                           tmp = doevery;
+                           }
+                           else
+                                   tmp = 1;
                            s++;
                    }
                    break;
                case SPACE:
                    while (s < strend) {
-                           if (isspace(*s))
-                                   if (regtry(prog, s))
+                           if (isSPACE(*s)) {
+                                   if (tmp && regtry(prog, s))
                                            goto got_it;
+                                   else
+                                           tmp = doevery;
+                           }
+                           else
+                                   tmp = 1;
                            s++;
                    }
                    break;
                case NSPACE:
                    while (s < strend) {
-                           if (!isspace(*s))
-                                   if (regtry(prog, s))
+                           if (!isSPACE(*s)) {
+                                   if (tmp && regtry(prog, s))
                                            goto got_it;
+                                   else
+                                           tmp = doevery;
+                           }
+                           else
+                                   tmp = 1;
                            s++;
                    }
                    break;
                case DIGIT:
                    while (s < strend) {
-                           if (isdigit(*s))
-                                   if (regtry(prog, s))
+                           if (isDIGIT(*s)) {
+                                   if (tmp && regtry(prog, s))
                                            goto got_it;
+                                   else
+                                           tmp = doevery;
+                           }
+                           else
+                                   tmp = 1;
                            s++;
                    }
                    break;
                case NDIGIT:
                    while (s < strend) {
-                           if (!isdigit(*s))
-                                   if (regtry(prog, s))
+                           if (!isDIGIT(*s)) {
+                                   if (tmp && regtry(prog, s))
                                            goto got_it;
+                                   else
+                                           tmp = doevery;
+                           }
+                           else
+                                   tmp = 1;
                            s++;
                    }
                    break;
                }
        }
        else {
-               dontbother = minend;
+               if (minlen)
+                   dontbother = minlen - 1;
                strend -= dontbother;
                /* We don't know much -- general case. */
                do {
@@ -360,7 +450,8 @@ int safebase;       /* no need to remember string in subbase */
                    s = nsavestr(strbeg,i);     /* so $digit will work later */
                    if (prog->subbase)
                            Safefree(prog->subbase);
-                   prog->subbase = s;
+                   prog->subbeg = prog->subbase = s;
+                   prog->subend = s+i;
                }
                else
                    s = prog->subbase;
@@ -403,7 +494,7 @@ char *string;
        sp = prog->startp;
        ep = prog->endp;
        if (prog->nparens) {
-               for (i = NSUBEXP; i > 0; i--) {
+               for (i = prog->nparens; i >= 0; i--) {
                        *sp++ = NULL;
                        *ep++ = NULL;
                }
@@ -464,18 +555,20 @@ char *prog;
 
                switch (OP(scan)) {
                case BOL:
-                       if (locinput == regbol ||
+                       if (locinput == regbol ? regprev == '\n' :
                            ((nextchar || locinput < regeol) &&
                              locinput[-1] == '\n') )
                        {
-                               regtill--;
+                               /* regtill = regbol; */
                                break;
                        }
                        return(0);
                case EOL:
                        if ((nextchar || locinput < regeol) && nextchar != '\n')
                                return(0);
-                       regtill--;
+                       if (!multiline && regeol - locinput > 1)
+                               return 0;
+                       /* regtill = regbol; */
                        break;
                case ANY:
                        if ((nextchar == '\0' && locinput >= regeol) ||
@@ -489,7 +582,7 @@ char *prog;
                        /* Inline the first character, for speed. */
                        if (*s != nextchar)
                                return(0);
-                       if (locinput + ln > regeol)
+                       if (regeol - locinput < ln)
                                return 0;
                        if (ln > 1 && bcmp(s, locinput, ln) != 0)
                                return(0);
@@ -497,82 +590,67 @@ char *prog;
                        nextchar = *locinput;
                        break;
                case ANYOF:
-               case ANYBUT:
                        s = OPERAND(scan);
                        if (nextchar < 0)
                                nextchar = UCHARAT(locinput);
                        if (s[nextchar >> 3] & (1 << (nextchar&7)))
                                return(0);
-                       nextchar = *++locinput;
-                       if (!nextchar && locinput > regeol)
+                       if (!nextchar && locinput >= regeol)
                                return 0;
+                       nextchar = *++locinput;
                        break;
                case ALNUM:
                        if (!nextchar)
                                return(0);
-                       if (!isalpha(nextchar) && !isdigit(nextchar) &&
-                         nextchar != '_')
+                       if (!isALNUM(nextchar))
                                return(0);
                        nextchar = *++locinput;
                        break;
                case NALNUM:
                        if (!nextchar && locinput >= regeol)
                                return(0);
-                       if (isalpha(nextchar) || isdigit(nextchar) ||
-                         nextchar == '_')
+                       if (isALNUM(nextchar))
                                return(0);
                        nextchar = *++locinput;
                        break;
                case NBOUND:
                case BOUND:
                        if (locinput == regbol) /* was last char in word? */
-                               ln = 0;
+                               ln = isALNUM(regprev);
                        else 
-                               ln = (isalpha(locinput[-1]) ||
-                                    isdigit(locinput[-1]) ||
-                                    locinput[-1] == '_' );
-                       n = (isalpha(nextchar) || isdigit(nextchar) ||
-                           nextchar == '_' );  /* is next char in word? */
+                               ln = isALNUM(locinput[-1]);
+                       n = isALNUM(nextchar); /* is next char in word? */
                        if ((ln == n) == (OP(scan) == BOUND))
                                return(0);
                        break;
                case SPACE:
                        if (!nextchar && locinput >= regeol)
                                return(0);
-                       if (!isspace(nextchar))
+                       if (!isSPACE(nextchar))
                                return(0);
                        nextchar = *++locinput;
                        break;
                case NSPACE:
                        if (!nextchar)
                                return(0);
-                       if (isspace(nextchar))
+                       if (isSPACE(nextchar))
                                return(0);
                        nextchar = *++locinput;
                        break;
                case DIGIT:
-                       if (!isdigit(nextchar))
+                       if (!isDIGIT(nextchar))
                                return(0);
                        nextchar = *++locinput;
                        break;
                case NDIGIT:
                        if (!nextchar && locinput >= regeol)
                                return(0);
-                       if (isdigit(nextchar))
+                       if (isDIGIT(nextchar))
                                return(0);
                        nextchar = *++locinput;
                        break;
                case REF:
-               case REF+1:
-               case REF+2:
-               case REF+3:
-               case REF+4:
-               case REF+5:
-               case REF+6:
-               case REF+7:
-               case REF+8:
-               case REF+9:
-                       n = OP(scan) - REF;
+                       n = ARG1(scan);  /* which paren pair */
                        s = regmystartp[n];
                        if (!s)
                            return(0);
@@ -596,16 +674,8 @@ char *prog;
                        break;
                case BACK:
                        break;
-               case OPEN+1:
-               case OPEN+2:
-               case OPEN+3:
-               case OPEN+4:
-               case OPEN+5:
-               case OPEN+6:
-               case OPEN+7:
-               case OPEN+8:
-               case OPEN+9:
-                       n = OP(scan) - OPEN;
+               case OPEN:
+                       n = ARG1(scan);  /* which paren pair */
                        reginput = locinput;
 
                        regmystartp[n] = locinput;      /* for REF */
@@ -621,16 +691,8 @@ char *prog;
                        } else
                                return(0);
                        /* NOTREACHED */
-               case CLOSE+1:
-               case CLOSE+2:
-               case CLOSE+3:
-               case CLOSE+4:
-               case CLOSE+5:
-               case CLOSE+6:
-               case CLOSE+7:
-               case CLOSE+8:
-               case CLOSE+9: {
-                               n = OP(scan) - CLOSE;
+               case CLOSE: {
+                               n = ARG1(scan);  /* which paren pair */
                                reginput = locinput;
 
                                regmyendp[n] = locinput;        /* for REF */
@@ -672,19 +734,33 @@ char *prog;
                                }
                        }
                        break;
+               case CURLY:
+                       ln = ARG1(scan);  /* min to match */
+                       n  = ARG2(scan);  /* max to match */
+                       scan = NEXTOPER(scan) + 4;
+                       goto repeat;
                case STAR:
+                       ln = 0;
+                       n = 0;
+                       scan = NEXTOPER(scan);
+                       goto repeat;
                case PLUS:
                        /*
                         * Lookahead to avoid useless match attempts
                         * when we know what character comes next.
                         */
+                       ln = 1;
+                       n = 0;
+                       scan = NEXTOPER(scan);
+                   repeat:
                        if (OP(next) == EXACTLY)
                                nextchar = *(OPERAND(next)+1);
                        else
                                nextchar = -1000;
-                       ln = (OP(scan) == STAR) ? 0 : 1;
                        reginput = locinput;
-                       n = regrepeat(NEXTOPER(scan));
+                       n = regrepeat(scan, n);
+                       if (!multiline && OP(next) == EOL && ln < n)
+                           ln = n;                     /* why back off? */
                        while (n >= ln) {
                                /* If it could work, try it. */
                                if (nextchar == -1000 || *reginput == nextchar)
@@ -726,8 +802,9 @@ char *prog;
  * rather than incrementing count on every character.]
  */
 static int
-regrepeat(p)
+regrepeat(p, max)
 char *p;
+int max;
 {
        register char *scan;
        register char *opnd;
@@ -735,6 +812,8 @@ char *p;
        register char *loceol = regeol;
 
        scan = reginput;
+       if (max && max < loceol - scan)
+           loceol = scan + max;
        opnd = OPERAND(p);
        switch (OP(p)) {
        case ANY:
@@ -747,7 +826,6 @@ char *p;
                        scan++;
                break;
        case ANYOF:
-       case ANYBUT:
                c = UCHARAT(scan);
                while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) {
                        scan++;
@@ -755,28 +833,27 @@ char *p;
                }
                break;
        case ALNUM:
-               while (isalpha(*scan) || isdigit(*scan) || *scan == '_')
+               while (scan < loceol && isALNUM(*scan))
                        scan++;
                break;
        case NALNUM:
-               while (scan < loceol && (!isalpha(*scan) && !isdigit(*scan) &&
-                 *scan != '_'))
+               while (scan < loceol && !isALNUM(*scan))
                        scan++;
                break;
        case SPACE:
-               while (scan < loceol && isspace(*scan))
+               while (scan < loceol && isSPACE(*scan))
                        scan++;
                break;
        case NSPACE:
-               while (scan < loceol && !isspace(*scan))
+               while (scan < loceol && !isSPACE(*scan))
                        scan++;
                break;
        case DIGIT:
-               while (isdigit(*scan))
+               while (scan < loceol && isDIGIT(*scan))
                        scan++;
                break;
        case NDIGIT:
-               while (scan < loceol && !isdigit(*scan))
+               while (scan < loceol && !isDIGIT(*scan))
                        scan++;
                break;
        default:                /* Oh dear.  Called inappropriately. */