* blame Henry for some of the lack of readability.
*/
-/* $Header: regexec.c,v 3.0.1.1 89/11/11 04:52:04 lwall Locked $
+/* $RCSfile: regexec.c,v $$Revision: 4.0.1.2 $$Date: 91/06/07 11:50:33 $
*
* $Log: regexec.c,v $
- * Revision 3.0.1.1 89/11/11 04:52:04 lwall
- * patch2: /\b$foo/ didn't work
+ * Revision 4.0.1.2 91/06/07 11:50:33 lwall
+ * patch4: new copyright notice
+ * patch4: // wouldn't use previous pattern if it started with a null character
*
- * Revision 3.0 89/10/18 15:22:53 lwall
- * 3.0 baseline
+ * Revision 4.0.1.1 91/04/12 09:07:39 lwall
+ * patch1: regexec only allocated space for 9 subexpresssions
+ *
+ * Revision 4.0 91/03/20 01:39:16 lwall
+ * 4.0 baseline.
*
*/
*
**** Alterations to Henry's code are...
****
- **** Copyright (c) 1989, Larry Wall
+ **** Copyright (c) 1991, Larry Wall
****
- **** You may distribute under the terms of the GNU General Public License
- **** as specified in the README file that comes with the perl 3.0 kit.
+ **** You may distribute under the terms of either the GNU General Public
+ **** License or the Artistic License, as specified in the README file.
*
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
int regnarrate = 0;
#endif
+#define isALNUM(c) (isascii(c) && (isalpha(c) || isdigit(c) || c == '_'))
+#define isSPACE(c) (isascii(c) && isspace(c))
+#define isDIGIT(c) (isascii(c) && isdigit(c))
+#define isUPPER(c) (isascii(c) && isupper(c))
+
/*
* regexec and friends
*/
*/
static char *regprecomp;
static char *reginput; /* String-input pointer. */
+static char regprev; /* char before regbol, \n if none */
static char *regbol; /* Beginning of input, for ^ check. */
static char *regeol; /* End of input, for $ check. */
static char **regstartp; /* Pointer to startp array. */
static char *reglastparen; /* Similarly for lastparen. */
static char *regtill;
-static char *regmystartp[10]; /* For remembering backreferences. */
-static char *regmyendp[10];
+static int regmyp_size = 0;
+static char **regmystartp = Null(char**);
+static char **regmyendp = Null(char**);
/*
* Forwards.
register int tmp;
int minlen = 0; /* must match at least this many chars */
int dontbother = 0; /* how many characters not to try at end */
- int beginning = (string == strbeg); /* is ^ valid at stringarg? */
/* Be paranoid... */
if (prog == NULL || string == NULL) {
return(0);
}
+ if (string == strbeg) /* is ^ valid at stringarg? */
+ regprev = '\n';
+ else {
+ regprev = stringarg[-1];
+ if (!multiline && regprev == '\n')
+ regprev = '\0'; /* force ^ to NOT match */
+ }
regprecomp = prog->precomp;
/* Check validity of program. */
if (UCHARAT(prog->program) != MAGIC) {
string = c;
strend = string + i;
for (s = string; s < strend; s++)
- if (isupper(*s))
+ if (isUPPER(*s))
*s = tolower(*s);
}
/* If there is a "must appear" string, look for it. */
s = string;
- if (prog->regmust != Nullstr) {
- if (beginning && screamer) {
+ if (prog->regmust != Nullstr &&
+ (!(prog->reganch & ROPT_ANCH)
+ || (multiline && prog->regback >= 0)) ) {
+ if (stringarg == strbeg && screamer) {
if (screamfirst[prog->regmust->str_rare] >= 0)
s = screaminstr(screamer,prog->regmust);
else
}
/* Mark beginning of line for ^ . */
- if (beginning)
- regbol = string;
- else
- regbol = NULL;
+ regbol = string;
/* Mark end of line for $ (and such) */
regeol = strend;
/* see how far we have to get to not match where we matched before */
regtill = string+minend;
+ /* Allocate our backreference arrays */
+ if ( regmyp_size < prog->nparens + 1 ) {
+ /* Allocate or enlarge the arrays */
+ regmyp_size = prog->nparens + 1;
+ if ( regmyp_size < 10 ) regmyp_size = 10; /* minimum */
+ if ( regmystartp ) {
+ /* reallocate larger */
+ Renew(regmystartp,regmyp_size,char*);
+ Renew(regmyendp, regmyp_size,char*);
+ }
+ else {
+ /* Initial allocation */
+ New(1102,regmystartp,regmyp_size,char*);
+ New(1102,regmyendp, regmyp_size,char*);
+ }
+
+ }
+
/* Simplest case: anchored match need be tried only once. */
/* [unless multiline is set] */
- if (prog->reganch) {
+ if (prog->reganch & ROPT_ANCH) {
if (regtry(prog, string))
goto got_it;
else if (multiline) {
/* for multiline we only have to try after newlines */
if (s > string)
s--;
- for (; s < strend; s++) {
- if (*s == '\n') {
- if (++s < strend && regtry(prog, s))
+ while (s < strend) {
+ if (*s++ == '\n') {
+ if (s < strend && regtry(prog, s))
goto got_it;
}
}
/* Messy cases: unanchored match. */
if (prog->regstart) {
- /* We know what string it must start with. */
- if (prog->regstart->str_pok == 3) {
+ if (prog->reganch & ROPT_SKIP) { /* we have /x+whatever/ */
+ /* it must be a one character string */
+ i = prog->regstart->str_ptr[0];
+ while (s < strend) {
+ if (*s == i) {
+ if (regtry(prog, s))
+ goto got_it;
+ s++;
+ while (s < strend && *s == i)
+ s++;
+ }
+ s++;
+ }
+ }
+ else if (prog->regstart->str_pok == 3) {
+ /* We know what string it must start with. */
#ifndef lint
while ((s = fbminstr((unsigned char*)s,
(unsigned char*)strend, prog->regstart)) != NULL)
goto phooey;
}
if (c = prog->regstclass) {
+ int doevery = (prog->reganch & ROPT_SKIP) == 0;
+
if (minlen)
dontbother = minlen - 1;
strend -= dontbother; /* don't bother with what can't match */
+ tmp = 1;
/* We know what class it must start with. */
switch (OP(c)) {
- case ANYOF: case ANYBUT:
+ case ANYOF:
c = OPERAND(c);
while (s < strend) {
- i = *s;
- if (!(c[i >> 3] & (1 << (i&7))))
- if (regtry(prog, s))
+ i = UCHARAT(s);
+ if (!(c[i >> 3] & (1 << (i&7)))) {
+ if (tmp && regtry(prog, s))
goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
s++;
}
break;
dontbother++,strend--;
if (s != string) {
i = s[-1];
- tmp = (isalpha(i) || isdigit(i) || i == '_');
+ tmp = isALNUM(i);
}
else
- tmp = 0; /* assume not alphanumeric */
+ tmp = isALNUM(regprev); /* assume not alphanumeric */
while (s < strend) {
i = *s;
- if (tmp != (isalpha(i) || isdigit(i) || i == '_')) {
+ if (tmp != isALNUM(i)) {
tmp = !tmp;
if (regtry(prog, s))
goto got_it;
dontbother++,strend--;
if (s != string) {
i = s[-1];
- tmp = (isalpha(i) || isdigit(i) || i == '_');
+ tmp = isALNUM(i);
}
else
- tmp = 0; /* assume not alphanumeric */
+ tmp = isALNUM(regprev); /* assume not alphanumeric */
while (s < strend) {
i = *s;
- if (tmp != (isalpha(i) || isdigit(i) || i == '_'))
+ if (tmp != isALNUM(i))
tmp = !tmp;
else if (regtry(prog, s))
goto got_it;
case ALNUM:
while (s < strend) {
i = *s;
- if (isalpha(i) || isdigit(i) || i == '_')
- if (regtry(prog, s))
+ if (isALNUM(i)) {
+ if (tmp && regtry(prog, s))
goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
s++;
}
break;
case NALNUM:
while (s < strend) {
i = *s;
- if (!isalpha(i) && !isdigit(i) && i != '_')
- if (regtry(prog, s))
+ if (!isALNUM(i)) {
+ if (tmp && regtry(prog, s))
goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
s++;
}
break;
case SPACE:
while (s < strend) {
- if (isspace(*s))
- if (regtry(prog, s))
+ if (isSPACE(*s)) {
+ if (tmp && regtry(prog, s))
goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
s++;
}
break;
case NSPACE:
while (s < strend) {
- if (!isspace(*s))
- if (regtry(prog, s))
+ if (!isSPACE(*s)) {
+ if (tmp && regtry(prog, s))
goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
s++;
}
break;
case DIGIT:
while (s < strend) {
- if (isdigit(*s))
- if (regtry(prog, s))
+ if (isDIGIT(*s)) {
+ if (tmp && regtry(prog, s))
goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
s++;
}
break;
case NDIGIT:
while (s < strend) {
- if (!isdigit(*s))
- if (regtry(prog, s))
+ if (!isDIGIT(*s)) {
+ if (tmp && regtry(prog, s))
goto got_it;
+ else
+ tmp = doevery;
+ }
+ else
+ tmp = 1;
s++;
}
break;
}
}
else {
- dontbother = minend;
+ if (minlen)
+ dontbother = minlen - 1;
strend -= dontbother;
/* We don't know much -- general case. */
do {
s = nsavestr(strbeg,i); /* so $digit will work later */
if (prog->subbase)
Safefree(prog->subbase);
- prog->subbase = s;
+ prog->subbeg = prog->subbase = s;
+ prog->subend = s+i;
}
else
s = prog->subbase;
sp = prog->startp;
ep = prog->endp;
if (prog->nparens) {
- for (i = NSUBEXP; i > 0; i--) {
+ for (i = prog->nparens; i >= 0; i--) {
*sp++ = NULL;
*ep++ = NULL;
}
switch (OP(scan)) {
case BOL:
- if (locinput == regbol ||
+ if (locinput == regbol ? regprev == '\n' :
((nextchar || locinput < regeol) &&
locinput[-1] == '\n') )
{
- regtill--;
+ /* regtill = regbol; */
break;
}
return(0);
case EOL:
if ((nextchar || locinput < regeol) && nextchar != '\n')
return(0);
- regtill--;
+ if (!multiline && regeol - locinput > 1)
+ return 0;
+ /* regtill = regbol; */
break;
case ANY:
if ((nextchar == '\0' && locinput >= regeol) ||
/* Inline the first character, for speed. */
if (*s != nextchar)
return(0);
- if (locinput + ln > regeol)
+ if (regeol - locinput < ln)
return 0;
if (ln > 1 && bcmp(s, locinput, ln) != 0)
return(0);
nextchar = *locinput;
break;
case ANYOF:
- case ANYBUT:
s = OPERAND(scan);
if (nextchar < 0)
nextchar = UCHARAT(locinput);
if (s[nextchar >> 3] & (1 << (nextchar&7)))
return(0);
- nextchar = *++locinput;
- if (!nextchar && locinput > regeol)
+ if (!nextchar && locinput >= regeol)
return 0;
+ nextchar = *++locinput;
break;
case ALNUM:
if (!nextchar)
return(0);
- if (!isalpha(nextchar) && !isdigit(nextchar) &&
- nextchar != '_')
+ if (!isALNUM(nextchar))
return(0);
nextchar = *++locinput;
break;
case NALNUM:
if (!nextchar && locinput >= regeol)
return(0);
- if (isalpha(nextchar) || isdigit(nextchar) ||
- nextchar == '_')
+ if (isALNUM(nextchar))
return(0);
nextchar = *++locinput;
break;
case NBOUND:
case BOUND:
if (locinput == regbol) /* was last char in word? */
- ln = 0;
+ ln = isALNUM(regprev);
else
- ln = (isalpha(locinput[-1]) ||
- isdigit(locinput[-1]) ||
- locinput[-1] == '_' );
- n = (isalpha(nextchar) || isdigit(nextchar) ||
- nextchar == '_' ); /* is next char in word? */
+ ln = isALNUM(locinput[-1]);
+ n = isALNUM(nextchar); /* is next char in word? */
if ((ln == n) == (OP(scan) == BOUND))
return(0);
break;
case SPACE:
if (!nextchar && locinput >= regeol)
return(0);
- if (!isspace(nextchar))
+ if (!isSPACE(nextchar))
return(0);
nextchar = *++locinput;
break;
case NSPACE:
if (!nextchar)
return(0);
- if (isspace(nextchar))
+ if (isSPACE(nextchar))
return(0);
nextchar = *++locinput;
break;
case DIGIT:
- if (!isdigit(nextchar))
+ if (!isDIGIT(nextchar))
return(0);
nextchar = *++locinput;
break;
case NDIGIT:
if (!nextchar && locinput >= regeol)
return(0);
- if (isdigit(nextchar))
+ if (isDIGIT(nextchar))
return(0);
nextchar = *++locinput;
break;
case REF:
- case REF+1:
- case REF+2:
- case REF+3:
- case REF+4:
- case REF+5:
- case REF+6:
- case REF+7:
- case REF+8:
- case REF+9:
- n = OP(scan) - REF;
+ n = ARG1(scan); /* which paren pair */
s = regmystartp[n];
if (!s)
return(0);
break;
case BACK:
break;
- case OPEN+1:
- case OPEN+2:
- case OPEN+3:
- case OPEN+4:
- case OPEN+5:
- case OPEN+6:
- case OPEN+7:
- case OPEN+8:
- case OPEN+9:
- n = OP(scan) - OPEN;
+ case OPEN:
+ n = ARG1(scan); /* which paren pair */
reginput = locinput;
regmystartp[n] = locinput; /* for REF */
} else
return(0);
/* NOTREACHED */
- case CLOSE+1:
- case CLOSE+2:
- case CLOSE+3:
- case CLOSE+4:
- case CLOSE+5:
- case CLOSE+6:
- case CLOSE+7:
- case CLOSE+8:
- case CLOSE+9: {
- n = OP(scan) - CLOSE;
+ case CLOSE: {
+ n = ARG1(scan); /* which paren pair */
reginput = locinput;
regmyendp[n] = locinput; /* for REF */
}
}
break;
+ case CURLY:
+ ln = ARG1(scan); /* min to match */
+ n = ARG2(scan); /* max to match */
+ scan = NEXTOPER(scan) + 4;
+ goto repeat;
case STAR:
+ ln = 0;
+ n = 0;
+ scan = NEXTOPER(scan);
+ goto repeat;
case PLUS:
/*
* Lookahead to avoid useless match attempts
* when we know what character comes next.
*/
+ ln = 1;
+ n = 0;
+ scan = NEXTOPER(scan);
+ repeat:
if (OP(next) == EXACTLY)
nextchar = *(OPERAND(next)+1);
else
nextchar = -1000;
- ln = (OP(scan) == STAR) ? 0 : 1;
reginput = locinput;
- n = regrepeat(NEXTOPER(scan));
+ n = regrepeat(scan, n);
+ if (!multiline && OP(next) == EOL && ln < n)
+ ln = n; /* why back off? */
while (n >= ln) {
/* If it could work, try it. */
if (nextchar == -1000 || *reginput == nextchar)
* rather than incrementing count on every character.]
*/
static int
-regrepeat(p)
+regrepeat(p, max)
char *p;
+int max;
{
register char *scan;
register char *opnd;
register char *loceol = regeol;
scan = reginput;
+ if (max && max < loceol - scan)
+ loceol = scan + max;
opnd = OPERAND(p);
switch (OP(p)) {
case ANY:
scan++;
break;
case ANYOF:
- case ANYBUT:
c = UCHARAT(scan);
while (scan < loceol && !(opnd[c >> 3] & (1 << (c & 7)))) {
scan++;
}
break;
case ALNUM:
- while (isalpha(*scan) || isdigit(*scan) || *scan == '_')
+ while (scan < loceol && isALNUM(*scan))
scan++;
break;
case NALNUM:
- while (scan < loceol && (!isalpha(*scan) && !isdigit(*scan) &&
- *scan != '_'))
+ while (scan < loceol && !isALNUM(*scan))
scan++;
break;
case SPACE:
- while (scan < loceol && isspace(*scan))
+ while (scan < loceol && isSPACE(*scan))
scan++;
break;
case NSPACE:
- while (scan < loceol && !isspace(*scan))
+ while (scan < loceol && !isSPACE(*scan))
scan++;
break;
case DIGIT:
- while (isdigit(*scan))
+ while (scan < loceol && isDIGIT(*scan))
scan++;
break;
case NDIGIT:
- while (scan < loceol && !isdigit(*scan))
+ while (scan < loceol && !isDIGIT(*scan))
scan++;
break;
default: /* Oh dear. Called inappropriately. */