1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
5 /* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
10 /* $Header: regcomp.c,v 3.0.1.7 90/10/20 02:18:32 lwall Locked $
13 * Revision 3.0.1.7 90/10/20 02:18:32 lwall
14 * patch37: /foo.*bar$/ wrongly optimized to do tail matching on "foo"
16 * Revision 3.0.1.6 90/10/16 10:17:33 lwall
17 * patch29: patterns with multiple short literal strings sometimes failed
19 * Revision 3.0.1.5 90/08/13 22:23:29 lwall
20 * patch28: /x{m}/ didn't work right
22 * Revision 3.0.1.4 90/08/09 05:05:33 lwall
23 * patch19: sped up /x+y/ patterns greatly by not retrying on every x
24 * patch19: inhibited backoff on patterns anchored to the end like /\s+$/
25 * patch19: sped up {m,n} on simple items
26 * patch19: optimized /.*whatever/ to /^.*whatever/
27 * patch19: fixed character classes to allow backslashing hyphen
29 * Revision 3.0.1.3 90/03/12 16:59:22 lwall
30 * patch13: pattern matches can now use \0 to mean \000
32 * Revision 3.0.1.2 90/02/28 18:08:35 lwall
33 * patch9: /[\200-\377]/ didn't work on machines with signed chars
35 * Revision 3.0.1.1 89/11/11 04:51:04 lwall
36 * patch2: /[\000]/ didn't work
38 * Revision 3.0 89/10/18 15:22:29 lwall
44 * regcomp and regexec -- regsub and regerror are not used in perl
46 * Copyright (c) 1986 by University of Toronto.
47 * Written by Henry Spencer. Not derived from licensed software.
49 * Permission is granted to anyone to use this software for any
50 * purpose on any computer system, and to redistribute it freely,
51 * subject to the following restrictions:
53 * 1. The author is not responsible for the consequences of use of
54 * this software, no matter how awful, even if they arise
57 * 2. The origin of this software must not be misrepresented, either
58 * by explicit claim or by omission.
60 * 3. Altered versions must be plainly marked as such, and must not
61 * be misrepresented as being the original software.
64 **** Alterations to Henry's code are...
66 **** Copyright (c) 1989, Larry Wall
68 **** You may distribute under the terms of the GNU General Public License
69 **** as specified in the README file that comes with the perl 3.0 kit.
71 * Beware that some of this code is subtly aware of the way operator
72 * precedence is structured in regular expressions. Serious changes in
73 * regular-expression syntax might require a total rethink.
84 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
85 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
86 ((*s) == '{' && regcurly(s)))
87 #define META "^$.[()|?+*\\"
90 * Flags to be passed up and down.
92 #define HASWIDTH 01 /* Known never to match null string. */
93 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
94 #define SPSTART 04 /* Starts with * or +. */
95 #define WORST 0 /* Worst case. */
98 * Global work variables for regcomp().
100 static char *regprecomp; /* uncompiled string. */
101 static char *regparse; /* Input-scan pointer. */
102 static char *regxend; /* End of input for compile */
103 static int regnpar; /* () count. */
104 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
105 static long regsize; /* Code size. */
107 static int regsawbracket; /* Did we do {d,d} trick? */
110 * Forward declarations for regcomp()'s friends.
112 STATIC int regcurly();
114 STATIC char *regbranch();
115 STATIC char *regpiece();
116 STATIC char *regatom();
117 STATIC char *regclass();
118 STATIC char *regnode();
120 STATIC void reginsert();
121 STATIC void regtail();
122 STATIC void regoptail();
125 - regcomp - compile a regular expression into internal code
127 * We can't allocate space until we know how big the compiled form will be,
128 * but we can't compile it (and thus know how big it is) until we've got a
129 * place to put the code. So we cheat: we compile it twice, once with code
130 * generation turned off and size counting turned on, and once "for real".
131 * This also means that we don't allocate space until we are sure that the
132 * thing really will compile successfully, and we never have to move the
133 * code and thus invalidate pointers into it. (Note that it has to be in
134 * one piece because free() must be able to free it all.) [NB: not true in perl]
136 * Beware that the optimization-preparation code in here knows about some
137 * of the structure of the compiled regexp. [I'll say.]
140 regcomp(exp,xend,fold)
147 register STR *longish;
150 register char *first;
154 extern char *safemalloc();
155 extern char *savestr();
159 fatal("NULL regexp argument");
161 /* First pass: determine size, legality. */
165 regprecomp = nsavestr(exp,xend-exp);
171 if (reg(0, &flags) == NULL) {
172 Safefree(regprecomp);
176 /* Small enough for pointer-storage convention? */
177 if (regsize >= 32767L) /* Probably could be 65535L. */
178 FAIL("regexp too big");
180 /* Allocate space. */
181 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
183 FAIL("regexp out of space");
185 /* Second pass: emit code. */
187 bcopy(regprecomp,exp,xend-exp);
188 r->precomp = regprecomp;
192 regcode = r->program;
194 if (reg(0, &flags) == NULL)
197 /* Dig out information for optimizations. */
198 r->regstart = Nullstr; /* Worst-case defaults. */
200 r->regmust = Nullstr;
202 r->regstclass = Nullch;
203 scan = r->program+1; /* First BRANCH. */
204 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
205 scan = NEXTOPER(scan);
208 while ((OP(first) > OPEN && OP(first) < CLOSE) ||
209 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
210 (OP(first) == PLUS) ||
211 (OP(first) == CURLY && ARG1(first) > 0) ) {
212 if (OP(first) == CURLY)
214 else if (OP(first) == PLUS)
216 first = NEXTOPER(first);
219 /* Starting-point info. */
220 if (OP(first) == EXACTLY) {
222 str_make(OPERAND(first)+1,*OPERAND(first));
223 if (r->regstart->str_cur > !(sawstudy|fold))
224 fbmcompile(r->regstart,fold);
226 else if ((exp = index(simple,OP(first))) && exp > simple)
227 r->regstclass = first;
228 else if (OP(first) == BOUND || OP(first) == NBOUND)
229 r->regstclass = first;
230 else if (OP(first) == BOL ||
231 (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) )
232 r->reganch = 1; /* kinda turn .* into ^.* */
233 r->reganch |= sawplus;
237 fprintf(stderr,"first %d next %d offset %d\n",
238 OP(first), OP(NEXTOPER(first)), first - scan);
241 * If there's something expensive in the r.e., find the
242 * longest literal string that must appear and make it the
243 * regmust. Resolve ties in favor of later strings, since
244 * the regstart check works with the beginning of the r.e.
245 * and avoiding duplication strengthens checking. Not a
246 * strong reason, but sufficient in the absence of others.
247 * [Now we resolve ties in favor of the earlier string if
248 * it happens that curback has been invalidated, since the
249 * earlier string may buy us something the later one won't.]
251 longish = str_make("",0);
252 longest = str_make("",0);
256 while (OP(scan) != END) {
257 if (OP(scan) == BRANCH) {
258 if (OP(regnext(scan)) == BRANCH) {
260 while (OP(scan) == BRANCH)
261 scan = regnext(scan);
263 else /* single branch is ok */
264 scan = NEXTOPER(scan);
266 if (OP(scan) == EXACTLY) {
268 while (OP(regnext(scan)) >= CLOSE)
269 scan = regnext(scan);
270 if (curback - back == len) {
271 str_ncat(longish, OPERAND(first)+1,
273 len += *OPERAND(first);
274 curback += *OPERAND(first);
275 first = regnext(scan);
277 else if (*OPERAND(first) >= len + (curback >= 0)) {
278 len = *OPERAND(first);
279 str_nset(longish, OPERAND(first)+1,len);
282 first = regnext(scan);
285 curback += *OPERAND(first);
287 else if (index(varies,OP(scan))) {
290 if (longish->str_cur > longest->str_cur)
291 str_sset(longest,longish);
292 str_nset(longish,"",0);
294 else if (index(simple,OP(scan))) {
297 if (longish->str_cur > longest->str_cur)
298 str_sset(longest,longish);
299 str_nset(longish,"",0);
301 scan = regnext(scan);
304 /* Prefer earlier on tie, unless we can tail match latter */
306 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur)
307 str_sset(longest,longish);
309 str_nset(longish,"",0);
310 if (longest->str_cur) {
311 r->regmust = longest;
316 > !(sawstudy || fold || OP(first) == EOL) )
317 fbmcompile(r->regmust,fold);
318 r->regmust->str_u.str_useful = 100;
319 if (OP(first) == EOL && longish->str_cur)
320 r->regmust->str_pok |= SP_TAIL;
327 r->do_folding = fold;
328 r->nparens = regnpar - 1;
337 - reg - regular expression, i.e. main body or parenthesized thing
339 * Caller must absorb opening parenthesis.
341 * Combining parenthesis handling with the base level of regular expression
342 * is a trifle forced, but the need to tie the tails of the branches to what
343 * follows makes it hard to avoid.
347 int paren; /* Parenthesized? */
352 register char *ender;
356 *flagp = HASWIDTH; /* Tentatively. */
358 /* Make an OPEN node, if parenthesized. */
360 if (regnpar >= NSUBEXP)
361 FAIL("too many () in regexp");
364 ret = regnode(OPEN+parno);
368 /* Pick up the branches, linking them together. */
369 br = regbranch(&flags);
373 regtail(ret, br); /* OPEN -> first. */
376 if (!(flags&HASWIDTH))
378 *flagp |= flags&SPSTART;
379 while (*regparse == '|') {
381 br = regbranch(&flags);
384 regtail(ret, br); /* BRANCH -> BRANCH. */
385 if (!(flags&HASWIDTH))
387 *flagp |= flags&SPSTART;
390 /* Make a closing node, and hook it on the end. */
391 ender = regnode((paren) ? CLOSE+parno : END);
394 /* Hook the tails of the branches to the closing node. */
395 for (br = ret; br != NULL; br = regnext(br))
396 regoptail(br, ender);
398 /* Check for proper termination. */
399 if (paren && *regparse++ != ')') {
400 FAIL("unmatched () in regexp");
401 } else if (!paren && regparse < regxend) {
402 if (*regparse == ')') {
403 FAIL("unmatched () in regexp");
405 FAIL("junk on end of regexp"); /* "Can't happen". */
413 - regbranch - one alternative of an | operator
415 * Implements the concatenation operator.
422 register char *chain;
423 register char *latest;
426 *flagp = WORST; /* Tentatively. */
428 ret = regnode(BRANCH);
430 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
431 latest = regpiece(&flags);
434 *flagp |= flags&HASWIDTH;
435 if (chain == NULL) /* First piece. */
436 *flagp |= flags&SPSTART;
438 regtail(chain, latest);
441 if (chain == NULL) /* Loop ran zero times. */
442 (void) regnode(NOTHING);
448 - regpiece - something followed by possible [*+?]
450 * Note that the branching code sequences used for ? and the general cases
451 * of * and + are somewhat optimized: they use the same NOTHING node as
452 * both the endmarker for their branch list and the body of the last branch.
453 * It might seem that this node could be dispensed with entirely, but the
454 * endmarker role is not redundant.
464 char *origparse = regparse;
465 int orignpar = regnpar;
470 ret = regatom(&flags);
476 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
477 * then we decrement the first number by one and reset our
478 * parsing back to the beginning of the same atom. If the first number
479 * is down to 0, decrement the second number instead and fake up
480 * a ? after it. Given the way this compiler doesn't keep track
481 * of offsets on the first pass, this is the only way to replicate
482 * a piece of code. Sigh.
484 if (op == '{' && regcurly(regparse)) {
487 while (isdigit(*next) || *next == ',') {
496 if (*next == '}') { /* got one */
500 iter = atoi(regparse);
501 if (flags&SIMPLE) { /* we can do it right after all */
504 reginsert(CURLY, ret);
510 if (tmp && tmp < iter)
511 fatal("Can't do {n,m} with n > m");
512 if (regcode != ®dummy) {
514 *(unsigned short *)(ret+3) = iter;
515 *(unsigned short *)(ret+5) = tmp;
517 ret[3] = iter >> 8; ret[4] = iter & 0377;
518 ret[5] = tmp >> 8; ret[6] = tmp & 0377;
524 regsawbracket++; /* remember we clobbered exp */
527 sprintf(regparse,"%.*d", max-regparse, iter - 1);
529 if (*max == ',' && max[1] != '}') {
530 if (atoi(max+1) <= 0)
531 fatal("Can't do {n,m} with n > m");
533 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
536 if (iter != 1 || *max == ',') {
537 regparse = origparse; /* back up input pointer */
538 regnpar = orignpar; /* don't make more parens */
550 if (max == next) { /* any number more? */
552 op = '*'; /* fake up one with a star */
555 op = '?'; /* fake up optional atom */
557 sprintf(max,"%.*d", next-max, iter - 1);
562 regparse = origparse - 1; /* offset ++ below */
567 fatal("Can't do {n,0}");
570 fatal("Can't do {0}");
579 if (!(flags&HASWIDTH) && op != '?')
580 FAIL("regexp *+ operand could be empty");
581 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
583 if (op == '*' && (flags&SIMPLE))
584 reginsert(STAR, ret);
585 else if (op == '*') {
586 /* Emit x* as (x&|), where & means "self". */
587 reginsert(BRANCH, ret); /* Either x */
588 regoptail(ret, regnode(BACK)); /* and loop */
589 regoptail(ret, ret); /* back */
590 regtail(ret, regnode(BRANCH)); /* or */
591 regtail(ret, regnode(NOTHING)); /* null. */
592 } else if (op == '+' && (flags&SIMPLE))
593 reginsert(PLUS, ret);
594 else if (op == '+') {
595 /* Emit x+ as x(&|), where & means "self". */
596 next = regnode(BRANCH); /* Either */
598 regtail(regnode(BACK), ret); /* loop back */
599 regtail(next, regnode(BRANCH)); /* or */
600 regtail(ret, regnode(NOTHING)); /* null. */
601 } else if (op == '?') {
602 /* Emit x? as (x|) */
603 reginsert(BRANCH, ret); /* Either x */
604 regtail(ret, regnode(BRANCH)); /* or */
605 next = regnode(NOTHING); /* null. */
607 regoptail(ret, next);
611 if (ISMULT2(regparse))
612 FAIL("nested *?+ in regexp");
618 - regatom - the lowest level
620 * Optimization: gobbles an entire sequence of ordinary characters so that
621 * it can turn them into a single node, which is smaller to store and
622 * faster to run. Backslashed characters are exceptions, each becoming a
623 * separate node; the code is simpler that way and it's not worth fixing.
625 * [Yes, it is worth fixing, some scripts can run twice the speed.]
634 *flagp = WORST; /* Tentatively. */
636 switch (*regparse++) {
645 *flagp |= HASWIDTH|SIMPLE;
649 *flagp |= HASWIDTH|SIMPLE;
652 ret = reg(1, &flags);
655 *flagp |= flags&(HASWIDTH|SPSTART);
659 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
664 FAIL("?+* follows nothing in regexp");
669 ret = regnode(ALNUM);
670 *flagp |= HASWIDTH|SIMPLE;
674 ret = regnode(NALNUM);
675 *flagp |= HASWIDTH|SIMPLE;
679 ret = regnode(BOUND);
684 ret = regnode(NBOUND);
689 ret = regnode(SPACE);
690 *flagp |= HASWIDTH|SIMPLE;
694 ret = regnode(NSPACE);
695 *flagp |= HASWIDTH|SIMPLE;
699 ret = regnode(DIGIT);
700 *flagp |= HASWIDTH|SIMPLE;
704 ret = regnode(NDIGIT);
705 *flagp |= HASWIDTH|SIMPLE;
713 case '0': case '1': case '2': case '3': case '4':
714 case '5': case '6': case '7': case '8': case '9':
715 if (isdigit(regparse[1]) || *regparse == '0')
718 ret = regnode(REF + *regparse++ - '0');
723 if (regparse >= regxend)
724 FAIL("trailing \\ in regexp");
738 ret = regnode(EXACTLY);
739 regc(0); /* save spot for len */
740 for (len=0, p=regparse-1;
741 len < 127 && p < regxend;
782 case '0': case '1': case '2': case '3':case '4':
783 case '5': case '6': case '7': case '8':case '9':
784 if (isdigit(p[1]) || *p == '0') {
787 foo = (foo<<3) + *++p - '0';
789 foo = (foo<<3) + *++p - '0';
800 FAIL("trailing \\ in regexp");
811 if (regfold && isupper(ender))
812 ender = tolower(ender);
813 if (ISMULT2(p)) { /* Back off on ?+*. */
827 FAIL("internal disaster in regexp");
831 if (regcode != ®dummy)
847 if (regcode == ®dummy)
851 bits[c >> 3] &= ~(1 << (c & 7));
853 bits[c >> 3] |= (1 << (c & 7));
861 register int lastclass;
862 register int range = 0;
866 ret = regnode(ANYOF);
867 if (*regparse == '^') { /* Complement of range. */
874 for (class = 0; class < 32; class++)
876 if (*regparse == ']' || *regparse == '-')
877 goto skipcond; /* allow 1st char to be ] or - */
878 while (regparse < regxend && *regparse != ']') {
880 class = UCHARAT(regparse++);
882 class = UCHARAT(regparse++);
885 for (class = 'a'; class <= 'z'; class++)
886 regset(bits,def,class);
887 for (class = 'A'; class <= 'Z'; class++)
888 regset(bits,def,class);
889 for (class = '0'; class <= '9'; class++)
890 regset(bits,def,class);
891 regset(bits,def,'_');
895 regset(bits,def,' ');
896 regset(bits,def,'\t');
897 regset(bits,def,'\r');
898 regset(bits,def,'\f');
899 regset(bits,def,'\n');
903 for (class = '0'; class <= '9'; class++)
904 regset(bits,def,class);
922 case '0': case '1': case '2': case '3': case '4':
923 case '5': case '6': case '7': case '8': case '9':
925 if (isdigit(*regparse)) {
927 class += *regparse++ - '0';
929 if (isdigit(*regparse)) {
931 class += *regparse++ - '0';
937 if (lastclass > class)
938 FAIL("invalid [] range in regexp");
943 if (*regparse == '-' && regparse+1 < regxend &&
944 regparse[1] != ']') {
947 continue; /* do it next time */
950 for ( ; lastclass <= class; lastclass++) {
951 regset(bits,def,lastclass);
952 if (regfold && isupper(lastclass))
953 regset(bits,def,tolower(lastclass));
957 if (*regparse != ']')
958 FAIL("unmatched [] in regexp");
964 - regnode - emit a node
966 static char * /* Location. */
974 if (ret == ®dummy) {
985 if (!((long)ret & 1))
991 *ptr++ = '\0'; /* Null "next" pointer. */
999 - regc - emit (if appropriate) a byte of code
1005 if (regcode != ®dummy)
1012 - reginsert - insert an operator in front of already-emitted operand
1014 * Means relocating the operand.
1023 register char *place;
1024 register offset = (op == CURLY ? 4 : 0);
1026 if (regcode == ®dummy) {
1028 regsize += 4 + offset;
1030 regsize += 3 + offset;
1037 regcode += 4 + offset;
1039 regcode += 3 + offset;
1045 place = opnd; /* Op node, where operand used to be. */
1049 while (offset-- > 0)
1054 - regtail - set the next-pointer at the end of a node chain
1061 register char *scan;
1062 register char *temp;
1063 register int offset;
1068 /* Find last node. */
1071 temp = regnext(scan);
1078 offset = val - scan;
1080 *(short*)(scan+1) = offset;
1085 if (OP(scan) == BACK)
1086 offset = scan - val;
1088 offset = val - scan;
1089 *(scan+1) = (offset>>8)&0377;
1090 *(scan+2) = offset&0377;
1095 - regoptail - regtail on operand of first argument; nop if operandless
1102 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1103 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
1105 regtail(NEXTOPER(p), val);
1109 - regcurly - a little FSA that accepts {\d+,?\d*}
1133 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1140 register char op = EXACTLY; /* Arbitrary non-END op. */
1141 register char *next;
1142 extern char *index();
1146 while (op != END) { /* While that wasn't END last time... */
1152 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1156 if (next == NULL) /* Next ptr. */
1157 fprintf(stderr,"(0)");
1159 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1164 if (op == EXACTLY) {
1165 /* Literal string, where present. */
1167 while (*s != '\0') {
1173 (void)putchar('\n');
1176 /* Header fields of interest. */
1178 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1180 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1182 fprintf(stderr,"anchored ");
1184 fprintf(stderr,"plus ");
1185 if (r->regmust != NULL)
1186 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1188 fprintf(stderr,"\n");
1192 - regprop - printable representation of opcode
1200 (void) strcpy(buf, ":");
1255 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1269 (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF);
1281 (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1293 (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1303 FAIL("corrupted regexp opcode");
1306 (void) strcat(buf, p);
1309 #endif /* DEBUGGING */
1315 Safefree(r->precomp);
1317 Safefree(r->subbase);
1319 str_free(r->regmust);
1321 str_free(r->regstart);