1 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
5 /* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
10 /* $RCSfile: regcomp.c,v $$Revision: 4.0.1.2 $$Date: 91/06/07 11:48:24 $
13 * Revision 4.0.1.2 91/06/07 11:48:24 lwall
14 * patch4: new copyright notice
15 * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
16 * patch4: // wouldn't use previous pattern if it started with a null character
18 * Revision 4.0.1.1 91/04/12 09:04:45 lwall
19 * patch1: random cleanup in cpp namespace
21 * Revision 4.0 91/03/20 01:39:01 lwall
27 * regcomp and regexec -- regsub and regerror are not used in perl
29 * Copyright (c) 1986 by University of Toronto.
30 * Written by Henry Spencer. Not derived from licensed software.
32 * Permission is granted to anyone to use this software for any
33 * purpose on any computer system, and to redistribute it freely,
34 * subject to the following restrictions:
36 * 1. The author is not responsible for the consequences of use of
37 * this software, no matter how awful, even if they arise
40 * 2. The origin of this software must not be misrepresented, either
41 * by explicit claim or by omission.
43 * 3. Altered versions must be plainly marked as such, and must not
44 * be misrepresented as being the original software.
47 **** Alterations to Henry's code are...
49 **** Copyright (c) 1991, Larry Wall
51 **** You may distribute under the terms of either the GNU General Public
52 **** License or the Artistic License, as specified in the README file.
55 * Beware that some of this code is subtly aware of the way operator
56 * precedence is structured in regular expressions. Serious changes in
57 * regular-expression syntax might require a total rethink.
65 # if defined(BUGGY_MSC6)
66 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
67 # pragma optimize("a",off)
68 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
69 # pragma optimize("w",on )
70 # endif /* BUGGY_MSC6 */
77 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
78 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
79 ((*s) == '{' && regcurly(s)))
80 #define META "^$.[()|?+*\\"
83 #undef SPSTART /* dratted cpp namespace... */
86 * Flags to be passed up and down.
88 #define HASWIDTH 01 /* Known never to match null string. */
89 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
90 #define SPSTART 04 /* Starts with * or +. */
91 #define WORST 0 /* Worst case. */
94 * Global work variables for regcomp().
96 static char *regprecomp; /* uncompiled string. */
97 static char *regparse; /* Input-scan pointer. */
98 static char *regxend; /* End of input for compile */
99 static int regnpar; /* () count. */
100 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
101 static long regsize; /* Code size. */
103 static int regsawbracket; /* Did we do {d,d} trick? */
104 static int regsawback; /* Did we see \1, ...? */
107 * Forward declarations for regcomp()'s friends.
109 STATIC int regcurly();
111 STATIC char *regbranch();
112 STATIC char *regpiece();
113 STATIC char *regatom();
114 STATIC char *regclass();
115 STATIC char *regnode();
116 STATIC char *reganode();
118 STATIC void reginsert();
119 STATIC void regtail();
120 STATIC void regoptail();
123 - regcomp - compile a regular expression into internal code
125 * We can't allocate space until we know how big the compiled form will be,
126 * but we can't compile it (and thus know how big it is) until we've got a
127 * place to put the code. So we cheat: we compile it twice, once with code
128 * generation turned off and size counting turned on, and once "for real".
129 * This also means that we don't allocate space until we are sure that the
130 * thing really will compile successfully, and we never have to move the
131 * code and thus invalidate pointers into it. (Note that it has to be in
132 * one piece because free() must be able to free it all.) [NB: not true in perl]
134 * Beware that the optimization-preparation code in here knows about some
135 * of the structure of the compiled regexp. [I'll say.]
138 regcomp(exp,xend,fold)
145 register STR *longish;
148 register char *first;
153 extern char *safemalloc();
154 extern char *savestr();
159 fatal("NULL regexp argument");
161 /* First pass: determine size, legality. */
165 regprecomp = nsavestr(exp,xend-exp);
172 if (reg(0, &flags) == NULL) {
173 Safefree(regprecomp);
178 /* Small enough for pointer-storage convention? */
179 if (regsize >= 32767L) /* Probably could be 65535L. */
180 FAIL("regexp too big");
182 /* Allocate space. */
183 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
185 FAIL("regexp out of space");
187 /* Second pass: emit code. */
189 bcopy(regprecomp,exp,xend-exp);
190 r->prelen = xend-exp;
191 r->precomp = regprecomp;
192 r->subbeg = r->subbase = NULL;
195 regcode = r->program;
197 if (reg(0, &flags) == NULL)
200 /* Dig out information for optimizations. */
201 r->regstart = Nullstr; /* Worst-case defaults. */
203 r->regmust = Nullstr;
205 r->regstclass = Nullch;
206 scan = r->program+1; /* First BRANCH. */
207 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
208 scan = NEXTOPER(scan);
211 while ((OP(first) == OPEN && (sawopen = 1)) ||
212 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
213 (OP(first) == PLUS) ||
214 (OP(first) == CURLY && ARG1(first) > 0) ) {
215 if (OP(first) == PLUS)
218 first += regarglen[OP(first)];
219 first = NEXTOPER(first);
222 /* Starting-point info. */
224 if (OP(first) == EXACTLY) {
226 str_make(OPERAND(first)+1,*OPERAND(first));
227 if (r->regstart->str_cur > !(sawstudy|fold))
228 fbmcompile(r->regstart,fold);
230 else if ((exp = index(simple,OP(first))) && exp > simple)
231 r->regstclass = first;
232 else if (OP(first) == BOUND || OP(first) == NBOUND)
233 r->regstclass = first;
234 else if (OP(first) == BOL ||
235 (OP(first) == STAR && OP(NEXTOPER(first)) == ANY) ) {
236 r->reganch = ROPT_ANCH; /* kinda turn .* into ^.* */
237 first = NEXTOPER(first);
240 if (sawplus && (!sawopen || !regsawback))
241 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
245 fprintf(stderr,"first %d next %d offset %d\n",
246 OP(first), OP(NEXTOPER(first)), first - scan);
249 * If there's something expensive in the r.e., find the
250 * longest literal string that must appear and make it the
251 * regmust. Resolve ties in favor of later strings, since
252 * the regstart check works with the beginning of the r.e.
253 * and avoiding duplication strengthens checking. Not a
254 * strong reason, but sufficient in the absence of others.
255 * [Now we resolve ties in favor of the earlier string if
256 * it happens that curback has been invalidated, since the
257 * earlier string may buy us something the later one won't.]
259 longish = str_make("",0);
260 longest = str_make("",0);
265 while (OP(scan) != END) {
266 if (OP(scan) == BRANCH) {
267 if (OP(regnext(scan)) == BRANCH) {
269 while (OP(scan) == BRANCH)
270 scan = regnext(scan);
272 else /* single branch is ok */
273 scan = NEXTOPER(scan);
275 if (OP(scan) == EXACTLY) {
279 while (OP(t = regnext(scan)) == CLOSE)
281 if (curback - backish == len) {
282 str_ncat(longish, OPERAND(first)+1,
284 len += *OPERAND(first);
285 curback += *OPERAND(first);
286 first = regnext(scan);
288 else if (*OPERAND(first) >= len + (curback >= 0)) {
289 len = *OPERAND(first);
290 str_nset(longish, OPERAND(first)+1,len);
293 first = regnext(scan);
296 curback += *OPERAND(first);
298 else if (index(varies,OP(scan))) {
301 if (longish->str_cur > longest->str_cur) {
302 str_sset(longest,longish);
305 str_nset(longish,"",0);
307 else if (index(simple,OP(scan))) {
310 if (longish->str_cur > longest->str_cur) {
311 str_sset(longest,longish);
314 str_nset(longish,"",0);
316 scan = regnext(scan);
319 /* Prefer earlier on tie, unless we can tail match latter */
321 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
322 str_sset(longest,longish);
326 str_nset(longish,"",0);
331 !fbminstr(r->regstart->str_ptr,
332 r->regstart->str_ptr + r->regstart->str_cur,
337 r->regmust = longest;
340 r->regback = backest;
342 > !(sawstudy || fold || OP(first) == EOL) )
343 fbmcompile(r->regmust,fold);
344 r->regmust->str_u.str_useful = 100;
345 if (OP(first) == EOL && longish->str_cur)
346 r->regmust->str_pok |= SP_TAIL;
355 r->do_folding = fold;
356 r->nparens = regnpar - 1;
357 New(1002, r->startp, regnpar, char*);
358 New(1002, r->endp, regnpar, char*);
367 - reg - regular expression, i.e. main body or parenthesized thing
369 * Caller must absorb opening parenthesis.
371 * Combining parenthesis handling with the base level of regular expression
372 * is a trifle forced, but the need to tie the tails of the branches to what
373 * follows makes it hard to avoid.
377 int paren; /* Parenthesized? */
382 register char *ender;
386 *flagp = HASWIDTH; /* Tentatively. */
388 /* Make an OPEN node, if parenthesized. */
392 ret = reganode(OPEN, parno);
396 /* Pick up the branches, linking them together. */
397 br = regbranch(&flags);
401 regtail(ret, br); /* OPEN -> first. */
404 if (!(flags&HASWIDTH))
406 *flagp |= flags&SPSTART;
407 while (*regparse == '|') {
409 br = regbranch(&flags);
412 regtail(ret, br); /* BRANCH -> BRANCH. */
413 if (!(flags&HASWIDTH))
415 *flagp |= flags&SPSTART;
418 /* Make a closing node, and hook it on the end. */
420 ender = reganode(CLOSE, parno);
422 ender = regnode(END);
425 /* Hook the tails of the branches to the closing node. */
426 for (br = ret; br != NULL; br = regnext(br))
427 regoptail(br, ender);
429 /* Check for proper termination. */
430 if (paren && *regparse++ != ')') {
431 FAIL("unmatched () in regexp");
432 } else if (!paren && regparse < regxend) {
433 if (*regparse == ')') {
434 FAIL("unmatched () in regexp");
436 FAIL("junk on end of regexp"); /* "Can't happen". */
444 - regbranch - one alternative of an | operator
446 * Implements the concatenation operator.
453 register char *chain;
454 register char *latest;
457 *flagp = WORST; /* Tentatively. */
459 ret = regnode(BRANCH);
461 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
462 latest = regpiece(&flags);
465 *flagp |= flags&HASWIDTH;
466 if (chain == NULL) /* First piece. */
467 *flagp |= flags&SPSTART;
469 regtail(chain, latest);
472 if (chain == NULL) /* Loop ran zero times. */
473 (void) regnode(NOTHING);
479 - regpiece - something followed by possible [*+?]
481 * Note that the branching code sequences used for ? and the general cases
482 * of * and + are somewhat optimized: they use the same NOTHING node as
483 * both the endmarker for their branch list and the body of the last branch.
484 * It might seem that this node could be dispensed with entirely, but the
485 * endmarker role is not redundant.
495 char *origparse = regparse;
496 int orignpar = regnpar;
501 ret = regatom(&flags);
507 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
508 * then we decrement the first number by one and reset our
509 * parsing back to the beginning of the same atom. If the first number
510 * is down to 0, decrement the second number instead and fake up
511 * a ? after it. Given the way this compiler doesn't keep track
512 * of offsets on the first pass, this is the only way to replicate
513 * a piece of code. Sigh.
515 if (op == '{' && regcurly(regparse)) {
518 while (isdigit(*next) || *next == ',') {
527 if (*next == '}') { /* got one */
531 iter = atoi(regparse);
532 if (flags&SIMPLE) { /* we can do it right after all */
535 reginsert(CURLY, ret);
537 *flagp = (WORST|HASWIDTH);
543 if (tmp && tmp < iter)
544 fatal("Can't do {n,m} with n > m");
545 if (regcode != ®dummy) {
547 *(unsigned short *)(ret+3) = iter;
548 *(unsigned short *)(ret+5) = tmp;
550 ret[3] = iter >> 8; ret[4] = iter & 0377;
551 ret[5] = tmp >> 8; ret[6] = tmp & 0377;
557 regsawbracket++; /* remember we clobbered exp */
560 sprintf(regparse,"%.*d", max-regparse, iter - 1);
562 if (*max == ',' && max[1] != '}') {
563 if (atoi(max+1) <= 0)
564 fatal("Can't do {n,m} with n > m");
566 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
569 if (iter != 1 || *max == ',') {
570 regparse = origparse; /* back up input pointer */
571 regnpar = orignpar; /* don't make more parens */
583 if (max == next) { /* any number more? */
585 op = '*'; /* fake up one with a star */
588 op = '?'; /* fake up optional atom */
590 sprintf(max,"%.*d", next-max, iter - 1);
595 regparse = origparse - 1; /* offset ++ below */
600 fatal("Can't do {n,0}");
603 fatal("Can't do {0}");
612 if (!(flags&HASWIDTH) && op != '?')
613 FAIL("regexp *+ operand could be empty");
614 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
616 if (op == '*' && (flags&SIMPLE))
617 reginsert(STAR, ret);
618 else if (op == '*') {
619 /* Emit x* as (x&|), where & means "self". */
620 reginsert(BRANCH, ret); /* Either x */
621 regoptail(ret, regnode(BACK)); /* and loop */
622 regoptail(ret, ret); /* back */
623 regtail(ret, regnode(BRANCH)); /* or */
624 regtail(ret, regnode(NOTHING)); /* null. */
625 } else if (op == '+' && (flags&SIMPLE))
626 reginsert(PLUS, ret);
627 else if (op == '+') {
628 /* Emit x+ as x(&|), where & means "self". */
629 next = regnode(BRANCH); /* Either */
631 regtail(regnode(BACK), ret); /* loop back */
632 regtail(next, regnode(BRANCH)); /* or */
633 regtail(ret, regnode(NOTHING)); /* null. */
634 } else if (op == '?') {
635 /* Emit x? as (x|) */
636 reginsert(BRANCH, ret); /* Either x */
637 regtail(ret, regnode(BRANCH)); /* or */
638 next = regnode(NOTHING); /* null. */
640 regoptail(ret, next);
644 if (ISMULT2(regparse))
645 FAIL("nested *?+ in regexp");
651 - regatom - the lowest level
653 * Optimization: gobbles an entire sequence of ordinary characters so that
654 * it can turn them into a single node, which is smaller to store and
655 * faster to run. Backslashed characters are exceptions, each becoming a
656 * separate node; the code is simpler that way and it's not worth fixing.
658 * [Yes, it is worth fixing, some scripts can run twice the speed.]
667 *flagp = WORST; /* Tentatively. */
669 switch (*regparse++) {
678 *flagp |= HASWIDTH|SIMPLE;
682 *flagp |= HASWIDTH|SIMPLE;
685 ret = reg(1, &flags);
688 *flagp |= flags&(HASWIDTH|SPSTART);
692 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
697 FAIL("?+* follows nothing in regexp");
702 ret = regnode(ALNUM);
703 *flagp |= HASWIDTH|SIMPLE;
707 ret = regnode(NALNUM);
708 *flagp |= HASWIDTH|SIMPLE;
712 ret = regnode(BOUND);
717 ret = regnode(NBOUND);
722 ret = regnode(SPACE);
723 *flagp |= HASWIDTH|SIMPLE;
727 ret = regnode(NSPACE);
728 *flagp |= HASWIDTH|SIMPLE;
732 ret = regnode(DIGIT);
733 *flagp |= HASWIDTH|SIMPLE;
737 ret = regnode(NDIGIT);
738 *flagp |= HASWIDTH|SIMPLE;
751 case '1': case '2': case '3': case '4':
752 case '5': case '6': case '7': case '8': case '9':
754 int num = atoi(regparse);
756 if (num > 9 && num >= regnpar)
760 ret = reganode(REF, num);
761 while (isascii(*regparse) && isdigit(*regparse))
768 if (regparse >= regxend)
769 FAIL("trailing \\ in regexp");
783 ret = regnode(EXACTLY);
784 regc(0); /* save spot for len */
785 for (len=0, p=regparse-1;
786 len < 127 && p < regxend;
836 ender = scanhex(++p, 2, &numlen);
843 ender = toupper(ender);
846 case '0': case '1': case '2': case '3':case '4':
847 case '5': case '6': case '7': case '8':case '9':
849 (isdigit(p[1]) && atoi(p) >= regnpar) ) {
850 ender = scanoct(p, 3, &numlen);
860 FAIL("trailing \\ in regexp");
871 if (regfold && isupper(ender))
872 ender = tolower(ender);
873 if (ISMULT2(p)) { /* Back off on ?+*. */
887 FAIL("internal disaster in regexp");
891 if (regcode != ®dummy)
907 if (regcode == ®dummy)
911 bits[c >> 3] &= ~(1 << (c & 7));
913 bits[c >> 3] |= (1 << (c & 7));
921 register int lastclass;
922 register int range = 0;
927 ret = regnode(ANYOF);
928 if (*regparse == '^') { /* Complement of range. */
935 for (class = 0; class < 32; class++)
937 if (*regparse == ']' || *regparse == '-')
938 goto skipcond; /* allow 1st char to be ] or - */
939 while (regparse < regxend && *regparse != ']') {
941 class = UCHARAT(regparse++);
943 class = UCHARAT(regparse++);
946 for (class = 'a'; class <= 'z'; class++)
947 regset(bits,def,class);
948 for (class = 'A'; class <= 'Z'; class++)
949 regset(bits,def,class);
950 for (class = '0'; class <= '9'; class++)
951 regset(bits,def,class);
952 regset(bits,def,'_');
956 regset(bits,def,' ');
957 regset(bits,def,'\t');
958 regset(bits,def,'\r');
959 regset(bits,def,'\f');
960 regset(bits,def,'\n');
964 for (class = '0'; class <= '9'; class++)
965 regset(bits,def,class);
990 class = scanhex(regparse, 2, &numlen);
996 class = toupper(class);
999 case '0': case '1': case '2': case '3': case '4':
1000 case '5': case '6': case '7': case '8': case '9':
1001 class = scanoct(--regparse, 3, &numlen);
1007 if (lastclass > class)
1008 FAIL("invalid [] range in regexp");
1013 if (*regparse == '-' && regparse+1 < regxend &&
1014 regparse[1] != ']') {
1017 continue; /* do it next time */
1020 for ( ; lastclass <= class; lastclass++) {
1021 regset(bits,def,lastclass);
1022 if (regfold && isupper(lastclass))
1023 regset(bits,def,tolower(lastclass));
1027 if (*regparse != ']')
1028 FAIL("unmatched [] in regexp");
1034 - regnode - emit a node
1036 static char * /* Location. */
1044 if (ret == ®dummy) {
1055 if (!((long)ret & 1))
1061 *ptr++ = '\0'; /* Null "next" pointer. */
1069 - reganode - emit a node with an argument
1071 static char * /* Location. */
1080 if (ret == ®dummy) {
1091 if (!((long)ret & 1))
1097 *ptr++ = '\0'; /* Null "next" pointer. */
1100 *(unsigned short *)(ret+3) = arg;
1102 ret[3] = arg >> 8; ret[4] = arg & 0377;
1111 - regc - emit (if appropriate) a byte of code
1117 if (regcode != ®dummy)
1124 - reginsert - insert an operator in front of already-emitted operand
1126 * Means relocating the operand.
1135 register char *place;
1136 register offset = (op == CURLY ? 4 : 0);
1138 if (regcode == ®dummy) {
1140 regsize += 4 + offset;
1142 regsize += 3 + offset;
1149 regcode += 4 + offset;
1151 regcode += 3 + offset;
1157 place = opnd; /* Op node, where operand used to be. */
1161 while (offset-- > 0)
1166 - regtail - set the next-pointer at the end of a node chain
1173 register char *scan;
1174 register char *temp;
1175 register int offset;
1180 /* Find last node. */
1183 temp = regnext(scan);
1190 offset = val - scan;
1192 *(short*)(scan+1) = offset;
1197 if (OP(scan) == BACK)
1198 offset = scan - val;
1200 offset = val - scan;
1201 *(scan+1) = (offset>>8)&0377;
1202 *(scan+2) = offset&0377;
1207 - regoptail - regtail on operand of first argument; nop if operandless
1214 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1215 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
1217 regtail(NEXTOPER(p), val);
1221 - regcurly - a little FSA that accepts {\d+,?\d*}
1245 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1252 register char op = EXACTLY; /* Arbitrary non-END op. */
1253 register char *next;
1257 while (op != END) { /* While that wasn't END last time... */
1263 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1266 if (next == NULL) /* Next ptr. */
1267 fprintf(stderr,"(0)");
1269 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1274 if (op == EXACTLY) {
1275 /* Literal string, where present. */
1277 while (*s != '\0') {
1283 (void)putchar('\n');
1286 /* Header fields of interest. */
1288 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1290 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1291 if (r->reganch & ROPT_ANCH)
1292 fprintf(stderr,"anchored ");
1293 if (r->reganch & ROPT_SKIP)
1294 fprintf(stderr,"plus ");
1295 if (r->regmust != NULL)
1296 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1298 fprintf(stderr,"\n");
1302 - regprop - printable representation of opcode
1310 (void) strcpy(buf, ":");
1365 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1370 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1374 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1378 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1388 FAIL("corrupted regexp opcode");
1391 (void) strcat(buf, p);
1394 #endif /* DEBUGGING */
1400 Safefree(r->precomp);
1401 r->precomp = Nullch;
1404 Safefree(r->subbase);
1405 r->subbase = Nullch;
1408 str_free(r->regmust);
1409 r->regmust = Nullstr;
1412 str_free(r->regstart);
1413 r->regstart = Nullstr;
1415 Safefree(r->startp);