Commit | Line | Data |
a687059c |
1 | /* NOTE: this is derived from Henry Spencer's regexp code, and should not |
2 | * confused with the original package (see point 3 below). Thanks, Henry! |
3 | */ |
4 | |
5 | /* Additional note: this code is very heavily munged from Henry's version |
6 | * in places. In some spots I've traded clarity for efficiency, so don't |
7 | * blame Henry for some of the lack of readability. |
8 | */ |
9 | |
79a0689e |
10 | /* $Header: regcomp.c,v 3.0.1.3 90/03/12 16:59:22 lwall Locked $ |
a687059c |
11 | * |
12 | * $Log: regcomp.c,v $ |
79a0689e |
13 | * Revision 3.0.1.3 90/03/12 16:59:22 lwall |
14 | * patch13: pattern matches can now use \0 to mean \000 |
15 | * |
ac58e20f |
16 | * Revision 3.0.1.2 90/02/28 18:08:35 lwall |
17 | * patch9: /[\200-\377]/ didn't work on machines with signed chars |
18 | * |
ae986130 |
19 | * Revision 3.0.1.1 89/11/11 04:51:04 lwall |
20 | * patch2: /[\000]/ didn't work |
21 | * |
a687059c |
22 | * Revision 3.0 89/10/18 15:22:29 lwall |
23 | * 3.0 baseline |
24 | * |
25 | */ |
26 | |
27 | /* |
28 | * regcomp and regexec -- regsub and regerror are not used in perl |
29 | * |
30 | * Copyright (c) 1986 by University of Toronto. |
31 | * Written by Henry Spencer. Not derived from licensed software. |
32 | * |
33 | * Permission is granted to anyone to use this software for any |
34 | * purpose on any computer system, and to redistribute it freely, |
35 | * subject to the following restrictions: |
36 | * |
37 | * 1. The author is not responsible for the consequences of use of |
38 | * this software, no matter how awful, even if they arise |
39 | * from defects in it. |
40 | * |
41 | * 2. The origin of this software must not be misrepresented, either |
42 | * by explicit claim or by omission. |
43 | * |
44 | * 3. Altered versions must be plainly marked as such, and must not |
45 | * be misrepresented as being the original software. |
46 | * |
47 | * |
48 | **** Alterations to Henry's code are... |
49 | **** |
50 | **** Copyright (c) 1989, Larry Wall |
51 | **** |
52 | **** You may distribute under the terms of the GNU General Public License |
53 | **** as specified in the README file that comes with the perl 3.0 kit. |
54 | * |
55 | * Beware that some of this code is subtly aware of the way operator |
56 | * precedence is structured in regular expressions. Serious changes in |
57 | * regular-expression syntax might require a total rethink. |
58 | */ |
59 | #include "EXTERN.h" |
60 | #include "perl.h" |
61 | #include "INTERN.h" |
62 | #include "regcomp.h" |
63 | |
64 | #ifndef STATIC |
65 | #define STATIC static |
66 | #endif |
67 | |
68 | #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') |
69 | #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ |
70 | ((*s) == '{' && regcurly(s))) |
71 | #define META "^$.[()|?+*\\" |
72 | |
73 | /* |
74 | * Flags to be passed up and down. |
75 | */ |
76 | #define HASWIDTH 01 /* Known never to match null string. */ |
77 | #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ |
78 | #define SPSTART 04 /* Starts with * or +. */ |
79 | #define WORST 0 /* Worst case. */ |
80 | |
81 | /* |
82 | * Global work variables for regcomp(). |
83 | */ |
84 | static char *regprecomp; /* uncompiled string. */ |
85 | static char *regparse; /* Input-scan pointer. */ |
86 | static char *regxend; /* End of input for compile */ |
87 | static int regnpar; /* () count. */ |
88 | static char *regcode; /* Code-emit pointer; ®dummy = don't. */ |
89 | static long regsize; /* Code size. */ |
90 | static int regfold; |
91 | static int regsawbracket; /* Did we do {d,d} trick? */ |
92 | |
93 | /* |
94 | * Forward declarations for regcomp()'s friends. |
95 | */ |
96 | STATIC int regcurly(); |
97 | STATIC char *reg(); |
98 | STATIC char *regbranch(); |
99 | STATIC char *regpiece(); |
100 | STATIC char *regatom(); |
101 | STATIC char *regclass(); |
102 | STATIC char *regnode(); |
103 | STATIC void regc(); |
104 | STATIC void reginsert(); |
105 | STATIC void regtail(); |
106 | STATIC void regoptail(); |
107 | |
108 | /* |
109 | - regcomp - compile a regular expression into internal code |
110 | * |
111 | * We can't allocate space until we know how big the compiled form will be, |
112 | * but we can't compile it (and thus know how big it is) until we've got a |
113 | * place to put the code. So we cheat: we compile it twice, once with code |
114 | * generation turned off and size counting turned on, and once "for real". |
115 | * This also means that we don't allocate space until we are sure that the |
116 | * thing really will compile successfully, and we never have to move the |
117 | * code and thus invalidate pointers into it. (Note that it has to be in |
118 | * one piece because free() must be able to free it all.) [NB: not true in perl] |
119 | * |
120 | * Beware that the optimization-preparation code in here knows about some |
121 | * of the structure of the compiled regexp. [I'll say.] |
122 | */ |
123 | regexp * |
124 | regcomp(exp,xend,fold,rare) |
125 | char *exp; |
126 | char *xend; |
127 | int fold; |
128 | int rare; |
129 | { |
130 | register regexp *r; |
131 | register char *scan; |
132 | register STR *longest; |
133 | register int len; |
134 | register char *first; |
135 | int flags; |
136 | int back; |
137 | int curback; |
138 | extern char *safemalloc(); |
139 | extern char *savestr(); |
140 | |
141 | if (exp == NULL) |
142 | fatal("NULL regexp argument"); |
143 | |
144 | /* First pass: determine size, legality. */ |
145 | regfold = fold; |
146 | regparse = exp; |
147 | regxend = xend; |
148 | regprecomp = nsavestr(exp,xend-exp); |
149 | regsawbracket = 0; |
150 | regnpar = 1; |
151 | regsize = 0L; |
152 | regcode = ®dummy; |
153 | regc(MAGIC); |
154 | if (reg(0, &flags) == NULL) { |
155 | Safefree(regprecomp); |
156 | return(NULL); |
157 | } |
158 | |
159 | /* Small enough for pointer-storage convention? */ |
160 | if (regsize >= 32767L) /* Probably could be 65535L. */ |
161 | FAIL("regexp too big"); |
162 | |
163 | /* Allocate space. */ |
164 | Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp); |
165 | if (r == NULL) |
166 | FAIL("regexp out of space"); |
167 | |
168 | /* Second pass: emit code. */ |
169 | if (regsawbracket) |
170 | bcopy(regprecomp,exp,xend-exp); |
171 | r->precomp = regprecomp; |
172 | r->subbase = NULL; |
173 | regparse = exp; |
174 | regnpar = 1; |
175 | regcode = r->program; |
176 | regc(MAGIC); |
177 | if (reg(0, &flags) == NULL) |
178 | return(NULL); |
179 | |
180 | /* Dig out information for optimizations. */ |
181 | r->regstart = Nullstr; /* Worst-case defaults. */ |
182 | r->reganch = 0; |
183 | r->regmust = Nullstr; |
184 | r->regback = -1; |
185 | r->regstclass = Nullch; |
186 | scan = r->program+1; /* First BRANCH. */ |
187 | if (OP(regnext(scan)) == END) {/* Only one top-level choice. */ |
188 | scan = NEXTOPER(scan); |
189 | |
190 | first = scan; |
191 | while ((OP(first) > OPEN && OP(first) < CLOSE) || |
192 | (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || |
193 | (OP(first) == PLUS) ) |
194 | first = NEXTOPER(first); |
195 | |
196 | /* Starting-point info. */ |
197 | if (OP(first) == EXACTLY) { |
198 | r->regstart = |
199 | str_make(OPERAND(first)+1,*OPERAND(first)); |
200 | if (r->regstart->str_cur > !(sawstudy|fold)) |
201 | fbmcompile(r->regstart,fold); |
202 | } |
203 | else if ((exp = index(simple,OP(first))) && exp > simple) |
204 | r->regstclass = first; |
205 | else if (OP(first) == BOUND || OP(first) == NBOUND) |
206 | r->regstclass = first; |
207 | else if (OP(first) == BOL) |
208 | r->reganch++; |
209 | |
210 | #ifdef DEBUGGING |
211 | if (debug & 512) |
212 | fprintf(stderr,"first %d next %d offset %d\n", |
213 | OP(first), OP(NEXTOPER(first)), first - scan); |
214 | #endif |
215 | /* |
216 | * If there's something expensive in the r.e., find the |
217 | * longest literal string that must appear and make it the |
218 | * regmust. Resolve ties in favor of later strings, since |
219 | * the regstart check works with the beginning of the r.e. |
220 | * and avoiding duplication strengthens checking. Not a |
221 | * strong reason, but sufficient in the absence of others. |
222 | * [Now we resolve ties in favor of the earlier string if |
223 | * it happens that curback has been invalidated, since the |
224 | * earlier string may buy us something the later one won't.] |
225 | */ |
226 | longest = str_make("",0); |
227 | len = 0; |
228 | curback = 0; |
229 | back = 0; |
230 | while (scan != NULL) { |
231 | if (OP(scan) == BRANCH) { |
232 | if (OP(regnext(scan)) == BRANCH) { |
233 | curback = -30000; |
234 | while (OP(scan) == BRANCH) |
235 | scan = regnext(scan); |
236 | } |
237 | else /* single branch is ok */ |
238 | scan = NEXTOPER(scan); |
239 | } |
240 | if (OP(scan) == EXACTLY) { |
241 | first = scan; |
242 | while (OP(regnext(scan)) >= CLOSE) |
243 | scan = regnext(scan); |
244 | if (curback - back == len) { |
245 | str_ncat(longest, OPERAND(first)+1, |
246 | *OPERAND(first)); |
247 | len += *OPERAND(first); |
248 | curback += *OPERAND(first); |
249 | first = regnext(scan); |
250 | } |
251 | else if (*OPERAND(first) >= len + (curback >= 0)) { |
252 | len = *OPERAND(first); |
253 | str_nset(longest, OPERAND(first)+1,len); |
254 | back = curback; |
255 | curback += len; |
256 | first = regnext(scan); |
257 | } |
258 | else |
259 | curback += *OPERAND(first); |
260 | } |
261 | else if (index(varies,OP(scan))) |
262 | curback = -30000; |
263 | else if (index(simple,OP(scan))) |
264 | curback++; |
265 | scan = regnext(scan); |
266 | } |
267 | if (len) { |
268 | r->regmust = longest; |
269 | if (back < 0) |
270 | back = -1; |
271 | r->regback = back; |
272 | if (len > !(sawstudy||fold||OP(first)==EOL)) |
273 | fbmcompile(r->regmust,fold); |
274 | r->regmust->str_u.str_useful = 100; |
275 | if (OP(first) == EOL) /* is match anchored to EOL? */ |
276 | r->regmust->str_pok |= SP_TAIL; |
277 | } |
278 | else |
279 | str_free(longest); |
280 | } |
281 | |
282 | r->do_folding = fold; |
283 | r->nparens = regnpar - 1; |
284 | #ifdef DEBUGGING |
285 | if (debug & 512) |
286 | regdump(r); |
287 | #endif |
288 | return(r); |
289 | } |
290 | |
291 | /* |
292 | - reg - regular expression, i.e. main body or parenthesized thing |
293 | * |
294 | * Caller must absorb opening parenthesis. |
295 | * |
296 | * Combining parenthesis handling with the base level of regular expression |
297 | * is a trifle forced, but the need to tie the tails of the branches to what |
298 | * follows makes it hard to avoid. |
299 | */ |
300 | static char * |
301 | reg(paren, flagp) |
302 | int paren; /* Parenthesized? */ |
303 | int *flagp; |
304 | { |
305 | register char *ret; |
306 | register char *br; |
307 | register char *ender; |
308 | register int parno; |
309 | int flags; |
310 | |
311 | *flagp = HASWIDTH; /* Tentatively. */ |
312 | |
313 | /* Make an OPEN node, if parenthesized. */ |
314 | if (paren) { |
315 | if (regnpar >= NSUBEXP) |
316 | FAIL("too many () in regexp"); |
317 | parno = regnpar; |
318 | regnpar++; |
319 | ret = regnode(OPEN+parno); |
320 | } else |
321 | ret = NULL; |
322 | |
323 | /* Pick up the branches, linking them together. */ |
324 | br = regbranch(&flags); |
325 | if (br == NULL) |
326 | return(NULL); |
327 | if (ret != NULL) |
328 | regtail(ret, br); /* OPEN -> first. */ |
329 | else |
330 | ret = br; |
331 | if (!(flags&HASWIDTH)) |
332 | *flagp &= ~HASWIDTH; |
333 | *flagp |= flags&SPSTART; |
334 | while (*regparse == '|') { |
335 | regparse++; |
336 | br = regbranch(&flags); |
337 | if (br == NULL) |
338 | return(NULL); |
339 | regtail(ret, br); /* BRANCH -> BRANCH. */ |
340 | if (!(flags&HASWIDTH)) |
341 | *flagp &= ~HASWIDTH; |
342 | *flagp |= flags&SPSTART; |
343 | } |
344 | |
345 | /* Make a closing node, and hook it on the end. */ |
346 | ender = regnode((paren) ? CLOSE+parno : END); |
347 | regtail(ret, ender); |
348 | |
349 | /* Hook the tails of the branches to the closing node. */ |
350 | for (br = ret; br != NULL; br = regnext(br)) |
351 | regoptail(br, ender); |
352 | |
353 | /* Check for proper termination. */ |
354 | if (paren && *regparse++ != ')') { |
355 | FAIL("unmatched () in regexp"); |
356 | } else if (!paren && regparse < regxend) { |
357 | if (*regparse == ')') { |
358 | FAIL("unmatched () in regexp"); |
359 | } else |
360 | FAIL("junk on end of regexp"); /* "Can't happen". */ |
361 | /* NOTREACHED */ |
362 | } |
363 | |
364 | return(ret); |
365 | } |
366 | |
367 | /* |
368 | - regbranch - one alternative of an | operator |
369 | * |
370 | * Implements the concatenation operator. |
371 | */ |
372 | static char * |
373 | regbranch(flagp) |
374 | int *flagp; |
375 | { |
376 | register char *ret; |
377 | register char *chain; |
378 | register char *latest; |
379 | int flags; |
380 | |
381 | *flagp = WORST; /* Tentatively. */ |
382 | |
383 | ret = regnode(BRANCH); |
384 | chain = NULL; |
385 | while (regparse < regxend && *regparse != '|' && *regparse != ')') { |
386 | latest = regpiece(&flags); |
387 | if (latest == NULL) |
388 | return(NULL); |
389 | *flagp |= flags&HASWIDTH; |
390 | if (chain == NULL) /* First piece. */ |
391 | *flagp |= flags&SPSTART; |
392 | else |
393 | regtail(chain, latest); |
394 | chain = latest; |
395 | } |
396 | if (chain == NULL) /* Loop ran zero times. */ |
397 | (void) regnode(NOTHING); |
398 | |
399 | return(ret); |
400 | } |
401 | |
402 | /* |
403 | - regpiece - something followed by possible [*+?] |
404 | * |
405 | * Note that the branching code sequences used for ? and the general cases |
406 | * of * and + are somewhat optimized: they use the same NOTHING node as |
407 | * both the endmarker for their branch list and the body of the last branch. |
408 | * It might seem that this node could be dispensed with entirely, but the |
409 | * endmarker role is not redundant. |
410 | */ |
411 | static char * |
412 | regpiece(flagp) |
413 | int *flagp; |
414 | { |
415 | register char *ret; |
416 | register char op; |
417 | register char *next; |
418 | int flags; |
419 | char *origparse = regparse; |
420 | int orignpar = regnpar; |
421 | char *max; |
422 | int iter; |
423 | char ch; |
424 | |
425 | ret = regatom(&flags); |
426 | if (ret == NULL) |
427 | return(NULL); |
428 | |
429 | op = *regparse; |
430 | |
431 | /* Here's a total kludge: if after the atom there's a {\d+,?\d*} |
432 | * then we decrement the first number by one and reset our |
433 | * parsing back to the beginning of the same atom. If the first number |
434 | * is down to 0, decrement the second number instead and fake up |
435 | * a ? after it. Given the way this compiler doesn't keep track |
436 | * of offsets on the first pass, this is the only way to replicate |
437 | * a piece of code. Sigh. |
438 | */ |
439 | if (op == '{' && regcurly(regparse)) { |
440 | next = regparse + 1; |
441 | max = Nullch; |
442 | while (isdigit(*next) || *next == ',') { |
443 | if (*next == ',') { |
444 | if (max) |
445 | break; |
446 | else |
447 | max = next; |
448 | } |
449 | next++; |
450 | } |
451 | if (*next == '}') { /* got one */ |
452 | regsawbracket++; /* remember we clobbered exp */ |
453 | if (!max) |
454 | max = next; |
455 | regparse++; |
456 | iter = atoi(regparse); |
457 | if (iter > 0) { |
458 | ch = *max; |
459 | sprintf(regparse,"%.*d", max-regparse, iter - 1); |
460 | *max = ch; |
461 | if (*max == ',' && atoi(max+1) > 0) { |
462 | ch = *next; |
463 | sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1); |
464 | *next = ch; |
465 | } |
466 | if (iter != 1 || (*max == ',' || atoi(max+1))) { |
467 | regparse = origparse; /* back up input pointer */ |
468 | regnpar = orignpar; /* don't make more parens */ |
469 | } |
470 | else { |
471 | regparse = next; |
472 | goto nest_check; |
473 | } |
474 | *flagp = flags; |
475 | return ret; |
476 | } |
477 | if (*max == ',') { |
478 | max++; |
479 | iter = atoi(max); |
480 | if (max == next) { /* any number more? */ |
481 | regparse = next; |
482 | op = '*'; /* fake up one with a star */ |
483 | } |
484 | else if (iter > 0) { |
485 | op = '?'; /* fake up optional atom */ |
486 | ch = *next; |
487 | sprintf(max,"%.*d", next-max, iter - 1); |
488 | *next = ch; |
489 | if (iter == 1) |
490 | regparse = next; |
491 | else { |
492 | regparse = origparse - 1; /* offset ++ below */ |
493 | regnpar = orignpar; |
494 | } |
495 | } |
496 | else |
497 | fatal("Can't do {n,0}"); |
498 | } |
499 | else |
500 | fatal("Can't do {0}"); |
501 | } |
502 | } |
503 | |
504 | if (!ISMULT1(op)) { |
505 | *flagp = flags; |
506 | return(ret); |
507 | } |
508 | |
509 | if (!(flags&HASWIDTH) && op != '?') |
510 | FAIL("regexp *+ operand could be empty"); |
511 | *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); |
512 | |
513 | if (op == '*' && (flags&SIMPLE)) |
514 | reginsert(STAR, ret); |
515 | else if (op == '*') { |
516 | /* Emit x* as (x&|), where & means "self". */ |
517 | reginsert(BRANCH, ret); /* Either x */ |
518 | regoptail(ret, regnode(BACK)); /* and loop */ |
519 | regoptail(ret, ret); /* back */ |
520 | regtail(ret, regnode(BRANCH)); /* or */ |
521 | regtail(ret, regnode(NOTHING)); /* null. */ |
522 | } else if (op == '+' && (flags&SIMPLE)) |
523 | reginsert(PLUS, ret); |
524 | else if (op == '+') { |
525 | /* Emit x+ as x(&|), where & means "self". */ |
526 | next = regnode(BRANCH); /* Either */ |
527 | regtail(ret, next); |
528 | regtail(regnode(BACK), ret); /* loop back */ |
529 | regtail(next, regnode(BRANCH)); /* or */ |
530 | regtail(ret, regnode(NOTHING)); /* null. */ |
531 | } else if (op == '?') { |
532 | /* Emit x? as (x|) */ |
533 | reginsert(BRANCH, ret); /* Either x */ |
534 | regtail(ret, regnode(BRANCH)); /* or */ |
535 | next = regnode(NOTHING); /* null. */ |
536 | regtail(ret, next); |
537 | regoptail(ret, next); |
538 | } |
539 | nest_check: |
540 | regparse++; |
541 | if (ISMULT2(regparse)) |
542 | FAIL("nested *?+ in regexp"); |
543 | |
544 | return(ret); |
545 | } |
546 | |
547 | /* |
548 | - regatom - the lowest level |
549 | * |
550 | * Optimization: gobbles an entire sequence of ordinary characters so that |
551 | * it can turn them into a single node, which is smaller to store and |
552 | * faster to run. Backslashed characters are exceptions, each becoming a |
553 | * separate node; the code is simpler that way and it's not worth fixing. |
554 | * |
555 | * [Yes, it is worth fixing, some scripts can run twice the speed.] |
556 | */ |
557 | static char * |
558 | regatom(flagp) |
559 | int *flagp; |
560 | { |
561 | register char *ret; |
562 | int flags; |
563 | |
564 | *flagp = WORST; /* Tentatively. */ |
565 | |
566 | switch (*regparse++) { |
567 | case '^': |
568 | ret = regnode(BOL); |
569 | break; |
570 | case '$': |
571 | ret = regnode(EOL); |
572 | break; |
573 | case '.': |
574 | ret = regnode(ANY); |
575 | *flagp |= HASWIDTH|SIMPLE; |
576 | break; |
577 | case '[': |
578 | ret = regclass(); |
579 | *flagp |= HASWIDTH|SIMPLE; |
580 | break; |
581 | case '(': |
582 | ret = reg(1, &flags); |
583 | if (ret == NULL) |
584 | return(NULL); |
585 | *flagp |= flags&(HASWIDTH|SPSTART); |
586 | break; |
587 | case '|': |
588 | case ')': |
589 | FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */ |
590 | break; |
591 | case '?': |
592 | case '+': |
593 | case '*': |
594 | FAIL("?+* follows nothing in regexp"); |
595 | break; |
596 | case '\\': |
597 | switch (*regparse) { |
598 | case 'w': |
599 | ret = regnode(ALNUM); |
600 | *flagp |= HASWIDTH|SIMPLE; |
601 | regparse++; |
602 | break; |
603 | case 'W': |
604 | ret = regnode(NALNUM); |
605 | *flagp |= HASWIDTH|SIMPLE; |
606 | regparse++; |
607 | break; |
608 | case 'b': |
609 | ret = regnode(BOUND); |
610 | *flagp |= SIMPLE; |
611 | regparse++; |
612 | break; |
613 | case 'B': |
614 | ret = regnode(NBOUND); |
615 | *flagp |= SIMPLE; |
616 | regparse++; |
617 | break; |
618 | case 's': |
619 | ret = regnode(SPACE); |
620 | *flagp |= HASWIDTH|SIMPLE; |
621 | regparse++; |
622 | break; |
623 | case 'S': |
624 | ret = regnode(NSPACE); |
625 | *flagp |= HASWIDTH|SIMPLE; |
626 | regparse++; |
627 | break; |
628 | case 'd': |
629 | ret = regnode(DIGIT); |
630 | *flagp |= HASWIDTH|SIMPLE; |
631 | regparse++; |
632 | break; |
633 | case 'D': |
634 | ret = regnode(NDIGIT); |
635 | *flagp |= HASWIDTH|SIMPLE; |
636 | regparse++; |
637 | break; |
638 | case 'n': |
639 | case 'r': |
640 | case 't': |
641 | case 'f': |
642 | goto defchar; |
643 | case '0': case '1': case '2': case '3': case '4': |
644 | case '5': case '6': case '7': case '8': case '9': |
79a0689e |
645 | if (isdigit(regparse[1]) || *regparse == '0') |
a687059c |
646 | goto defchar; |
647 | else { |
648 | ret = regnode(REF + *regparse++ - '0'); |
649 | *flagp |= SIMPLE; |
650 | } |
651 | break; |
652 | case '\0': |
653 | if (regparse >= regxend) |
654 | FAIL("trailing \\ in regexp"); |
655 | /* FALL THROUGH */ |
656 | default: |
657 | goto defchar; |
658 | } |
659 | break; |
660 | default: { |
661 | register int len; |
662 | register char ender; |
663 | register char *p; |
664 | char *oldp; |
665 | int foo; |
666 | |
667 | defchar: |
668 | ret = regnode(EXACTLY); |
669 | regc(0); /* save spot for len */ |
670 | for (len=0, p=regparse-1; |
671 | len < 127 && p < regxend; |
672 | len++) |
673 | { |
674 | oldp = p; |
675 | switch (*p) { |
676 | case '^': |
677 | case '$': |
678 | case '.': |
679 | case '[': |
680 | case '(': |
681 | case ')': |
682 | case '|': |
683 | goto loopdone; |
684 | case '\\': |
685 | switch (*++p) { |
686 | case 'w': |
687 | case 'W': |
688 | case 'b': |
689 | case 'B': |
690 | case 's': |
691 | case 'S': |
692 | case 'd': |
693 | case 'D': |
694 | --p; |
695 | goto loopdone; |
696 | case 'n': |
697 | ender = '\n'; |
698 | p++; |
699 | break; |
700 | case 'r': |
701 | ender = '\r'; |
702 | p++; |
703 | break; |
704 | case 't': |
705 | ender = '\t'; |
706 | p++; |
707 | break; |
708 | case 'f': |
709 | ender = '\f'; |
710 | p++; |
711 | break; |
712 | case '0': case '1': case '2': case '3':case '4': |
713 | case '5': case '6': case '7': case '8':case '9': |
79a0689e |
714 | if (isdigit(p[1]) || *p == '0') { |
715 | foo = *p - '0'; |
716 | if (isdigit(p[1])) |
717 | foo = (foo<<3) + *++p - '0'; |
a687059c |
718 | if (isdigit(p[1])) |
719 | foo = (foo<<3) + *++p - '0'; |
720 | ender = foo; |
721 | p++; |
722 | } |
723 | else { |
724 | --p; |
725 | goto loopdone; |
726 | } |
727 | break; |
728 | case '\0': |
729 | if (p >= regxend) |
730 | FAIL("trailing \\ in regexp"); |
731 | /* FALL THROUGH */ |
732 | default: |
733 | ender = *p++; |
734 | break; |
735 | } |
736 | break; |
737 | default: |
738 | ender = *p++; |
739 | break; |
740 | } |
741 | if (regfold && isupper(ender)) |
742 | ender = tolower(ender); |
743 | if (ISMULT2(p)) { /* Back off on ?+*. */ |
744 | if (len) |
745 | p = oldp; |
746 | else { |
747 | len++; |
748 | regc(ender); |
749 | } |
750 | break; |
751 | } |
752 | regc(ender); |
753 | } |
754 | loopdone: |
755 | regparse = p; |
756 | if (len <= 0) |
757 | FAIL("internal disaster in regexp"); |
758 | *flagp |= HASWIDTH; |
759 | if (len == 1) |
760 | *flagp |= SIMPLE; |
761 | if (regcode != ®dummy) |
762 | *OPERAND(ret) = len; |
763 | regc('\0'); |
764 | } |
765 | break; |
766 | } |
767 | |
768 | return(ret); |
769 | } |
770 | |
771 | static void |
772 | regset(bits,def,c) |
773 | char *bits; |
774 | int def; |
775 | register int c; |
776 | { |
777 | if (regcode == ®dummy) |
778 | return; |
ac58e20f |
779 | c &= 255; |
a687059c |
780 | if (def) |
781 | bits[c >> 3] &= ~(1 << (c & 7)); |
782 | else |
783 | bits[c >> 3] |= (1 << (c & 7)); |
784 | } |
785 | |
786 | static char * |
787 | regclass() |
788 | { |
789 | register char *bits; |
790 | register int class; |
791 | register int lastclass; |
792 | register int range = 0; |
793 | register char *ret; |
794 | register int def; |
795 | |
796 | if (*regparse == '^') { /* Complement of range. */ |
797 | ret = regnode(ANYBUT); |
798 | regparse++; |
799 | def = 0; |
800 | } else { |
801 | ret = regnode(ANYOF); |
802 | def = 255; |
803 | } |
804 | bits = regcode; |
805 | for (class = 0; class < 32; class++) |
806 | regc(def); |
807 | if (*regparse == ']' || *regparse == '-') |
808 | regset(bits,def,lastclass = *regparse++); |
809 | while (regparse < regxend && *regparse != ']') { |
810 | class = UCHARAT(regparse++); |
811 | if (class == '\\') { |
812 | class = UCHARAT(regparse++); |
813 | switch (class) { |
814 | case 'w': |
815 | for (class = 'a'; class <= 'z'; class++) |
816 | regset(bits,def,class); |
817 | for (class = 'A'; class <= 'Z'; class++) |
818 | regset(bits,def,class); |
819 | for (class = '0'; class <= '9'; class++) |
820 | regset(bits,def,class); |
821 | regset(bits,def,'_'); |
822 | lastclass = 1234; |
823 | continue; |
824 | case 's': |
825 | regset(bits,def,' '); |
826 | regset(bits,def,'\t'); |
827 | regset(bits,def,'\r'); |
828 | regset(bits,def,'\f'); |
829 | regset(bits,def,'\n'); |
830 | lastclass = 1234; |
831 | continue; |
832 | case 'd': |
833 | for (class = '0'; class <= '9'; class++) |
834 | regset(bits,def,class); |
835 | lastclass = 1234; |
836 | continue; |
837 | case 'n': |
838 | class = '\n'; |
839 | break; |
840 | case 'r': |
841 | class = '\r'; |
842 | break; |
843 | case 't': |
844 | class = '\t'; |
845 | break; |
846 | case 'f': |
847 | class = '\f'; |
848 | break; |
849 | case 'b': |
850 | class = '\b'; |
851 | break; |
852 | case '0': case '1': case '2': case '3': case '4': |
853 | case '5': case '6': case '7': case '8': case '9': |
854 | class -= '0'; |
855 | if (isdigit(*regparse)) { |
856 | class <<= 3; |
857 | class += *regparse++ - '0'; |
858 | } |
859 | if (isdigit(*regparse)) { |
860 | class <<= 3; |
861 | class += *regparse++ - '0'; |
862 | } |
863 | break; |
864 | } |
865 | } |
866 | if (!range && class == '-' && regparse < regxend && |
867 | *regparse != ']') { |
868 | range = 1; |
869 | continue; |
870 | } |
871 | if (range) { |
872 | if (lastclass > class) |
873 | FAIL("invalid [] range in regexp"); |
874 | } |
875 | else |
876 | lastclass = class - 1; |
877 | range = 0; |
878 | for (lastclass++; lastclass <= class; lastclass++) { |
879 | regset(bits,def,lastclass); |
880 | if (regfold && isupper(lastclass)) |
881 | regset(bits,def,tolower(lastclass)); |
882 | } |
883 | lastclass = class; |
884 | } |
885 | if (*regparse != ']') |
886 | FAIL("unmatched [] in regexp"); |
a687059c |
887 | regparse++; |
888 | return ret; |
889 | } |
890 | |
891 | /* |
892 | - regnode - emit a node |
893 | */ |
894 | static char * /* Location. */ |
895 | regnode(op) |
896 | char op; |
897 | { |
898 | register char *ret; |
899 | register char *ptr; |
900 | |
901 | ret = regcode; |
902 | if (ret == ®dummy) { |
903 | #ifdef REGALIGN |
904 | if (!(regsize & 1)) |
905 | regsize++; |
906 | #endif |
907 | regsize += 3; |
908 | return(ret); |
909 | } |
910 | |
911 | #ifdef REGALIGN |
912 | #ifndef lint |
913 | if (!((long)ret & 1)) |
914 | *ret++ = 127; |
915 | #endif |
916 | #endif |
917 | ptr = ret; |
918 | *ptr++ = op; |
919 | *ptr++ = '\0'; /* Null "next" pointer. */ |
920 | *ptr++ = '\0'; |
921 | regcode = ptr; |
922 | |
923 | return(ret); |
924 | } |
925 | |
926 | /* |
927 | - regc - emit (if appropriate) a byte of code |
928 | */ |
929 | static void |
930 | regc(b) |
931 | char b; |
932 | { |
933 | if (regcode != ®dummy) |
934 | *regcode++ = b; |
935 | else |
936 | regsize++; |
937 | } |
938 | |
939 | /* |
940 | - reginsert - insert an operator in front of already-emitted operand |
941 | * |
942 | * Means relocating the operand. |
943 | */ |
944 | static void |
945 | reginsert(op, opnd) |
946 | char op; |
947 | char *opnd; |
948 | { |
949 | register char *src; |
950 | register char *dst; |
951 | register char *place; |
952 | |
953 | if (regcode == ®dummy) { |
954 | #ifdef REGALIGN |
955 | regsize += 4; |
956 | #else |
957 | regsize += 3; |
958 | #endif |
959 | return; |
960 | } |
961 | |
962 | src = regcode; |
963 | #ifdef REGALIGN |
964 | regcode += 4; |
965 | #else |
966 | regcode += 3; |
967 | #endif |
968 | dst = regcode; |
969 | while (src > opnd) |
970 | *--dst = *--src; |
971 | |
972 | place = opnd; /* Op node, where operand used to be. */ |
973 | *place++ = op; |
974 | *place++ = '\0'; |
975 | *place++ = '\0'; |
976 | } |
977 | |
978 | /* |
979 | - regtail - set the next-pointer at the end of a node chain |
980 | */ |
981 | static void |
982 | regtail(p, val) |
983 | char *p; |
984 | char *val; |
985 | { |
986 | register char *scan; |
987 | register char *temp; |
988 | register int offset; |
989 | |
990 | if (p == ®dummy) |
991 | return; |
992 | |
993 | /* Find last node. */ |
994 | scan = p; |
995 | for (;;) { |
996 | temp = regnext(scan); |
997 | if (temp == NULL) |
998 | break; |
999 | scan = temp; |
1000 | } |
1001 | |
1002 | #ifdef REGALIGN |
1003 | offset = val - scan; |
1004 | #ifndef lint |
1005 | *(short*)(scan+1) = offset; |
1006 | #else |
1007 | offset = offset; |
1008 | #endif |
1009 | #else |
1010 | if (OP(scan) == BACK) |
1011 | offset = scan - val; |
1012 | else |
1013 | offset = val - scan; |
1014 | *(scan+1) = (offset>>8)&0377; |
1015 | *(scan+2) = offset&0377; |
1016 | #endif |
1017 | } |
1018 | |
1019 | /* |
1020 | - regoptail - regtail on operand of first argument; nop if operandless |
1021 | */ |
1022 | static void |
1023 | regoptail(p, val) |
1024 | char *p; |
1025 | char *val; |
1026 | { |
1027 | /* "Operandless" and "op != BRANCH" are synonymous in practice. */ |
1028 | if (p == NULL || p == ®dummy || OP(p) != BRANCH) |
1029 | return; |
1030 | regtail(NEXTOPER(p), val); |
1031 | } |
1032 | |
1033 | /* |
1034 | - regcurly - a little FSA that accepts {\d+,?\d*} |
1035 | */ |
1036 | STATIC int |
1037 | regcurly(s) |
1038 | register char *s; |
1039 | { |
1040 | if (*s++ != '{') |
1041 | return FALSE; |
1042 | if (!isdigit(*s)) |
1043 | return FALSE; |
1044 | while (isdigit(*s)) |
1045 | s++; |
1046 | if (*s == ',') |
1047 | s++; |
1048 | while (isdigit(*s)) |
1049 | s++; |
1050 | if (*s != '}') |
1051 | return FALSE; |
1052 | return TRUE; |
1053 | } |
1054 | |
1055 | #ifdef DEBUGGING |
1056 | |
1057 | /* |
1058 | - regdump - dump a regexp onto stderr in vaguely comprehensible form |
1059 | */ |
1060 | void |
1061 | regdump(r) |
1062 | regexp *r; |
1063 | { |
1064 | register char *s; |
1065 | register char op = EXACTLY; /* Arbitrary non-END op. */ |
1066 | register char *next; |
1067 | extern char *index(); |
1068 | |
1069 | |
1070 | s = r->program + 1; |
1071 | while (op != END) { /* While that wasn't END last time... */ |
1072 | #ifdef REGALIGN |
1073 | if (!((long)s & 1)) |
1074 | s++; |
1075 | #endif |
1076 | op = OP(s); |
1077 | fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */ |
1078 | next = regnext(s); |
1079 | if (next == NULL) /* Next ptr. */ |
1080 | fprintf(stderr,"(0)"); |
1081 | else |
1082 | fprintf(stderr,"(%d)", (s-r->program)+(next-s)); |
1083 | s += 3; |
1084 | if (op == ANYOF || op == ANYBUT) { |
1085 | s += 32; |
1086 | } |
1087 | if (op == EXACTLY) { |
1088 | /* Literal string, where present. */ |
1089 | s++; |
1090 | while (*s != '\0') { |
1091 | (void)putchar(*s); |
1092 | s++; |
1093 | } |
1094 | s++; |
1095 | } |
1096 | (void)putchar('\n'); |
1097 | } |
1098 | |
1099 | /* Header fields of interest. */ |
1100 | if (r->regstart) |
1101 | fprintf(stderr,"start `%s' ", r->regstart->str_ptr); |
1102 | if (r->regstclass) |
1103 | fprintf(stderr,"stclass `%s' ", regprop(r->regstclass)); |
1104 | if (r->reganch) |
1105 | fprintf(stderr,"anchored "); |
1106 | if (r->regmust != NULL) |
1107 | fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr, |
1108 | r->regback); |
1109 | fprintf(stderr,"\n"); |
1110 | } |
1111 | |
1112 | /* |
1113 | - regprop - printable representation of opcode |
1114 | */ |
1115 | char * |
1116 | regprop(op) |
1117 | char *op; |
1118 | { |
1119 | register char *p; |
1120 | |
1121 | (void) strcpy(buf, ":"); |
1122 | |
1123 | switch (OP(op)) { |
1124 | case BOL: |
1125 | p = "BOL"; |
1126 | break; |
1127 | case EOL: |
1128 | p = "EOL"; |
1129 | break; |
1130 | case ANY: |
1131 | p = "ANY"; |
1132 | break; |
1133 | case ANYOF: |
1134 | p = "ANYOF"; |
1135 | break; |
1136 | case ANYBUT: |
1137 | p = "ANYBUT"; |
1138 | break; |
1139 | case BRANCH: |
1140 | p = "BRANCH"; |
1141 | break; |
1142 | case EXACTLY: |
1143 | p = "EXACTLY"; |
1144 | break; |
1145 | case NOTHING: |
1146 | p = "NOTHING"; |
1147 | break; |
1148 | case BACK: |
1149 | p = "BACK"; |
1150 | break; |
1151 | case END: |
1152 | p = "END"; |
1153 | break; |
1154 | case ALNUM: |
1155 | p = "ALNUM"; |
1156 | break; |
1157 | case NALNUM: |
1158 | p = "NALNUM"; |
1159 | break; |
1160 | case BOUND: |
1161 | p = "BOUND"; |
1162 | break; |
1163 | case NBOUND: |
1164 | p = "NBOUND"; |
1165 | break; |
1166 | case SPACE: |
1167 | p = "SPACE"; |
1168 | break; |
1169 | case NSPACE: |
1170 | p = "NSPACE"; |
1171 | break; |
1172 | case DIGIT: |
1173 | p = "DIGIT"; |
1174 | break; |
1175 | case NDIGIT: |
1176 | p = "NDIGIT"; |
1177 | break; |
1178 | case REF: |
1179 | case REF+1: |
1180 | case REF+2: |
1181 | case REF+3: |
1182 | case REF+4: |
1183 | case REF+5: |
1184 | case REF+6: |
1185 | case REF+7: |
1186 | case REF+8: |
1187 | case REF+9: |
1188 | (void)sprintf(buf+strlen(buf), "REF%d", OP(op)-REF); |
1189 | p = NULL; |
1190 | break; |
1191 | case OPEN+1: |
1192 | case OPEN+2: |
1193 | case OPEN+3: |
1194 | case OPEN+4: |
1195 | case OPEN+5: |
1196 | case OPEN+6: |
1197 | case OPEN+7: |
1198 | case OPEN+8: |
1199 | case OPEN+9: |
1200 | (void)sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); |
1201 | p = NULL; |
1202 | break; |
1203 | case CLOSE+1: |
1204 | case CLOSE+2: |
1205 | case CLOSE+3: |
1206 | case CLOSE+4: |
1207 | case CLOSE+5: |
1208 | case CLOSE+6: |
1209 | case CLOSE+7: |
1210 | case CLOSE+8: |
1211 | case CLOSE+9: |
1212 | (void)sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); |
1213 | p = NULL; |
1214 | break; |
1215 | case STAR: |
1216 | p = "STAR"; |
1217 | break; |
1218 | case PLUS: |
1219 | p = "PLUS"; |
1220 | break; |
1221 | default: |
1222 | FAIL("corrupted regexp opcode"); |
1223 | } |
1224 | if (p != NULL) |
1225 | (void) strcat(buf, p); |
1226 | return(buf); |
1227 | } |
1228 | #endif /* DEBUGGING */ |
1229 | |
1230 | regfree(r) |
1231 | struct regexp *r; |
1232 | { |
1233 | if (r->precomp) |
1234 | Safefree(r->precomp); |
1235 | if (r->subbase) |
1236 | Safefree(r->subbase); |
1237 | if (r->regmust) |
1238 | str_free(r->regmust); |
1239 | if (r->regstart) |
1240 | str_free(r->regstart); |
1241 | Safefree(r); |
1242 | } |