perl 1.0 patch 2: Various portability fixes.
[p5sagit/p5-mst-13.2.git] / search.c
CommitLineData
135863df 1/* $Header: search.c,v 1.0.1.1 88/01/24 03:55:05 root Exp $
8d063cd8 2 *
3 * $Log: search.c,v $
135863df 4 * Revision 1.0.1.1 88/01/24 03:55:05 root
5 * patch 2: made depend on perl.h.
6 *
8d063cd8 7 * Revision 1.0 87/12/18 13:05:59 root
8 * Initial revision
9 *
10 */
11
12/* string search routines */
13
8d063cd8 14#include "EXTERN.h"
15#include "handy.h"
16#include "util.h"
17#include "INTERN.h"
18#include "search.h"
135863df 19#include "EXTERN.h"
20#include "perl.h"
8d063cd8 21
22#define VERBOSE
23#define FLUSH
24#define MEM_SIZE int
25
26#ifndef BITSPERBYTE
27#define BITSPERBYTE 8
28#endif
29
30#define BMAPSIZ (127 / BITSPERBYTE + 1)
31
32#define CHAR 0 /* a normal character */
33#define ANY 1 /* . matches anything except newline */
34#define CCL 2 /* [..] character class */
35#define NCCL 3 /* [^..]negated character class */
36#define BEG 4 /* ^ beginning of a line */
37#define END 5 /* $ end of a line */
38#define LPAR 6 /* ( begin sub-match */
39#define RPAR 7 /* ) end sub-match */
40#define REF 8 /* \N backreference to the Nth submatch */
41#define WORD 9 /* \w matches alphanumeric character */
42#define NWORD 10 /* \W matches non-alphanumeric character */
43#define WBOUND 11 /* \b matches word boundary */
44#define NWBOUND 12 /* \B matches non-boundary */
45#define FINIS 13 /* the end of the pattern */
46
47#define CODEMASK 15
48
49/* Quantifiers: */
50
51#define MINZERO 16 /* minimum is 0, not 1 */
52#define MAXINF 32 /* maximum is infinity, not 1 */
53
54#define ASCSIZ 0200
55typedef char TRANSTABLE[ASCSIZ];
56
57static TRANSTABLE trans = {
580000,0001,0002,0003,0004,0005,0006,0007,
590010,0011,0012,0013,0014,0015,0016,0017,
600020,0021,0022,0023,0024,0025,0026,0027,
610030,0031,0032,0033,0034,0035,0036,0037,
620040,0041,0042,0043,0044,0045,0046,0047,
630050,0051,0052,0053,0054,0055,0056,0057,
640060,0061,0062,0063,0064,0065,0066,0067,
650070,0071,0072,0073,0074,0075,0076,0077,
660100,0101,0102,0103,0104,0105,0106,0107,
670110,0111,0112,0113,0114,0115,0116,0117,
680120,0121,0122,0123,0124,0125,0126,0127,
690130,0131,0132,0133,0134,0135,0136,0137,
700140,0141,0142,0143,0144,0145,0146,0147,
710150,0151,0152,0153,0154,0155,0156,0157,
720160,0161,0162,0163,0164,0165,0166,0167,
730170,0171,0172,0173,0174,0175,0176,0177,
74};
75static bool folding = FALSE;
76
77static int err;
78#define NOERR 0
79#define BEGFAIL 1
80#define FATAL 2
81
82static char *FirstCharacter;
83static char *matchend;
84static char *matchtill;
85
86void
87search_init()
88{
89#ifdef UNDEF
90 register int i;
91
92 for (i = 0; i < ASCSIZ; i++)
93 trans[i] = i;
94#else
95 ;
96#endif
97}
98
99void
100init_compex(compex)
101register COMPEX *compex;
102{
103 /* the following must start off zeroed */
104
105 compex->precomp = Nullch;
106 compex->complen = 0;
107 compex->subbase = Nullch;
108}
109
110#ifdef NOTUSED
111void
112free_compex(compex)
113register COMPEX *compex;
114{
115 if (compex->complen) {
116 safefree(compex->compbuf);
117 compex->complen = 0;
118 }
119 if (compex->subbase) {
120 safefree(compex->subbase);
121 compex->subbase = Nullch;
122 }
123}
124#endif
125
126static char *gbr_str = Nullch;
127static int gbr_siz = 0;
128
129char *
130getparen(compex,n)
131register COMPEX *compex;
132int n;
133{
134 int length = compex->subend[n] - compex->subbeg[n];
135
136 if (!n &&
137 (!compex->numsubs || n > compex->numsubs || !compex->subend[n] || length<0))
138 return "";
139 growstr(&gbr_str, &gbr_siz, length+1);
140 safecpy(gbr_str, compex->subbeg[n], length+1);
141 return gbr_str;
142}
143
144void
145case_fold(which)
146int which;
147{
148 register int i;
149
150 if (which != folding) {
151 if (which) {
152 for (i = 'A'; i <= 'Z'; i++)
153 trans[i] = tolower(i);
154 }
155 else {
156 for (i = 'A'; i <= 'Z'; i++)
157 trans[i] = i;
158 }
159 folding = which;
160 }
161}
162
163/* Compile the regular expression into internal form */
164
165char *
166compile(compex, sp, regex, fold)
167register COMPEX *compex;
168register char *sp;
169int regex;
170int fold;
171{
172 register int c;
173 register char *cp;
174 char *lastcp;
175 char paren[MAXSUB],
176 *parenp;
177 char **alt = compex->alternatives;
178 char *retmes = "Badly formed search string";
179
180 case_fold(compex->do_folding = fold);
181 if (compex->precomp)
182 safefree(compex->precomp);
183 compex->precomp = savestr(sp);
184 if (!compex->complen) {
185 compex->compbuf = safemalloc(84);
186 compex->complen = 80;
187 }
188 cp = compex->compbuf; /* point at compiled buffer */
189 *alt++ = cp; /* first alternative starts here */
190 parenp = paren; /* first paren goes here */
191 if (*sp == 0) { /* nothing to compile? */
192#ifdef NOTDEF
193 if (*cp == 0) /* nothing there yet? */
194 return "Null search string";
195#endif
196 if (*cp)
197 return Nullch; /* just keep old expression */
198 }
199 compex->numsubs = 0; /* no parens yet */
200 lastcp = 0;
201 for (;;) {
202 if (cp - compex->compbuf >= compex->complen) {
203 char *ocompbuf = compex->compbuf;
204
205 grow_comp(compex);
206 if (ocompbuf != compex->compbuf) { /* adjust pointers? */
207 char **tmpalt;
208
209 cp = compex->compbuf + (cp - ocompbuf);
210 if (lastcp)
211 lastcp = compex->compbuf + (lastcp - ocompbuf);
212 for (tmpalt = compex->alternatives; tmpalt < alt; tmpalt++)
213 if (*tmpalt)
214 *tmpalt = compex->compbuf + (*tmpalt - ocompbuf);
215 }
216 }
217 c = *sp++; /* get next char of pattern */
218 if (c == 0) { /* end of pattern? */
219 if (parenp != paren) { /* balanced parentheses? */
220#ifdef VERBOSE
221 retmes = "Missing right parenthesis";
222#endif
223 goto badcomp;
224 }
225 *cp++ = FINIS; /* append a stopper */
226 *alt++ = 0; /* terminate alternative list */
227 /*
228 compex->complen = cp - compex->compbuf + 1;
229 compex->compbuf = saferealloc(compex->compbuf,compex->complen+4); */
230 return Nullch; /* return success */
231 }
232 if (c != '*' && c != '?' && c != '+')
233 lastcp = cp;
234 if (!regex) { /* just a normal search string? */
235 *cp++ = CHAR; /* everything is a normal char */
236 *cp++ = trans[c];
237 }
238 else /* it is a regular expression */
239 switch (c) {
240
241 default:
242 normal_char:
243 *cp++ = CHAR;
244 *cp++ = trans[c];
245 continue;
246
247 case '.':
248 *cp++ = ANY;
249 continue;
250
251 case '[': { /* character class */
252 register int i;
253
254 if (cp - compex->compbuf >= compex->complen - BMAPSIZ) {
255 char *ocompbuf = compex->compbuf;
256
257 grow_comp(compex); /* reserve bitmap */
258 if (ocompbuf != compex->compbuf) {/* adjust pointers? */
259 char **tmpalt;
260
261 cp = compex->compbuf + (cp - ocompbuf);
262 if (lastcp)
263 lastcp = compex->compbuf + (lastcp - ocompbuf);
264 for (tmpalt = compex->alternatives; tmpalt < alt;
265 tmpalt++)
266 if (*tmpalt)
267 *tmpalt =
268 compex->compbuf + (*tmpalt - ocompbuf);
269 }
270 }
271 for (i = BMAPSIZ; i; --i)
272 cp[i] = 0;
273
274 if ((c = *sp++) == '^') {
275 c = *sp++;
276 *cp++ = NCCL; /* negated */
277 }
278 else
279 *cp++ = CCL; /* normal */
280
281 i = 0; /* remember oldchar */
282 do {
283 if (c == '\0') {
284#ifdef VERBOSE
285 retmes = "Missing ]";
286#endif
287 goto badcomp;
288 }
289 if (c == '\\' && *sp) {
290 switch (*sp) {
291 default:
292 c = *sp++;
293 break;
294 case '0': case '1': case '2': case '3':
295 case '4': case '5': case '6': case '7':
296 c = *sp++ - '0';
297 if (index("01234567",*sp)) {
298 c <<= 3;
299 c += *sp++ - '0';
300 }
301 if (index("01234567",*sp)) {
302 c <<= 3;
303 c += *sp++ - '0';
304 }
305 break;
306 case 'b':
307 c = '\b';
308 sp++;
309 break;
310 case 'n':
311 c = '\n';
312 sp++;
313 break;
314 case 'r':
315 c = '\r';
316 sp++;
317 break;
318 case 'f':
319 c = '\f';
320 sp++;
321 break;
322 case 't':
323 c = '\t';
324 sp++;
325 break;
326 }
327 }
328 if (*sp == '-' && *(++sp))
329 i = *sp++;
330 else
331 i = c;
332 while (c <= i) {
333 cp[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
334 if (fold && isalpha(c))
335 cp[(c ^ 32) / BITSPERBYTE] |=
336 1 << ((c ^ 32) % BITSPERBYTE);
337 /* set the other bit too */
338 c++;
339 }
340 } while ((c = *sp++) != ']');
341 if (cp[-1] == NCCL)
342 cp[0] |= 1;
343 cp += BMAPSIZ;
344 continue;
345 }
346
347 case '^':
348 if (cp != compex->compbuf && cp[-1] != FINIS)
349 goto normal_char;
350 *cp++ = BEG;
351 continue;
352
353 case '$':
354 if (isdigit(*sp)) {
355 *cp++ = REF;
356 *cp++ = *sp - '0';
357 break;
358 }
359 if (*sp && *sp != '|')
360 goto normal_char;
361 *cp++ = END;
362 continue;
363
364 case '*': case '?': case '+':
365 if (lastcp == 0 ||
366 (*lastcp & (MINZERO|MAXINF)) ||
367 *lastcp == LPAR ||
368 *lastcp == RPAR ||
369 *lastcp == BEG ||
370 *lastcp == END ||
371 *lastcp == WBOUND ||
372 *lastcp == NWBOUND )
373 goto normal_char;
374 if (c != '+')
375 *lastcp |= MINZERO;
376 if (c != '?')
377 *lastcp |= MAXINF;
378 continue;
379
380 case '(':
381 if (compex->numsubs >= MAXSUB) {
382#ifdef VERBOSE
383 retmes = "Too many parens";
384#endif
385 goto badcomp;
386 }
387 *parenp++ = ++compex->numsubs;
388 *cp++ = LPAR;
389 *cp++ = compex->numsubs;
390 break;
391 case ')':
392 if (parenp <= paren) {
393#ifdef VERBOSE
394 retmes = "Unmatched right paren";
395#endif
396 goto badcomp;
397 }
398 *cp++ = RPAR;
399 *cp++ = *--parenp;
400 break;
401 case '|':
402 if (parenp>paren) {
403#ifdef VERBOSE
404 retmes = "No | in subpattern"; /* Sigh! */
405#endif
406 goto badcomp;
407 }
408 *cp++ = FINIS;
409 if (alt - compex->alternatives >= MAXALT) {
410#ifdef VERBOSE
411 retmes = "Too many alternatives";
412#endif
413 goto badcomp;
414 }
415 *alt++ = cp;
416 break;
417 case '\\': /* backslashed thingie */
418 switch (c = *sp++) {
419 case '0': case '1': case '2': case '3': case '4':
420 case '5': case '6': case '7': case '8': case '9':
421 *cp++ = REF;
422 *cp++ = c - '0';
423 break;
424 case 'w':
425 *cp++ = WORD;
426 break;
427 case 'W':
428 *cp++ = NWORD;
429 break;
430 case 'b':
431 *cp++ = WBOUND;
432 break;
433 case 'B':
434 *cp++ = NWBOUND;
435 break;
436 default:
437 *cp++ = CHAR;
438 if (c == '\0')
439 goto badcomp;
440 switch (c) {
441 case 'n':
442 c = '\n';
443 break;
444 case 'r':
445 c = '\r';
446 break;
447 case 'f':
448 c = '\f';
449 break;
450 case 't':
451 c = '\t';
452 break;
453 }
454 *cp++ = c;
455 break;
456 }
457 break;
458 }
459 }
460badcomp:
461 compex->compbuf[0] = 0;
462 compex->numsubs = 0;
463 return retmes;
464}
465
466void
467grow_comp(compex)
468register COMPEX *compex;
469{
470 compex->complen += 80;
471 compex->compbuf = saferealloc(compex->compbuf, (MEM_SIZE)compex->complen + 4);
472}
473
474char *
475execute(compex, addr, beginning, minend)
476register COMPEX *compex;
477char *addr;
478bool beginning;
479int minend;
480{
481 register char *p1 = addr;
482 register char *trt = trans;
483 register int c;
484 register int scr;
485 register int c2;
486
487 if (addr == Nullch)
488 return Nullch;
489 if (compex->numsubs) { /* any submatches? */
490 for (c = 0; c <= compex->numsubs; c++)
491 compex->subbeg[c] = compex->subend[c] = Nullch;
492 }
493 case_fold(compex->do_folding); /* make sure table is correct */
494 if (beginning)
495 FirstCharacter = p1; /* for ^ tests */
496 else {
497 if (multiline || compex->alternatives[1] || compex->compbuf[0] != BEG)
498 FirstCharacter = Nullch;
499 else
500 return Nullch; /* can't match */
501 }
502 matchend = Nullch;
503 matchtill = addr + minend;
504 err = 0;
505 if (compex->compbuf[0] == CHAR && !compex->alternatives[1]) {
506 if (compex->do_folding) {
507 c = compex->compbuf[1]; /* fast check for first character */
508 do {
509 if (trt[*p1] == c && try(compex, p1, compex->compbuf))
510 goto got_it;
511 } while (*p1++ && !err);
512 }
513 else {
514 c = compex->compbuf[1]; /* faster check for first character */
515 if (compex->compbuf[2] == CHAR)
516 c2 = compex->compbuf[3];
517 else
518 c2 = 0;
519 do {
520 false_alarm:
521 while (scr = *p1++, scr && scr != c) ;
522 if (!scr)
523 break;
524 if (c2 && *p1 != c2) /* and maybe even second character */
525 goto false_alarm;
526 if (try(compex, p1, compex->compbuf+2)) {
527 p1--;
528 goto got_it;
529 }
530 } while (!err);
531 }
532 return Nullch;
533 }
534 else { /* normal algorithm */
535 do {
536 register char **alt = compex->alternatives;
537 while (*alt) {
538 if (try(compex, p1, *alt++))
539 goto got_it;
540 }
541 } while (*p1++ && err < FATAL);
542 return Nullch;
543 }
544
545got_it:
546 if (compex->numsubs) { /* any parens? */
547 trt = savestr(addr); /* in case addr is not static */
548 if (compex->subbase)
549 safefree(compex->subbase); /* (may be freeing addr!) */
550 compex->subbase = trt;
551 scr = compex->subbase - addr;
552 p1 += scr;
553 matchend += scr;
554 for (c = 0; c <= compex->numsubs; c++) {
555 if (compex->subend[c]) {
556 compex->subbeg[c] += scr;
557 compex->subend[c] += scr;
558 }
559 }
560 }
561 compex->subend[0] = matchend;
562 compex->subbeg[0] = p1;
563 return p1;
564}
565
566bool
567try(compex, sp, cp)
568COMPEX *compex;
569register char *cp;
570register char *sp;
571{
572 register char *basesp;
573 register char *trt = trans;
574 register int i;
575 register int backlen;
576 register int code;
577
578 while (*sp || (*cp & MAXINF) || *cp == BEG || *cp == RPAR ||
579 *cp == WBOUND || *cp == NWBOUND) {
580 switch ((code = *cp++) & CODEMASK) {
581
582 case CHAR:
583 basesp = sp;
584 i = *cp++;
585 if (code & MAXINF)
586 while (*sp && trt[*sp] == i) sp++;
587 else
588 if (*sp && trt[*sp] == i) sp++;
589 backlen = 1;
590 goto backoff;
591
592 backoff:
593 while (sp > basesp) {
594 if (try(compex, sp, cp))
595 goto right;
596 sp -= backlen;
597 }
598 if (code & MINZERO)
599 continue;
600 goto wrong;
601
602 case ANY:
603 basesp = sp;
604 if (code & MAXINF)
605 while (*sp && *sp != '\n') sp++;
606 else
607 if (*sp && *sp != '\n') sp++;
608 backlen = 1;
609 goto backoff;
610
611 case CCL:
612 basesp = sp;
613 if (code & MAXINF)
614 while (*sp && cclass(cp, *sp, 1)) sp++;
615 else
616 if (*sp && cclass(cp, *sp, 1)) sp++;
617 cp += BMAPSIZ;
618 backlen = 1;
619 goto backoff;
620
621 case NCCL:
622 basesp = sp;
623 if (code & MAXINF)
624 while (*sp && cclass(cp, *sp, 0)) sp++;
625 else
626 if (*sp && cclass(cp, *sp, 0)) sp++;
627 cp += BMAPSIZ;
628 backlen = 1;
629 goto backoff;
630
631 case END:
632 if (!*sp || *sp == '\n') {
633 matchtill--;
634 continue;
635 }
636 goto wrong;
637
638 case BEG:
639 if (sp == FirstCharacter || (
640 *sp && sp[-1] == '\n') ) {
641 matchtill--;
642 continue;
643 }
644 if (!multiline) /* no point in advancing more */
645 err = BEGFAIL;
646 goto wrong;
647
648 case WORD:
649 basesp = sp;
650 if (code & MAXINF)
651 while (*sp && isalnum(*sp)) sp++;
652 else
653 if (*sp && isalnum(*sp)) sp++;
654 backlen = 1;
655 goto backoff;
656
657 case NWORD:
658 basesp = sp;
659 if (code & MAXINF)
660 while (*sp && !isalnum(*sp)) sp++;
661 else
662 if (*sp && !isalnum(*sp)) sp++;
663 backlen = 1;
664 goto backoff;
665
666 case WBOUND:
667 if ((sp == FirstCharacter || !isalnum(sp[-1])) !=
668 (!*sp || !isalnum(*sp)) )
669 continue;
670 goto wrong;
671
672 case NWBOUND:
673 if ((sp == FirstCharacter || !isalnum(sp[-1])) ==
674 (!*sp || !isalnum(*sp)))
675 continue;
676 goto wrong;
677
678 case FINIS:
679 goto right;
680
681 case LPAR:
682 compex->subbeg[*cp++] = sp;
683 continue;
684
685 case RPAR:
686 i = *cp++;
687 compex->subend[i] = sp;
688 compex->lastparen = i;
689 continue;
690
691 case REF:
692 if (compex->subend[i = *cp++] == 0) {
693 fputs("Bad subpattern reference\n",stdout) FLUSH;
694 err = FATAL;
695 goto wrong;
696 }
697 basesp = sp;
698 backlen = compex->subend[i] - compex->subbeg[i];
699 if (code & MAXINF)
700 while (*sp && subpat(compex, i, sp)) sp += backlen;
701 else
702 if (*sp && subpat(compex, i, sp)) sp += backlen;
703 goto backoff;
704
705 default:
706 fputs("Botched pattern compilation\n",stdout) FLUSH;
707 err = FATAL;
708 return -1;
709 }
710 }
711 if (*cp == FINIS || *cp == END) {
712right:
713 if (matchend == Nullch || sp > matchend)
714 matchend = sp;
715 return matchend >= matchtill;
716 }
717wrong:
718 matchend = Nullch;
719 return FALSE;
720}
721
722bool
723subpat(compex, i, sp)
724register COMPEX *compex;
725register int i;
726register char *sp;
727{
728 register char *bp;
729
730 bp = compex->subbeg[i];
731 while (*sp && *bp == *sp) {
732 bp++;
733 sp++;
734 if (bp >= compex->subend[i])
735 return TRUE;
736 }
737 return FALSE;
738}
739
740bool
741cclass(set, c, af)
742register char *set;
743register int c;
744{
745 c &= 0177;
746#if BITSPERBYTE == 8
747 if (set[c >> 3] & 1 << (c & 7))
748#else
749 if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
750#endif
751 return af;
752 return !af;
753}