a "replacement" for awk and sed
[p5sagit/p5-mst-13.2.git] / search.c
1 /* $Header: search.c,v 1.0 87/12/18 13:05:59 root Exp $
2  *
3  * $Log:        search.c,v $
4  * Revision 1.0  87/12/18  13:05:59  root
5  * Initial revision
6  * 
7  */
8
9 /* string search routines */
10  
11 #include <stdio.h>
12 #include <ctype.h>
13
14 #include "EXTERN.h"
15 #include "handy.h"
16 #include "util.h"
17 #include "INTERN.h"
18 #include "search.h"
19
20 #define VERBOSE
21 #define FLUSH
22 #define MEM_SIZE int
23
24 #ifndef BITSPERBYTE
25 #define BITSPERBYTE 8
26 #endif
27
28 #define BMAPSIZ (127 / BITSPERBYTE + 1)
29
30 #define CHAR    0               /*      a normal character */
31 #define ANY     1               /* .    matches anything except newline */
32 #define CCL     2               /* [..] character class */
33 #define NCCL    3               /* [^..]negated character class */
34 #define BEG     4               /* ^    beginning of a line */
35 #define END     5               /* $    end of a line */
36 #define LPAR    6               /* (    begin sub-match */
37 #define RPAR    7               /* )    end sub-match */
38 #define REF     8               /* \N   backreference to the Nth submatch */
39 #define WORD    9               /* \w   matches alphanumeric character */
40 #define NWORD   10              /* \W   matches non-alphanumeric character */
41 #define WBOUND  11              /* \b   matches word boundary */
42 #define NWBOUND 12              /* \B   matches non-boundary  */
43 #define FINIS   13              /*      the end of the pattern */
44  
45 #define CODEMASK 15
46
47 /* Quantifiers: */
48
49 #define MINZERO 16              /* minimum is 0, not 1 */
50 #define MAXINF  32              /* maximum is infinity, not 1 */
51  
52 #define ASCSIZ 0200
53 typedef char    TRANSTABLE[ASCSIZ];
54
55 static  TRANSTABLE trans = {
56 0000,0001,0002,0003,0004,0005,0006,0007,
57 0010,0011,0012,0013,0014,0015,0016,0017,
58 0020,0021,0022,0023,0024,0025,0026,0027,
59 0030,0031,0032,0033,0034,0035,0036,0037,
60 0040,0041,0042,0043,0044,0045,0046,0047,
61 0050,0051,0052,0053,0054,0055,0056,0057,
62 0060,0061,0062,0063,0064,0065,0066,0067,
63 0070,0071,0072,0073,0074,0075,0076,0077,
64 0100,0101,0102,0103,0104,0105,0106,0107,
65 0110,0111,0112,0113,0114,0115,0116,0117,
66 0120,0121,0122,0123,0124,0125,0126,0127,
67 0130,0131,0132,0133,0134,0135,0136,0137,
68 0140,0141,0142,0143,0144,0145,0146,0147,
69 0150,0151,0152,0153,0154,0155,0156,0157,
70 0160,0161,0162,0163,0164,0165,0166,0167,
71 0170,0171,0172,0173,0174,0175,0176,0177,
72 };
73 static bool folding = FALSE;
74
75 static int err;
76 #define NOERR 0
77 #define BEGFAIL 1
78 #define FATAL 2
79
80 static char *FirstCharacter;
81 static char *matchend;
82 static char *matchtill;
83
84 void
85 search_init()
86 {
87 #ifdef UNDEF
88     register int    i;
89     
90     for (i = 0; i < ASCSIZ; i++)
91         trans[i] = i;
92 #else
93     ;
94 #endif
95 }
96
97 void
98 init_compex(compex)
99 register COMPEX *compex;
100 {
101     /* the following must start off zeroed */
102
103     compex->precomp = Nullch;
104     compex->complen = 0;
105     compex->subbase = Nullch;
106 }
107
108 #ifdef NOTUSED
109 void
110 free_compex(compex)
111 register COMPEX *compex;
112 {
113     if (compex->complen) {
114         safefree(compex->compbuf);
115         compex->complen = 0;
116     }
117     if (compex->subbase) {
118         safefree(compex->subbase);
119         compex->subbase = Nullch;
120     }
121 }
122 #endif
123
124 static char *gbr_str = Nullch;
125 static int gbr_siz = 0;
126
127 char *
128 getparen(compex,n)
129 register COMPEX *compex;
130 int n;
131 {
132     int length = compex->subend[n] - compex->subbeg[n];
133
134     if (!n &&
135         (!compex->numsubs || n > compex->numsubs || !compex->subend[n] || length<0))
136         return "";
137     growstr(&gbr_str, &gbr_siz, length+1);
138     safecpy(gbr_str, compex->subbeg[n], length+1);
139     return gbr_str;
140 }
141
142 void
143 case_fold(which)
144 int which;
145 {
146     register int i;
147
148     if (which != folding) {
149         if (which) {
150             for (i = 'A'; i <= 'Z'; i++)
151                 trans[i] = tolower(i);
152         }
153         else {
154             for (i = 'A'; i <= 'Z'; i++)
155                 trans[i] = i;
156         }
157         folding = which;
158     }
159 }
160
161 /* Compile the regular expression into internal form */
162
163 char *
164 compile(compex, sp, regex, fold)
165 register COMPEX *compex;
166 register char   *sp;
167 int regex;
168 int fold;
169 {
170     register int c;
171     register char  *cp;
172     char   *lastcp;
173     char    paren[MAXSUB],
174            *parenp;
175     char **alt = compex->alternatives;
176     char *retmes = "Badly formed search string";
177  
178     case_fold(compex->do_folding = fold);
179     if (compex->precomp)
180         safefree(compex->precomp);
181     compex->precomp = savestr(sp);
182     if (!compex->complen) {
183         compex->compbuf = safemalloc(84);
184         compex->complen = 80;
185     }
186     cp = compex->compbuf;               /* point at compiled buffer */
187     *alt++ = cp;                        /* first alternative starts here */
188     parenp = paren;                     /* first paren goes here */
189     if (*sp == 0) {                     /* nothing to compile? */
190 #ifdef NOTDEF
191         if (*cp == 0)                   /* nothing there yet? */
192             return "Null search string";
193 #endif
194         if (*cp)
195             return Nullch;                      /* just keep old expression */
196     }
197     compex->numsubs = 0;                        /* no parens yet */
198     lastcp = 0;
199     for (;;) {
200         if (cp - compex->compbuf >= compex->complen) {
201             char *ocompbuf = compex->compbuf;
202
203             grow_comp(compex);
204             if (ocompbuf != compex->compbuf) {  /* adjust pointers? */
205                 char **tmpalt;
206
207                 cp = compex->compbuf + (cp - ocompbuf);
208                 if (lastcp)
209                     lastcp = compex->compbuf + (lastcp - ocompbuf);
210                 for (tmpalt = compex->alternatives; tmpalt < alt; tmpalt++)
211                     if (*tmpalt)
212                         *tmpalt = compex->compbuf + (*tmpalt - ocompbuf);
213             }
214         }
215         c = *sp++;                      /* get next char of pattern */
216         if (c == 0) {                   /* end of pattern? */
217             if (parenp != paren) {      /* balanced parentheses? */
218 #ifdef VERBOSE
219                 retmes = "Missing right parenthesis";
220 #endif
221                 goto badcomp;
222             }
223             *cp++ = FINIS;              /* append a stopper */
224             *alt++ = 0;                 /* terminate alternative list */
225             /*
226             compex->complen = cp - compex->compbuf + 1;
227             compex->compbuf = saferealloc(compex->compbuf,compex->complen+4); */
228             return Nullch;              /* return success */
229         }
230         if (c != '*' && c != '?' && c != '+')
231             lastcp = cp;
232         if (!regex) {                   /* just a normal search string? */
233             *cp++ = CHAR;               /* everything is a normal char */
234             *cp++ = trans[c];
235         }
236         else                            /* it is a regular expression */
237             switch (c) {
238  
239                 default:
240                   normal_char:
241                     *cp++ = CHAR;
242                     *cp++ = trans[c];
243                     continue;
244
245                 case '.':
246                     *cp++ = ANY;
247                     continue;
248  
249                 case '[': {             /* character class */
250                     register int i;
251                     
252                     if (cp - compex->compbuf >= compex->complen - BMAPSIZ) {
253                         char *ocompbuf = compex->compbuf;
254
255                         grow_comp(compex);      /* reserve bitmap */
256                         if (ocompbuf != compex->compbuf) {/* adjust pointers? */
257                             char **tmpalt;
258
259                             cp = compex->compbuf + (cp - ocompbuf);
260                             if (lastcp)
261                                 lastcp = compex->compbuf + (lastcp - ocompbuf);
262                             for (tmpalt = compex->alternatives; tmpalt < alt;
263                               tmpalt++)
264                                 if (*tmpalt)
265                                     *tmpalt =
266                                         compex->compbuf + (*tmpalt - ocompbuf);
267                         }
268                     }
269                     for (i = BMAPSIZ; i; --i)
270                         cp[i] = 0;
271                     
272                     if ((c = *sp++) == '^') {
273                         c = *sp++;
274                         *cp++ = NCCL;   /* negated */
275                     }
276                     else
277                         *cp++ = CCL;    /* normal */
278                     
279                     i = 0;              /* remember oldchar */
280                     do {
281                         if (c == '\0') {
282 #ifdef VERBOSE
283                             retmes = "Missing ]";
284 #endif
285                             goto badcomp;
286                         }
287                         if (c == '\\' && *sp) {
288                             switch (*sp) {
289                             default:
290                                 c = *sp++;
291                                 break;
292                             case '0': case '1': case '2': case '3':
293                             case '4': case '5': case '6': case '7':
294                                 c = *sp++ - '0';
295                                 if (index("01234567",*sp)) {
296                                     c <<= 3;
297                                     c += *sp++ - '0';
298                                 }
299                                 if (index("01234567",*sp)) {
300                                     c <<= 3;
301                                     c += *sp++ - '0';
302                                 }
303                                 break;
304                             case 'b':
305                                 c = '\b';
306                                 sp++;
307                                 break;
308                             case 'n':
309                                 c = '\n';
310                                 sp++;
311                                 break;
312                             case 'r':
313                                 c = '\r';
314                                 sp++;
315                                 break;
316                             case 'f':
317                                 c = '\f';
318                                 sp++;
319                                 break;
320                             case 't':
321                                 c = '\t';
322                                 sp++;
323                                 break;
324                             }
325                         }
326                         if (*sp == '-' && *(++sp))
327                             i = *sp++;
328                         else
329                             i = c;
330                         while (c <= i) {
331                             cp[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
332                             if (fold && isalpha(c))
333                                 cp[(c ^ 32) / BITSPERBYTE] |=
334                                     1 << ((c ^ 32) % BITSPERBYTE);
335                                         /* set the other bit too */
336                             c++;
337                         }
338                     } while ((c = *sp++) != ']');
339                     if (cp[-1] == NCCL)
340                         cp[0] |= 1;
341                     cp += BMAPSIZ;
342                     continue;
343                 }
344  
345                 case '^':
346                     if (cp != compex->compbuf && cp[-1] != FINIS)
347                         goto normal_char;
348                     *cp++ = BEG;
349                     continue;
350  
351                 case '$':
352                     if (isdigit(*sp)) {
353                         *cp++ = REF;
354                         *cp++ = *sp - '0';
355                         break;
356                     }
357                     if (*sp && *sp != '|')
358                         goto normal_char;
359                     *cp++ = END;
360                     continue;
361  
362                 case '*': case '?': case '+':
363                     if (lastcp == 0 ||
364                         (*lastcp & (MINZERO|MAXINF)) ||
365                         *lastcp == LPAR ||
366                         *lastcp == RPAR ||
367                         *lastcp == BEG ||
368                         *lastcp == END ||
369                         *lastcp == WBOUND ||
370                         *lastcp == NWBOUND )
371                         goto normal_char;
372                     if (c != '+')
373                         *lastcp |= MINZERO;
374                     if (c != '?')
375                         *lastcp |= MAXINF;
376                     continue;
377  
378                 case '(':
379                     if (compex->numsubs >= MAXSUB) {
380 #ifdef VERBOSE
381                         retmes = "Too many parens";
382 #endif
383                         goto badcomp;
384                     }
385                     *parenp++ = ++compex->numsubs;
386                     *cp++ = LPAR;
387                     *cp++ = compex->numsubs;
388                     break;
389                 case ')':
390                     if (parenp <= paren) {
391 #ifdef VERBOSE
392                         retmes = "Unmatched right paren";
393 #endif
394                         goto badcomp;
395                     }
396                     *cp++ = RPAR;
397                     *cp++ = *--parenp;
398                     break;
399                 case '|':
400                     if (parenp>paren) {
401 #ifdef VERBOSE
402                         retmes = "No | in subpattern";  /* Sigh! */
403 #endif
404                         goto badcomp;
405                     }
406                     *cp++ = FINIS;
407                     if (alt - compex->alternatives >= MAXALT) {
408 #ifdef VERBOSE
409                         retmes = "Too many alternatives";
410 #endif
411                         goto badcomp;
412                     }
413                     *alt++ = cp;
414                     break;
415                 case '\\':              /* backslashed thingie */
416                     switch (c = *sp++) {
417                     case '0': case '1': case '2': case '3': case '4':
418                     case '5': case '6': case '7': case '8': case '9':
419                         *cp++ = REF;
420                         *cp++ = c - '0';
421                         break;
422                     case 'w':
423                         *cp++ = WORD;
424                         break;
425                     case 'W':
426                         *cp++ = NWORD;
427                         break;
428                     case 'b':
429                         *cp++ = WBOUND;
430                         break;
431                     case 'B':
432                         *cp++ = NWBOUND;
433                         break;
434                     default:
435                         *cp++ = CHAR;
436                         if (c == '\0')
437                             goto badcomp;
438                         switch (c) {
439                         case 'n':
440                             c = '\n';
441                             break;
442                         case 'r':
443                             c = '\r';
444                             break;
445                         case 'f':
446                             c = '\f';
447                             break;
448                         case 't':
449                             c = '\t';
450                             break;
451                         }
452                         *cp++ = c;
453                         break;
454                     }
455                     break;
456             }
457     }
458 badcomp:
459     compex->compbuf[0] = 0;
460     compex->numsubs = 0;
461     return retmes;
462 }
463
464 void
465 grow_comp(compex)
466 register COMPEX *compex;
467 {
468     compex->complen += 80;
469     compex->compbuf = saferealloc(compex->compbuf, (MEM_SIZE)compex->complen + 4);
470 }
471
472 char *
473 execute(compex, addr, beginning, minend)
474 register COMPEX *compex;
475 char *addr;
476 bool beginning;
477 int minend;
478 {
479     register char *p1 = addr;
480     register char *trt = trans;
481     register int c;
482     register int scr;
483     register int c2;
484  
485     if (addr == Nullch)
486         return Nullch;
487     if (compex->numsubs) {                      /* any submatches? */
488         for (c = 0; c <= compex->numsubs; c++)
489             compex->subbeg[c] = compex->subend[c] = Nullch;
490     }
491     case_fold(compex->do_folding);      /* make sure table is correct */
492     if (beginning)
493         FirstCharacter = p1;            /* for ^ tests */
494     else {
495         if (multiline || compex->alternatives[1] || compex->compbuf[0] != BEG)
496             FirstCharacter = Nullch;
497         else
498             return Nullch;              /* can't match */
499     }
500     matchend = Nullch;
501     matchtill = addr + minend;
502     err = 0;
503     if (compex->compbuf[0] == CHAR && !compex->alternatives[1]) {
504         if (compex->do_folding) {
505             c = compex->compbuf[1];     /* fast check for first character */
506             do {
507                 if (trt[*p1] == c && try(compex, p1, compex->compbuf))
508                     goto got_it;
509             } while (*p1++ && !err);
510         }
511         else {
512             c = compex->compbuf[1];     /* faster check for first character */
513             if (compex->compbuf[2] == CHAR)
514                 c2 = compex->compbuf[3];
515             else
516                 c2 = 0;
517             do {
518               false_alarm:
519                 while (scr = *p1++, scr && scr != c) ;
520                 if (!scr)
521                     break;
522                 if (c2 && *p1 != c2)    /* and maybe even second character */
523                     goto false_alarm;
524                 if (try(compex, p1, compex->compbuf+2)) {
525                     p1--;
526                     goto got_it;
527                 }
528             } while (!err);
529         }
530         return Nullch;
531     }
532     else {                      /* normal algorithm */
533         do {
534             register char **alt = compex->alternatives;
535             while (*alt) {
536                 if (try(compex, p1, *alt++))
537                     goto got_it;
538             }
539         } while (*p1++ && err < FATAL);
540         return Nullch;
541     }
542
543 got_it:
544     if (compex->numsubs) {                      /* any parens? */
545         trt = savestr(addr);            /* in case addr is not static */
546         if (compex->subbase)
547             safefree(compex->subbase);  /* (may be freeing addr!) */
548         compex->subbase = trt;
549         scr = compex->subbase - addr;
550         p1 += scr;
551         matchend += scr;
552         for (c = 0; c <= compex->numsubs; c++) {
553             if (compex->subend[c]) {
554                 compex->subbeg[c] += scr;
555                 compex->subend[c] += scr;
556             }
557         }
558     }
559     compex->subend[0] = matchend;
560     compex->subbeg[0] = p1;
561     return p1;
562 }
563  
564 bool
565 try(compex, sp, cp)
566 COMPEX *compex;
567 register char *cp;
568 register char *sp;
569 {
570     register char *basesp;
571     register char *trt = trans;
572     register int i;
573     register int backlen;
574     register int code;
575  
576     while (*sp || (*cp & MAXINF) || *cp == BEG || *cp == RPAR ||
577         *cp == WBOUND || *cp == NWBOUND) {
578         switch ((code = *cp++) & CODEMASK) {
579  
580             case CHAR:
581                 basesp = sp;
582                 i = *cp++;
583                 if (code & MAXINF)
584                     while (*sp && trt[*sp] == i) sp++;
585                 else
586                     if (*sp && trt[*sp] == i) sp++;
587                 backlen = 1;
588                 goto backoff;
589  
590           backoff:
591                 while (sp > basesp) {
592                     if (try(compex, sp, cp))
593                         goto right;
594                     sp -= backlen;
595                 }
596                 if (code & MINZERO)
597                     continue;
598                 goto wrong;
599  
600             case ANY:
601                 basesp = sp;
602                 if (code & MAXINF)
603                     while (*sp && *sp != '\n') sp++;
604                 else
605                     if (*sp && *sp != '\n') sp++;
606                 backlen = 1;
607                 goto backoff;
608
609             case CCL:
610                 basesp = sp;
611                 if (code & MAXINF)
612                     while (*sp && cclass(cp, *sp, 1)) sp++;
613                 else
614                     if (*sp && cclass(cp, *sp, 1)) sp++;
615                 cp += BMAPSIZ;
616                 backlen = 1;
617                 goto backoff;
618  
619             case NCCL:
620                 basesp = sp;
621                 if (code & MAXINF)
622                     while (*sp && cclass(cp, *sp, 0)) sp++;
623                 else
624                     if (*sp && cclass(cp, *sp, 0)) sp++;
625                 cp += BMAPSIZ;
626                 backlen = 1;
627                 goto backoff;
628  
629             case END:
630                 if (!*sp || *sp == '\n') {
631                     matchtill--;
632                     continue;
633                 }
634                 goto wrong;
635  
636             case BEG:
637                 if (sp == FirstCharacter || (
638                     *sp && sp[-1] == '\n') ) {
639                     matchtill--;
640                     continue;
641                 }
642                 if (!multiline)         /* no point in advancing more */
643                     err = BEGFAIL;
644                 goto wrong;
645  
646             case WORD:
647                 basesp = sp;
648                 if (code & MAXINF)
649                     while (*sp && isalnum(*sp)) sp++;
650                 else
651                     if (*sp && isalnum(*sp)) sp++;
652                 backlen = 1;
653                 goto backoff;
654  
655             case NWORD:
656                 basesp = sp;
657                 if (code & MAXINF)
658                     while (*sp && !isalnum(*sp)) sp++;
659                 else
660                     if (*sp && !isalnum(*sp)) sp++;
661                 backlen = 1;
662                 goto backoff;
663  
664             case WBOUND:
665                 if ((sp == FirstCharacter || !isalnum(sp[-1])) !=
666                         (!*sp || !isalnum(*sp)) )
667                     continue;
668                 goto wrong;
669  
670             case NWBOUND:
671                 if ((sp == FirstCharacter || !isalnum(sp[-1])) ==
672                         (!*sp || !isalnum(*sp)))
673                     continue;
674                 goto wrong;
675  
676             case FINIS:
677                 goto right;
678  
679             case LPAR:
680                 compex->subbeg[*cp++] = sp;
681                 continue;
682  
683             case RPAR:
684                 i = *cp++;
685                 compex->subend[i] = sp;
686                 compex->lastparen = i;
687                 continue;
688  
689             case REF:
690                 if (compex->subend[i = *cp++] == 0) {
691                     fputs("Bad subpattern reference\n",stdout) FLUSH;
692                     err = FATAL;
693                     goto wrong;
694                 }
695                 basesp = sp;
696                 backlen = compex->subend[i] - compex->subbeg[i];
697                 if (code & MAXINF)
698                     while (*sp && subpat(compex, i, sp)) sp += backlen;
699                 else
700                     if (*sp && subpat(compex, i, sp)) sp += backlen;
701                 goto backoff;
702  
703             default:
704                 fputs("Botched pattern compilation\n",stdout) FLUSH;
705                 err = FATAL;
706                 return -1;
707         }
708     }
709     if (*cp == FINIS || *cp == END) {
710 right:
711         if (matchend == Nullch || sp > matchend)
712             matchend = sp;
713         return matchend >= matchtill;
714     }
715 wrong:
716     matchend = Nullch;
717     return FALSE;
718 }
719  
720 bool
721 subpat(compex, i, sp)
722 register COMPEX *compex;
723 register int i;
724 register char *sp;
725 {
726     register char *bp;
727  
728     bp = compex->subbeg[i];
729     while (*sp && *bp == *sp) {
730         bp++;
731         sp++;
732         if (bp >= compex->subend[i])
733             return TRUE;
734     }
735     return FALSE;
736 }
737
738 bool
739 cclass(set, c, af)
740 register char  *set;
741 register int c;
742 {
743     c &= 0177;
744 #if BITSPERBYTE == 8
745     if (set[c >> 3] & 1 << (c & 7))
746 #else
747     if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
748 #endif
749         return af;
750     return !af;
751 }