X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=bbb7c8e4445c845b04cff7e0b4cf92d3b8de64f2;hb=c885792efecf3f527b3b5099727cc16b03eee1dc;hp=250813c204cddbd4cd58ae52584c66c7679edc14;hpb=a0d0e21ea6ea90a22318550944fe6cb09ae10cda;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index 250813c..bbb7c8e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -14,9 +14,14 @@ * blame Henry for some of the lack of readability. */ +/* The names of the functions have been changed from regcomp and + * regexec to pregcomp and pregexec in order to avoid conflicts + * with the POSIX routines of the same names. +*/ + /*SUPPRESS 112*/ /* - * regcomp and regexec -- regsub and regerror are not used in perl + * pregcomp and pregexec -- regsub and regerror are not used in perl * * Copyright (c) 1986 by University of Toronto. * Written by Henry Spencer. Not derived from licensed software. @@ -88,7 +93,7 @@ #define TRYAGAIN 0x8 /* Weeded out a declaration. */ /* - * Forward declarations for regcomp()'s friends. + * Forward declarations for pregcomp()'s friends. */ static char *reg _((I32, I32 *)); @@ -102,12 +107,12 @@ static char *regnode _((char)); static char *regpiece _((I32 *)); static void reginsert _((char, char *)); static void regoptail _((char *, char *)); -static void regset _((char *, I32, I32)); +static void regset _((char *, I32)); static void regtail _((char *, char *)); static char* nextchar _((void)); /* - - regcomp - compile a regular expression into internal code + - pregcomp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a @@ -122,12 +127,11 @@ static char* nextchar _((void)); * of the structure of the compiled regexp. [I'll say.] 
*/ regexp * -regcomp(exp,xend,pm) +pregcomp(exp,xend,pm) char* exp; char* xend; PMOP* pm; { - I32 fold = pm->op_pmflags & PMf_FOLD; register regexp *r; register char *scan; register SV *longish; @@ -145,13 +149,14 @@ PMOP* pm; if (exp == NULL) croak("NULL regexp argument"); - /* First pass: determine size, legality. */ + regprecomp = savepvn(exp, xend - exp); regflags = pm->op_pmflags; + regsawback = 0; + + /* First pass: determine size, legality. */ regparse = exp; regxend = xend; - regprecomp = savepvn(exp,xend-exp); regnaughty = 0; - regsawback = 0; regnpar = 1; regsize = 0L; regcode = &regdummy; @@ -166,17 +171,18 @@ PMOP* pm; if (regsize >= 32767L) /* Probably could be 65535L. */ FAIL("regexp too big"); - /* Allocate space. */ + /* Allocate space and initialize. */ Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp); if (r == NULL) FAIL("regexp out of space"); - - /* Second pass: emit code. */ - r->prelen = xend-exp; + r->prelen = xend - exp; r->precomp = regprecomp; r->subbeg = r->subbase = NULL; - regnaughty = 0; + + /* Second pass: emit code. */ regparse = exp; + regxend = xend; + regnaughty = 0; regnpar = 1; regcode = r->program; regc((char)MAGIC); @@ -185,7 +191,6 @@ PMOP* pm; /* Dig out information for optimizations. */ pm->op_pmflags = regflags; - fold = pm->op_pmflags & PMf_FOLD; r->regstart = Nullsv; /* Worst-case defaults. */ r->reganch = 0; r->regmust = Nullsv; @@ -211,16 +216,16 @@ PMOP* pm; /* Starting-point info. 
*/ again: - if (OP(first) == EXACTLY) { + if (OP(first) == EXACT) { r->regstart = newSVpv(OPERAND(first)+1,*OPERAND(first)); - if (SvCUR(r->regstart) > !(sawstudy|fold)) - fbm_compile(r->regstart,fold); - else - sv_upgrade(r->regstart, SVt_PVBM); + if (SvCUR(r->regstart) > !sawstudy) + fbm_compile(r->regstart); + (void)SvUPGRADE(r->regstart, SVt_PVBM); } else if (strchr(simple+2,OP(first))) r->regstclass = first; - else if (OP(first) == BOUND || OP(first) == NBOUND) + else if (regkind[(U8)OP(first)] == BOUND || + regkind[(U8)OP(first)] == NBOUND) r->regstclass = first; else if (regkind[(U8)OP(first)] == BOL) { r->reganch = ROPT_ANCH; @@ -239,7 +244,7 @@ PMOP* pm; if (sawplus && (!sawopen || !regsawback)) r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */ - DEBUG_r(fprintf(stderr,"first %d next %d offset %d\n", + DEBUG_r(PerlIO_printf(Perl_debug_log, "first %d next %d offset %d\n", OP(first), OP(NEXTOPER(first)), first - scan)); /* * If there's something expensive in the r.e., find the @@ -268,12 +273,14 @@ PMOP* pm; } else /* single branch is ok */ scan = NEXTOPER(scan); + continue; } if (OP(scan) == UNLESSM) { curback = -30000; scan = regnext(scan); + continue; } - if (OP(scan) == EXACTLY) { + if (OP(scan) == EXACT) { char *t; first = scan; @@ -326,8 +333,8 @@ PMOP* pm; /* Prefer earlier on tie, unless we can tail match latter */ - if (SvCUR(longish) + (regkind[(U8)OP(first)] == EOL) > - SvCUR(longest)) + if (SvCUR(longish) + (regkind[(U8)OP(first)] == EOL) + > SvCUR(longest)) { sv_setsv(longest,longish); backest = backish; @@ -335,23 +342,18 @@ PMOP* pm; else sv_setpvn(longish,"",0); if (SvCUR(longest) - && - (!r->regstart - || - !fbm_instr((unsigned char*) SvPVX(r->regstart), - (unsigned char *) SvPVX(r->regstart) - + SvCUR(r->regstart), - longest) - ) - ) + && (!r->regstart + || !fbm_instr((unsigned char*) SvPVX(r->regstart), + (unsigned char *) (SvPVX(r->regstart) + + SvCUR(r->regstart)), + longest))) { r->regmust = longest; if (backest < 0) backest = -1; 
r->regback = backest; - if (SvCUR(longest) > !(sawstudy || fold || - regkind[(U8)OP(first)]==EOL)) - fbm_compile(r->regmust,fold); + if (SvCUR(longest) > !(sawstudy || regkind[(U8)OP(first)] == EOL)) + fbm_compile(r->regmust); (void)SvUPGRADE(r->regmust, SVt_PVBM); BmUSEFUL(r->regmust) = 100; if (regkind[(U8)OP(first)] == EOL && SvCUR(longish)) @@ -364,7 +366,6 @@ PMOP* pm; SvREFCNT_dec(longish); } - r->do_folding = fold; r->nparens = regnpar - 1; r->minlen = minlen; Newz(1002, r->startp, regnpar, char*); @@ -399,7 +400,7 @@ I32 *flagp; if (paren) { if (*regparse == '?') { regparse++; - paren = *nextchar(); + paren = *regparse++; ret = NULL; switch (paren) { case ':': @@ -414,7 +415,7 @@ I32 *flagp; while (*regparse && *regparse != ')') regparse++; if (*regparse != ')') - croak("Sequence (?#... not terminated", *regparse); + croak("Sequence (?#... not terminated"); nextchar(); *flagp = TRYAGAIN; return NULL; @@ -492,7 +493,7 @@ I32 *flagp; } /* Check for proper termination. */ - if (paren && *nextchar() != ')') { + if (paren && (regparse >= regxend || *nextchar() != ')')) { FAIL("unmatched () in regexp"); } else if (!paren && regparse < regxend) { if (*regparse == ')') { @@ -786,32 +787,32 @@ tryagain: nextchar(); break; case 'w': - ret = regnode(ALNUM); + ret = regnode((regflags & PMf_LOCALE) ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'W': - ret = regnode(NALNUM); + ret = regnode((regflags & PMf_LOCALE) ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'b': - ret = regnode(BOUND); + ret = regnode((regflags & PMf_LOCALE) ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(); break; case 'B': - ret = regnode(NBOUND); + ret = regnode((regflags & PMf_LOCALE) ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(); break; case 's': - ret = regnode(SPACE); + ret = regnode((regflags & PMf_LOCALE) ? 
SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'S': - ret = regnode(NSPACE); + ret = regnode((regflags & PMf_LOCALE) ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; @@ -861,6 +862,15 @@ tryagain: goto defchar; } break; + + case '#': + if (regflags & PMf_EXTENDED) { + while (regparse < regxend && *regparse != '\n') regparse++; + if (regparse < regxend) + goto tryagain; + } + /* FALL THROUGH */ + default: { register I32 len; register char ender; @@ -871,7 +881,9 @@ tryagain: regparse++; defchar: - ret = regnode(EXACTLY); + ret = regnode((regflags & PMf_FOLD) + ? ((regflags & PMf_LOCALE) ? EXACTFL : EXACTF) + : EXACT); regc(0); /* save spot for len */ for (len = 0, p = regparse - 1; len < 127 && p < regxend; @@ -932,10 +944,8 @@ tryagain: break; case 'c': p++; - ender = *p++; - if (isLOWER(ender)) - ender = toUPPER(ender); - ender ^= 64; + ender = UCHARAT(p++); + ender = toCTRL(ender); break; case '0': case '1': case '2': case '3':case '4': case '5': case '6': case '7': case '8':case '9': @@ -958,6 +968,11 @@ tryagain: break; } break; + case '#': + if (regflags & PMf_EXTENDED) { + while (p < regxend && *p != '\n') p++; + } + /* FALL THROUGH */ case ' ': case '\t': case '\n': case '\r': case '\f': case '\v': if (regflags & PMf_EXTENDED) { p++; @@ -969,8 +984,6 @@ tryagain: ender = *p++; break; } - if (regflags & PMf_FOLD && isUPPER(ender)) - ender = toLOWER(ender); if (ISMULT2(p)) { /* Back off on ?+*. 
*/ if (len) p = oldp; @@ -1002,24 +1015,20 @@ tryagain: } static void -regset(bits,def,c) -char *bits; -I32 def; +regset(opnd, c) +char *opnd; register I32 c; { - if (regcode == &regdummy) - return; - c &= 255; - if (def) - bits[c >> 3] &= ~(1 << (c & 7)); - else - bits[c >> 3] |= (1 << (c & 7)); + if (opnd == &regdummy) + return; + c &= 0xFF; + opnd[1 + (c >> 3)] |= (1 << (c & 7)); } static char * regclass() { - register char *bits; + register char *opnd; register I32 class; register I32 lastclass = 1234; register I32 range = 0; @@ -1028,16 +1037,21 @@ regclass() I32 numlen; ret = regnode(ANYOF); + opnd = regcode; + for (class = 0; class < 33; class++) + regc(0); if (*regparse == '^') { /* Complement of range. */ regnaughty++; regparse++; - def = 0; - } else { - def = 255; + if (opnd != &regdummy) + *opnd |= ANYOF_INVERT; + } + if (opnd != &regdummy) { + if (regflags & PMf_FOLD) + *opnd |= ANYOF_FOLD; + if (regflags & PMf_LOCALE) + *opnd |= ANYOF_LOCALE; } - bits = regcode; - for (class = 0; class < 32; class++) - regc(def); if (*regparse == ']' || *regparse == '-') goto skipcond; /* allow 1st char to be ] or - */ while (regparse < regxend && *regparse != ']') { @@ -1047,39 +1061,63 @@ regclass() class = UCHARAT(regparse++); switch (class) { case 'w': - for (class = 0; class < 256; class++) - if (isALNUM(class)) - regset(bits,def,class); + if (regflags & PMf_LOCALE) { + if (opnd != &regdummy) + *opnd |= ANYOF_ALNUML; + } + else { + for (class = 0; class < 256; class++) + if (isALNUM(class)) + regset(opnd, class); + } lastclass = 1234; continue; case 'W': - for (class = 0; class < 256; class++) - if (!isALNUM(class)) - regset(bits,def,class); + if (regflags & PMf_LOCALE) { + if (opnd != &regdummy) + *opnd |= ANYOF_NALNUML; + } + else { + for (class = 0; class < 256; class++) + if (!isALNUM(class)) + regset(opnd, class); + } lastclass = 1234; continue; case 's': - for (class = 0; class < 256; class++) - if (isSPACE(class)) - regset(bits,def,class); + if (regflags & PMf_LOCALE) { + if 
(opnd != &regdummy) + *opnd |= ANYOF_SPACEL; + } + else { + for (class = 0; class < 256; class++) + if (isSPACE(class)) + regset(opnd, class); + } lastclass = 1234; continue; case 'S': - for (class = 0; class < 256; class++) - if (!isSPACE(class)) - regset(bits,def,class); + if (regflags & PMf_LOCALE) { + if (opnd != &regdummy) + *opnd |= ANYOF_NSPACEL; + } + else { + for (class = 0; class < 256; class++) + if (!isSPACE(class)) + regset(opnd, class); + } lastclass = 1234; continue; case 'd': for (class = '0'; class <= '9'; class++) - regset(bits,def,class); + regset(opnd, class); lastclass = 1234; continue; case 'D': for (class = 0; class < '0'; class++) - regset(bits,def,class); + regset(opnd, class); for (class = '9' + 1; class < 256; class++) - regset(bits,def,class); + regset(opnd, class); lastclass = 1234; continue; case 'n': @@ -1108,10 +1146,8 @@ regclass() regparse += numlen; break; case 'c': - class = *regparse++; - if (isLOWER(class)) - class = toUPPER(class); - class ^= 64; + class = UCHARAT(regparse++); + class = toCTRL(class); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -1134,11 +1170,8 @@ regclass() continue; /* do it next time */ } } - for ( ; lastclass <= class; lastclass++) { - regset(bits,def,lastclass); - if (regflags & PMf_FOLD && isUPPER(lastclass)) - regset(bits,def,toLOWER(lastclass)); - } + for ( ; lastclass <= class; lastclass++) + regset(opnd, lastclass); lastclass = class; } if (*regparse != ']') @@ -1152,11 +1185,28 @@ nextchar() { char* retval = regparse++; - if (regflags & PMf_EXTENDED) { - while (isSPACE(*regparse)) + for (;;) { + if (*regparse == '(' && regparse[1] == '?' 
&& + regparse[2] == '#') { + while (*regparse && *regparse != ')') + regparse++; regparse++; + continue; + } + if (regflags & PMf_EXTENDED) { + if (isSPACE(*regparse)) { + regparse++; + continue; + } + else if (*regparse == '#') { + while (*regparse && *regparse != '\n') + regparse++; + regparse++; + continue; + } + } + return retval; } - return retval; } /* @@ -1394,14 +1444,14 @@ register char *s; #ifdef DEBUGGING /* - - regdump - dump a regexp onto stderr in vaguely comprehensible form + - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ void regdump(r) regexp *r; { register char *s; - register char op = EXACTLY; /* Arbitrary non-END op. */ + register char op = EXACT; /* Arbitrary non-END op. */ register char *next; @@ -1412,48 +1462,48 @@ regexp *r; s++; #endif op = OP(s); - fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */ + PerlIO_printf(Perl_debug_log, "%2d%s", s-r->program, regprop(s)); /* Where, what. */ next = regnext(s); s += regarglen[(U8)op]; if (next == NULL) /* Next ptr. */ - fprintf(stderr,"(0)"); + PerlIO_printf(Perl_debug_log, "(0)"); else - fprintf(stderr,"(%d)", (s-r->program)+(next-s)); + PerlIO_printf(Perl_debug_log, "(%d)", (s-r->program)+(next-s)); s += 3; if (op == ANYOF) { - s += 32; + s += 33; } - if (op == EXACTLY) { + if (regkind[(U8)op] == EXACT) { /* Literal string, where present. */ s++; - (void)putc(' ', stderr); - (void)putc('<', stderr); + (void)PerlIO_putc(Perl_debug_log, ' '); + (void)PerlIO_putc(Perl_debug_log, '<'); while (*s != '\0') { - (void)putc(*s, stderr); + (void)PerlIO_putc(Perl_debug_log,*s); s++; } - (void)putc('>', stderr); + (void)PerlIO_putc(Perl_debug_log, '>'); s++; } - (void)putc('\n', stderr); + (void)PerlIO_putc(Perl_debug_log, '\n'); } /* Header fields of interest. 
*/ if (r->regstart) - fprintf(stderr,"start `%s' ", SvPVX(r->regstart)); + PerlIO_printf(Perl_debug_log, "start `%s' ", SvPVX(r->regstart)); if (r->regstclass) - fprintf(stderr,"stclass `%s' ", regprop(r->regstclass)); + PerlIO_printf(Perl_debug_log, "stclass `%s' ", regprop(r->regstclass)); if (r->reganch & ROPT_ANCH) - fprintf(stderr,"anchored "); + PerlIO_printf(Perl_debug_log, "anchored "); if (r->reganch & ROPT_SKIP) - fprintf(stderr,"plus "); + PerlIO_printf(Perl_debug_log, "plus "); if (r->reganch & ROPT_IMPLICIT) - fprintf(stderr,"implicit "); + PerlIO_printf(Perl_debug_log, "implicit "); if (r->regmust != NULL) - fprintf(stderr,"must have \"%s\" back %ld ", SvPVX(r->regmust), + PerlIO_printf(Perl_debug_log, "must have \"%s\" back %ld ", SvPVX(r->regmust), (long) r->regback); - fprintf(stderr, "minlen %ld ", (long) r->minlen); - fprintf(stderr,"\n"); + PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen); + PerlIO_printf(Perl_debug_log, "\n"); } /* @@ -1498,8 +1548,14 @@ char *op; case BRANCH: p = "BRANCH"; break; - case EXACTLY: - p = "EXACTLY"; + case EXACT: + p = "EXACT"; + break; + case EXACTF: + p = "EXACTF"; + break; + case EXACTFL: + p = "EXACTFL"; break; case NOTHING: p = "NOTHING"; @@ -1510,29 +1566,17 @@ char *op; case END: p = "END"; break; - case ALNUM: - p = "ALNUM"; - break; - case NALNUM: - p = "NALNUM"; - break; case BOUND: p = "BOUND"; break; + case BOUNDL: + p = "BOUNDL"; + break; case NBOUND: p = "NBOUND"; break; - case SPACE: - p = "SPACE"; - break; - case NSPACE: - p = "NSPACE"; - break; - case DIGIT: - p = "DIGIT"; - break; - case NDIGIT: - p = "NDIGIT"; + case NBOUNDL: + p = "NBOUNDL"; break; case CURLY: (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}", ARG1(op),ARG2(op)); @@ -1578,6 +1622,36 @@ char *op; case WHILEM: p = "WHILEM"; break; + case DIGIT: + p = "DIGIT"; + break; + case NDIGIT: + p = "NDIGIT"; + break; + case ALNUM: + p = "ALNUM"; + break; + case NALNUM: + p = "NALNUM"; + break; + case SPACE: + p = "SPACE"; + 
break; + case NSPACE: + p = "NSPACE"; + break; + case ALNUML: + p = "ALNUML"; + break; + case NALNUML: + p = "NALNUML"; + break; + case SPACEL: + p = "SPACEL"; + break; + case NSPACEL: + p = "NSPACEL"; + break; default: FAIL("corrupted regexp opcode"); } @@ -1588,7 +1662,7 @@ char *op; #endif /* DEBUGGING */ void -regfree(r) +pregfree(r) struct regexp *r; { if (!r)