X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=4cfd3dbbdbc752f4f4eb457f972c513eda6affc7;hb=1ed8eac0dfbbdc6acb022ff1733a2473c102328b;hp=cf100d7ecef12a27ecca7b5ecae66a12e947d167;hpb=ad391ad9bbfeaf73d3944b50240313a5677bcc60;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index cf100d7..4cfd3db 100644 --- a/regcomp.c +++ b/regcomp.c @@ -40,10 +40,10 @@ /* *These* symbols are masked to allow static link. */ # define Perl_regnext my_regnext # define Perl_save_re_context my_save_re_context -# define Perl_reginitcolors my_reginitcolors +# define Perl_reginitcolors my_reginitcolors # define PERL_NO_GET_CONTEXT -#endif +#endif /*SUPPRESS 112*/ /* @@ -69,7 +69,7 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-2000, Larry Wall + **** Copyright (c) 1991-2001, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -83,11 +83,7 @@ #define PERL_IN_REGCOMP_C #include "perl.h" -#ifdef PERL_IN_XSUB_RE -# if defined(PERL_CAPI) || defined(PERL_OBJECT) -# include "XSUB.h" -# endif -#else +#ifndef PERL_IN_XSUB_RE # include "INTERN.h" #endif @@ -115,10 +111,12 @@ typedef struct RExC_state_t { U16 flags16; /* are we folding, multilining? */ char *precomp; /* uncompiled string. */ regexp *rx; + char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ I32 whilem_seen; /* number of WHILEM in this expr */ - regnode *emit; /* Code-emit pointer; ®dummy = don't */ + regnode *emit_start; /* Start of emitted-code area */ + regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; @@ -127,6 +125,7 @@ typedef struct RExC_state_t { I32 extralen; I32 seen_zerolen; I32 seen_evals; + I32 utf8; #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -136,10 +135,13 @@ typedef struct RExC_state_t { #define RExC_flags16 (pRExC_state->flags16) #define RExC_precomp (pRExC_state->precomp) #define RExC_rx (pRExC_state->rx) +#define RExC_start (pRExC_state->start) #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) +#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */ #define RExC_emit (pRExC_state->emit) +#define RExC_emit_start (pRExC_state->emit_start) #define RExC_naughty (pRExC_state->naughty) #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) @@ -148,15 +150,11 @@ typedef struct RExC_state_t { #define RExC_extralen (pRExC_state->extralen) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_seen_evals (pRExC_state->seen_evals) +#define RExC_utf8 (pRExC_state->utf8) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) -#ifdef atarist -#define PERL_META "^$.[()|?+*\\" -#else -#define META "^$.[()|?+*\\" -#endif #ifdef SPSTART #undef SPSTART /* dratted cpp namespace... */ @@ -197,7 +195,7 @@ typedef struct scan_data_t { * Forward declarations for pregcomp()'s friends. */ -static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) @@ -229,13 +227,11 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) #define SCF_WHILEM_VISITED_POS 0x2000 -#define RF_utf8 8 -#define UTF (PL_reg_flags & RF_utf8) +#define UTF RExC_utf8 #define LOC (RExC_flags16 & PMf_LOCALE) #define FOLD (RExC_flags16 & PMf_FOLD) -#define OOB_CHAR8 1234 -#define OOB_UTF8 123456 +#define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) @@ -250,10 +246,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, * of t/op/regmesg.t, the tests in t/op/re_tests, and those in * op/pragma/warn/regcomp. */ -#define MARKER1 "HERE" /* marker as it appears in the description */ -#define MARKER2 " << HERE " /* marker as it appears within the regex */ - -#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/" +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/" /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -263,7 +259,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define FAIL(msg) \ STMT_START { \ char *ellipses = ""; \ - unsigned len = strlen(RExC_precomp); \ + IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ @@ -285,7 +281,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define FAIL2(pat,msg) \ STMT_START { \ char *ellipses = ""; \ - unsigned len = strlen(RExC_precomp); \ + IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ @@ -305,7 +301,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL(m) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -326,7 +322,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL2(m,a1) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ \ S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -348,7 +344,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL3(m, a1, a2) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ \ S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -369,7 +365,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL4(m, a1, a2, a3) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ \ S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -380,7 +376,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL5(m, a1, a2, a3, a4) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -388,15 +384,23 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN(loc,m) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END \ +#define vWARNdep(loc,m) \ + STMT_START { \ + IV offset = loc - RExC_precomp; \ + int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \ + Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\ + m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END \ + #define vWARN2(loc, m, a1) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ a1, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -404,7 +408,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN3(loc, m, a1, a2) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ a1, a2, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -412,17 +416,70 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN4(loc, m, a1, a2, a3) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ a1, a2, a3, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +/* used for the parse_flags section for (?c) -- japhy */ +#define vWARN5(loc, m, a1, a2, a3, a4) \ + STMT_START { \ + IV offset = loc - RExC_precomp; \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + a1, a2, a3, a4, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + /* Allow for side effects in s */ -#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END +#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END + +/* Macros for recording node offsets. 20001227 mjd@plover.com + * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in + * element 2*n-1 of the array. Element #2n holds the byte length node #n. + * Element 0 holds the number n. + */ -static void clear_re(pTHXo_ void *r); +#define MJD_OFFSET_DEBUG(x) +/* #define MJD_OFFSET_DEBUG(x) fprintf x */ + + +# define Set_Node_Offset_To_R(node,byte) \ + STMT_START { \ + if (! SIZE_ONLY) { \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \ + } else { \ + RExC_offsets[2*(node)-1] = (byte); \ + } \ + } \ + } STMT_END + +# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) +# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) + +# define Set_Node_Length_To_R(node,len) \ + STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Length macro", node); \ + } else { \ + RExC_offsets[2*(node)] = (len); \ + } \ + } \ + } STMT_END + +# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len) +# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) +# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start) + +/* Get offsets and lengths */ +#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) +#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) + +static void clear_re(pTHX_ void *r); /* Mark that we cannot extend a found fixed substring at this point. Updata the longest found anchored substring and the longest found @@ -431,27 +488,26 @@ static void clear_re(pTHXo_ void *r); STATIC void S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) { - dTHR; STRLEN l = CHR_SVLEN(data->last_found); STRLEN old_l = CHR_SVLEN(*data->longest); - + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { sv_setsv(*data->longest, data->last_found); if (*data->longest == data->longest_fixed) { data->offset_fixed = l ? data->last_start_min : data->pos_min; if (data->flags & SF_BEFORE_EOL) - data->flags + data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); else data->flags &= ~SF_FIX_BEFORE_EOL; } else { data->offset_float_min = l ? data->last_start_min : data->pos_min; - data->offset_float_max = (l - ? data->last_start_max + data->offset_float_max = (l + ? data->last_start_max : data->pos_min + data->pos_delta); if (data->flags & SF_BEFORE_EOL) - data->flags + data->flags |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL); else data->flags &= ~SF_FL_BEFORE_EOL; @@ -466,12 +522,9 @@ S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) STATIC void S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { - int value; - ANYOF_CLASS_ZERO(cl); - for (value = 0; value < 256; ++value) - ANYOF_BITMAP_SET(cl, value); - cl->flags = ANYOF_EOS; + ANYOF_BITMAP_SETALL(cl); + cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; if (LOC) cl->flags |= ANYOF_LOCALE; } @@ -485,9 +538,10 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; - for (value = 0; value < 256; ++value) - if (!ANYOF_BITMAP_TEST(cl, value)) - return 0; + if (!(cl->flags & ANYOF_UNICODE_ALL)) + return 0; + if (!ANYOF_BITMAP_TESTALLSET(cl)) + return 0; return 1; } @@ -532,6 +586,16 @@ S_cl_and(pTHX_ struct regnode_charclass_class *cl, } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ if (!(and_with->flags & ANYOF_EOS)) cl->flags &= ~ANYOF_EOS; + + if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) { + cl->flags &= ~ANYOF_UNICODE_ALL; + cl->flags |= ANYOF_UNICODE; + ARG_SET(cl, ARG(and_with)); + } + if (!(and_with->flags & ANYOF_UNICODE_ALL)) + cl->flags &= ~ANYOF_UNICODE_ALL; + if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL))) + cl->flags &= ~ANYOF_UNICODE; } /* 'OR' a given class with another one. Can create false positives */ @@ -563,7 +627,7 @@ S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, str } else { /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_FOLD) + && (!(or_with->flags & ANYOF_FOLD) || (cl->flags & ANYOF_FOLD)) ) { int i; @@ -582,8 +646,29 @@ S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, str } if (or_with->flags & ANYOF_EOS) cl->flags |= ANYOF_EOS; + + if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && + ARG(cl) != ARG(or_with)) { + cl->flags |= ANYOF_UNICODE_ALL; + cl->flags &= ~ANYOF_UNICODE; + } + if (or_with->flags & ANYOF_UNICODE_ALL) { + cl->flags |= ANYOF_UNICODE_ALL; + cl->flags &= ~ANYOF_UNICODE; + } } +/* + * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2. + * These need to be revisited when a newer toolchain becomes available. + */ +#if defined(__sparc64__) && defined(__GNUC__) +# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) +# undef SPARC64_GCC_WORKAROUND +# define SPARC64_GCC_WORKAROUND 1 +# endif +#endif + /* REx optimizer. Converts nodes into quickier variants "in place". Finds fixed substrings. */ @@ -596,7 +681,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* deltap: Write maxlen-minlen here. */ /* last: Stop before this one. */ { - dTHR; I32 min = 0, pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; @@ -605,7 +689,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */ - + while (scan && OP(scan) != END && scan < last) { /* Peephole optimizer: */ @@ -615,12 +699,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg U32 stringok = 1; #ifdef DEBUGGING regnode *stop = scan; -#endif +#endif next = scan + NODE_SZ_STR(scan); /* Skip NOTHING, merge EXACT*. */ while (n && - ( PL_regkind[(U8)OP(n)] == NOTHING || + ( PL_regkind[(U8)OP(n)] == NOTHING || (stringok && (OP(n) == OP(scan)))) && NEXT_OFF(n) && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { @@ -632,25 +716,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg #ifdef DEBUGGING if (stringok) stop = n; -#endif +#endif n = regnext(n); } - else { + else if (stringok) { int oldl = STR_LEN(scan); regnode *nnext = regnext(n); - - if (oldl + STR_LEN(n) > U8_MAX) + + if (oldl + STR_LEN(n) > U8_MAX) break; NEXT_OFF(scan) += NEXT_OFF(n); STR_LEN(scan) += STR_LEN(n); next = n + NODE_SZ_STR(n); /* Now we can overwrite *n : */ - Move(STRING(n), STRING(scan) + oldl, - STR_LEN(n), char); + Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); #ifdef DEBUGGING - if (stringok) - stop = next - 1; -#endif + stop = next - 1; +#endif n = nnext; } } @@ -676,7 +758,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); int noff; regnode *n = scan; - + /* Skip NOTHING and LONGJMP. */ while ((n = regnext(n)) && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n))) @@ -685,17 +767,17 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg off += noff; if (reg_off_by_arg[OP(scan)]) ARG(scan) = off; - else + else NEXT_OFF(scan) = off; } /* The principal pseudo-switch. Cannot be a switch, since we look into several different things. */ - if (OP(scan) == BRANCH || OP(scan) == BRANCHJ + if (OP(scan) == BRANCH || OP(scan) == BRANCHJ || OP(scan) == IFTHEN || OP(scan) == SUSPEND) { next = regnext(scan); code = OP(scan); - - if (OP(next) == code || code == IFTHEN || code == SUSPEND) { + + if (OP(next) == code || code == IFTHEN || code == SUSPEND) { I32 max1 = 0, min1 = I32_MAX, num = 0; struct regnode_charclass_class accum; @@ -709,7 +791,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg num++; data_fake.flags = 0; - if (data) { + if (data) { data_fake.whilem_c = data->whilem_c; data_fake.last_closep = data->last_closep; } @@ -723,13 +805,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg cl_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; - } + } if (flags & SCF_WHILEM_VISITED_POS) f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ minnext = study_chunk(pRExC_state, &scan, &deltanext, next, &data_fake, f); - if (min1 > minnext) + if (min1 > minnext) min1 = minnext; if (max1 < minnext + deltanext) max1 = minnext + deltanext; @@ -744,7 +826,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg data->whilem_c = data_fake.whilem_c; if (flags & SCF_DO_STCLASS) cl_or(pRExC_state, &accum, &this_class); - if (code == SUSPEND) + if (code == SUSPEND) break; } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ @@ -770,7 +852,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg flags &= ~SCF_DO_STCLASS; } else { - /* Switch to OR mode: cache the old value of + /* Switch to OR mode: cache the old value of * data->start_class */ StructCopy(data->start_class, &and_with, struct regnode_charclass_class); @@ -790,15 +872,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } else if (OP(scan) == EXACT) { I32 l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); if (UTF) { - unsigned char *s = (unsigned char *)STRING(scan); - unsigned char *e = s + l; - I32 newl = 0; - while (s < e) { - newl++; - s += UTF8SKIP(s); - } - l = newl; + U8 *s = (U8*)STRING(scan); + l = utf8_length(s, s + l); + uc = utf8_to_uvchr(s, NULL); } min += l; if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ @@ -807,7 +885,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (data->last_end == -1) { /* Update the start info. */ data->last_start_min = data->pos_min; data->last_start_max = is_inf - ? I32_MAX : data->pos_min + data->pos_delta; + ? I32_MAX : data->pos_min + data->pos_delta; } sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan)); data->last_end = data->pos_min + l; @@ -818,21 +896,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* Check whether it is compatible with what we know already! */ int compat = 1; - if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) - && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) + if (uc >= 0x100 || + (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) && (!(data->start_class->flags & ANYOF_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, - PL_fold[*(U8*)STRING(scan)]))) + || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) + ) compat = 0; ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); if (compat) - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; + if (uc < 0x100) + data->start_class->flags &= ~ANYOF_UNICODE_ALL; } else if (flags & SCF_DO_STCLASS_OR) { /* false positive possible if the class is case-folded */ - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + if (uc < 0x100) + ANYOF_BITMAP_SET(data->start_class, uc); + else + data->start_class->flags |= ANYOF_UNICODE_ALL; data->start_class->flags &= ~ANYOF_EOS; cl_and(data->start_class, &and_with); } @@ -840,19 +924,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */ I32 l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); /* Search for fixed substrings supports EXACT only. */ - if (flags & SCF_DO_SUBSTR) + if (flags & SCF_DO_SUBSTR) scan_commit(pRExC_state, data); if (UTF) { - unsigned char *s = (unsigned char *)STRING(scan); - unsigned char *e = s + l; - I32 newl = 0; - while (s < e) { - newl++; - s += UTF8SKIP(s); - } - l = newl; + U8 *s = (U8 *)STRING(scan); + l = utf8_length(s, s + l); + uc = utf8_to_uvchr(s, NULL); } min += l; if (data && (flags & SCF_DO_SUBSTR)) @@ -861,15 +941,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* Check whether it is compatible with what we know already! */ int compat = 1; - if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) - && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) - && !ANYOF_BITMAP_TEST(data->start_class, - PL_fold[*(U8*)STRING(scan)])) + if (uc >= 0x100 || + (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) + && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) compat = 0; ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); if (compat) { - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; data->start_class->flags |= ANYOF_FOLD; if (OP(scan) == EXACTFL) @@ -880,7 +960,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (data->start_class->flags & ANYOF_FOLD) { /* false positive possible if the class is case-folded. Assume that the locale settings are the same... */ - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + if (uc < 0x100) + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; } cl_and(data->start_class, &and_with); @@ -888,11 +969,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg flags &= ~SCF_DO_STCLASS; } else if (strchr((char*)PL_varies,OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, fl; + I32 mincount, maxcount, minnext, deltanext, fl = 0; I32 f = flags, pos_before = 0; regnode *oscan = scan; struct regnode_charclass_class this_class; struct regnode_charclass_class *oclass = NULL; + I32 next_is_eval = 0; switch (PL_regkind[(U8)OP(scan)]) { case WHILEM: /* End of (?:...)* . */ @@ -902,8 +984,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { next = NEXTOPER(scan); if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { - mincount = 1; - maxcount = REG_INFTY; + mincount = 1; + maxcount = REG_INFTY; next = regnext(scan); scan = NEXTOPER(scan); goto do_curly; @@ -916,12 +998,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg case STAR: if (flags & SCF_DO_STCLASS) { mincount = 0; - maxcount = REG_INFTY; + maxcount = REG_INFTY; next = regnext(scan); scan = NEXTOPER(scan); goto do_curly; } - is_inf = is_inf_internal = 1; + is_inf = is_inf_internal = 1; scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */ @@ -929,7 +1011,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } goto optimize_curly_tail; case CURLY: - mincount = ARG1(scan); + mincount = ARG1(scan); maxcount = ARG2(scan); next = regnext(scan); if (OP(scan) == CURLYX) { @@ -938,6 +1020,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX); } scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */ @@ -959,15 +1042,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* These are the cases when once a subexpression fails at a particular position, it cannot succeed even after backtracking at the enclosing scope. - + XXXX what if minimal match and we are at the initial run of {n,m}? */ if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ - minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, - mincount == 0 + minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, + mincount == 0 ? (f & ~SCF_DO_SUBSTR) : f); if (flags & SCF_DO_STCLASS) @@ -977,7 +1060,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg cl_or(pRExC_state, data->start_class, &this_class); } else if (flags & SCF_DO_STCLASS_AND) { - /* Switch to OR mode: cache the old value of + /* Switch to OR mode: cache the old value of * data->start_class */ StructCopy(data->start_class, &and_with, struct regnode_charclass_class); @@ -998,8 +1081,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } if (!scan) /* It was not CURLYX, but CURLY. */ scan = next; - if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0) - && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) + if (ckWARN(WARN_REGEXP) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) + && (minnext == 0) && (deltanext == 0) + && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ { vWARN(RExC_parse, @@ -1007,30 +1093,35 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } min += minnext * mincount; - is_inf_internal |= ((maxcount == REG_INFTY + is_inf_internal |= ((maxcount == REG_INFTY && (minnext + deltanext) > 0) || deltanext == I32_MAX); is_inf |= is_inf_internal; delta += (minnext + deltanext) * maxcount - minnext * mincount; /* Try powerful optimization CURLYX => CURLYN. */ - if ( OP(oscan) == CURLYX && data + if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR && !(data->flags & SF_HAS_EVAL) && !deltanext && minnext == 1 ) { /* Try to optimize to CURLYN. */ regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; - regnode *nxt1 = nxt, *nxt2; + regnode *nxt1 = nxt; +#ifdef DEBUGGING + regnode *nxt2; +#endif /* Skip open. */ nxt = regnext(nxt); if (!strchr((char*)PL_simple,OP(nxt)) && !(PL_regkind[(U8)OP(nxt)] == EXACT - && STR_LEN(nxt) == 1)) + && STR_LEN(nxt) == 1)) goto nogo; +#ifdef DEBUGGING nxt2 = nxt; +#endif nxt = regnext(nxt); - if (OP(nxt) != CLOSE) + if (OP(nxt) != CLOSE) goto nogo; /* Now we know that nxt2 is the only contents: */ oscan->flags = ARG(nxt); @@ -1043,12 +1134,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg OP(nxt) = OPTIMIZED; /* was CLOSE. */ OP(nxt + 1) = OPTIMIZED; /* was count. */ NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ -#endif +#endif } nogo: /* Try optimization CURLYX => CURLYM. */ - if ( OP(oscan) == CURLYX && data + if ( OP(oscan) == CURLYX && data && !(data->flags & SF_HAS_PAR) && !(data->flags & SF_HAS_EVAL) && !deltanext ) { @@ -1059,7 +1150,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg OP(oscan) = CURLYM; while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ - && (OP(nxt2) != WHILEM)) + && (OP(nxt2) != WHILEM)) nxt = nxt2; OP(nxt2) = SUCCEED; /* Whas WHILEM */ /* Need to optimize away parenths. */ @@ -1067,7 +1158,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* Set the parenth number. */ regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ - if (OP(nxt) != CLOSE) + if (OP(nxt) != CLOSE) FAIL("Panic opt close"); oscan->flags = ARG(nxt); OP(nxt1) = OPTIMIZED; /* was OPEN. */ @@ -1077,11 +1168,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg OP(nxt + 1) = OPTIMIZED; /* was count. */ NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */ NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */ -#endif +#endif #if 0 while ( nxt1 && (OP(nxt1) != WHILEM)) { regnode *nnxt = regnext(nxt1); - + if (nnxt == nxt) { if (reg_off_by_arg[OP(nxt1)]) ARG_SET(nxt1, nxt2 - nxt1); @@ -1094,7 +1185,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } #endif /* Optimize again: */ - study_chunk(pRExC_state, &nxt1, &deltanext, nxt, + study_chunk(pRExC_state, &nxt1, &deltanext, nxt, NULL, 0); } else @@ -1116,18 +1207,35 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg PREVOPER(nxt)->flags = data->whilem_c | (RExC_whilem_seen << 4); /* On WHILEM */ } - if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) + if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (flags & SCF_DO_SUBSTR) { SV *last_str = Nullsv; int counted = mincount != 0; if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */ - I32 b = pos_before >= data->last_start_min +#if defined(SPARC64_GCC_WORKAROUND) + I32 b = 0; + STRLEN l = 0; + char *s = NULL; + I32 old = 0; + + if (pos_before >= data->last_start_min) + b = pos_before; + else + b = data->last_start_min; + + l = 0; + s = SvPV(data->last_found, l); + old = b - data->last_start_min; + +#else + I32 b = pos_before >= data->last_start_min ? pos_before : data->last_start_min; STRLEN l; char *s = SvPV(data->last_found, l); I32 old = b - data->last_start_min; +#endif if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; @@ -1139,11 +1247,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* What was added is a constant string */ if (mincount > 1) { SvGROW(last_str, (mincount * l) + 1); - repeatcpy(SvPVX(last_str) + l, + repeatcpy(SvPVX(last_str) + l, SvPVX(last_str), l, mincount - 1); SvCUR(last_str) *= mincount; /* Add additional parts. */ - SvCUR_set(data->last_found, + SvCUR_set(data->last_found, SvCUR(data->last_found) - l); sv_catsv(data->last_found, last_str); data->last_end += l * (mincount - 1); @@ -1166,10 +1274,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (mincount && last_str) { sv_setsv(data->last_found, last_str); data->last_end = data->pos_min; - data->last_start_min = + data->last_start_min = data->pos_min - CHR_SVLEN(last_str); - data->last_start_max = is_inf - ? I32_MAX + data->last_start_max = is_inf + ? I32_MAX : data->pos_min + data->pos_delta - CHR_SVLEN(last_str); } @@ -1198,8 +1306,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg break; } } - else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) { - int value; + else if (strchr((char*)PL_simple,OP(scan))) { + int value = 0; if (flags & SCF_DO_SUBSTR) { scan_commit(pRExC_state,data); @@ -1212,20 +1320,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* Some of the logic below assumes that switching locale on will only add false positives. */ switch (PL_regkind[(U8)OP(scan)]) { - case ANYUTF8: case SANY: - case SANYUTF8: - case ALNUMUTF8: - case ANYOFUTF8: - case ALNUMLUTF8: - case NALNUMUTF8: - case NALNUMLUTF8: - case SPACEUTF8: - case NSPACEUTF8: - case SPACELUTF8: - case NSPACELUTF8: - case DIGITUTF8: - case NDIGITUTF8: default: do_default: /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ @@ -1266,7 +1361,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg else { for (value = 0; value < 256; value++) if (isALNUM(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -1295,7 +1390,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg else { for (value = 0; value < 256; value++) if (!isALNUM(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -1324,7 +1419,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg else { for (value = 0; value < 256; value++) if (isSPACE(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -1353,7 +1448,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg else { for (value = 0; value < 256; value++) if (!isSPACE(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -1384,7 +1479,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg else { for (value = 0; value < 256; value++) if (isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -1401,7 +1496,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg else { for (value = 0; value < 256; value++) if (!isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -1427,7 +1522,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg int f = 0; data_fake.flags = 0; - if (data) { + if (data) { data_fake.whilem_c = data->whilem_c; data_fake.last_closep = data->last_closep; } @@ -1501,7 +1596,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg finish: *scanp = scan; *deltap = is_inf_internal ? I32_MAX : delta; - if (flags & SCF_DO_SUBSTR && is_inf) + if (flags & SCF_DO_SUBSTR && is_inf) data->pos_delta = I32_MAX - data->pos_min; if (is_par > U8_MAX) is_par = 0; @@ -1521,10 +1616,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg STATIC I32 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s) { - dTHR; if (RExC_rx->data) { - Renewc(RExC_rx->data, - sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1), + Renewc(RExC_rx->data, + sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1), char, struct reg_data); Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8); RExC_rx->data->count += n; @@ -1542,10 +1636,9 @@ S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s) void Perl_reginitcolors(pTHX) { - dTHR; int i = 0; char *s = PerlEnv_getenv("PERL_RE_COLORS"); - + if (s) { PL_colors[0] = s = savepv(s); while (++i < 6) { @@ -1558,7 +1651,7 @@ Perl_reginitcolors(pTHX) PL_colors[i] = s = ""; } } else { - while (i < 6) + while (i < 6) PL_colors[i++] = ""; } PL_colorset = 1; @@ -1583,7 +1676,6 @@ Perl_reginitcolors(pTHX) regexp * Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) { - dTHR; register regexp *r; regnode *scan; regnode *first; @@ -1598,18 +1690,15 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - /* XXXX This looks very suspicious... */ - if (pm->op_pmdynflags & PMdf_UTF8) { - PL_reg_flags |= RF_utf8; - } - else - PL_reg_flags = 0; + RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; - RExC_precomp = savepvn(exp, xend - exp); - DEBUG_r(if (!PL_colorset) reginitcolors()); - DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], - (int)(xend - exp), RExC_precomp, PL_colors[1])); + RExC_precomp = exp; + DEBUG_r({ + if (!PL_colorset) reginitcolors(); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], + (int)(xend - exp), RExC_precomp, PL_colors[1]); + }); RExC_flags16 = pm->op_pmflags; RExC_sawback = 0; @@ -1620,6 +1709,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* First pass: determine size, legality. */ RExC_parse = exp; + RExC_start = exp; RExC_end = xend; RExC_naughty = 0; RExC_npar = 1; @@ -1631,7 +1721,6 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) REGC((U8)REG_MAGIC, (char*)RExC_emit); #endif if (reg(pRExC_state, 0, &flags) == NULL) { - Safefree(RExC_precomp); RExC_precomp = Nullch; return(NULL); } @@ -1658,7 +1747,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) #endif r->refcnt = 1; r->prelen = xend - exp; - r->precomp = RExC_precomp; + r->precomp = savepvn(RExC_precomp, r->prelen); r->subbeg = NULL; r->reganch = pm->op_pmflags & PMf_COMPILETIME; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ @@ -1667,13 +1756,24 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->startp = 0; /* Useful during FAIL. */ r->endp = 0; /* Useful during FAIL. */ + Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ + if (r->offsets) { + r->offsets[0] = RExC_size; + } + DEBUG_r(PerlIO_printf(Perl_debug_log, + "%s %"UVuf" bytes for offset annotations.\n", + r->offsets ? "Got" : "Couldn't get", + (UV)((2*RExC_size+1) * sizeof(U32)))); + RExC_rx = r; /* Second pass: emit code. */ + RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */ RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; RExC_npar = 1; + RExC_emit_start = r->program; RExC_emit = r->program; /* Store the count of eval-groups for security checks: */ RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals); @@ -1686,7 +1786,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ pm->op_pmflags = RExC_flags16; if (UTF) - r->reganch |= ROPT_UTF8; + r->reganch |= ROPT_UTF8; /* Unicode in it? */ r->regstclass = NULL; if (RExC_naughty >= 10) /* Probably an expensive pattern. */ r->reganch |= ROPT_NAUGHTY; @@ -1724,9 +1824,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* Starting-point info. */ again: if (PL_regkind[(U8)OP(first)] == EXACT) { - if (OP(first) == EXACT); /* Empty, get anchored substr later. */ - else if ((OP(first) == EXACTF || OP(first) == EXACTFL) - && !UTF) + if (OP(first) == EXACT) + ; /* Empty, get anchored substr later. */ + else if ((OP(first) == EXACTF || OP(first) == EXACTFL)) r->regstclass = first; } else if (strchr((char*)PL_simple,OP(first))) @@ -1755,7 +1855,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* turn .* into ^.* with an implied $*=1 */ int type = OP(NEXTOPER(first)); - if (type == REG_ANY || type == ANYUTF8) + if (type == REG_ANY) type = ROPT_ANCH_MBOL; else type = ROPT_ANCH_SBOL; @@ -1764,13 +1864,13 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) first = NEXTOPER(first); goto again; } - if (sawplus && (!sawopen || !RExC_sawback) + if (sawplus && (!sawopen || !RExC_sawback) && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->reganch |= ROPT_SKIP; /* Scan is after the zeroth branch, first is atomic matcher. */ - DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", (IV)(first - scan + 1))); /* * If there's something expensive in the r.e., find the @@ -1801,7 +1901,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */ &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag); if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) - && data.last_start_min == 0 && data.last_end > 0 + && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) r->reganch |= ROPT_CHECK_ALL; @@ -1854,27 +1954,27 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } - if (r->regstclass - && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8 - || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY)) + if (r->regstclass + && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) r->regstclass = NULL; if ((!r->anchored_substr || r->anchored_offset) && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { - SV *sv; I32 n = add_data(pRExC_state, 1, "f"); - New(1006, RExC_rx->data->data[n], 1, + New(1006, RExC_rx->data->data[n], 1, struct regnode_charclass_class); StructCopy(data.start_class, (struct regnode_charclass_class*)RExC_rx->data->data[n], struct regnode_charclass_class); r->regstclass = (regnode*)RExC_rx->data->data[n]; r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ - DEBUG_r((sv = sv_newmortal(), - regprop(sv, (regnode*)data.start_class), - PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", - SvPVX(sv)))); + PL_regdata = r->data; /* for regprop() */ + DEBUG_r({ SV *sv = sv_newmortal(); + regprop(sv, (regnode*)data.start_class); + PerlIO_printf(Perl_debug_log, + "synthetic stclass `%s'.\n", + SvPVX(sv));}); } /* A temporary algorithm prefers floated substr to fixed one to dig more info. */ @@ -1912,32 +2012,35 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->check_substr = r->anchored_substr = r->float_substr = Nullsv; if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { - SV *sv; I32 n = add_data(pRExC_state, 1, "f"); - New(1006, RExC_rx->data->data[n], 1, + New(1006, RExC_rx->data->data[n], 1, struct regnode_charclass_class); StructCopy(data.start_class, (struct regnode_charclass_class*)RExC_rx->data->data[n], struct regnode_charclass_class); r->regstclass = (regnode*)RExC_rx->data->data[n]; r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ - DEBUG_r((sv = sv_newmortal(), - regprop(sv, (regnode*)data.start_class), - PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", - SvPVX(sv)))); + DEBUG_r({ SV* sv = sv_newmortal(); + regprop(sv, (regnode*)data.start_class); + PerlIO_printf(Perl_debug_log, + "synthetic stclass `%s'.\n", + SvPVX(sv));}); } } r->minlen = minlen; - if (RExC_seen & REG_SEEN_GPOS) + if (RExC_seen & REG_SEEN_GPOS) r->reganch |= ROPT_GPOS_SEEN; if (RExC_seen & REG_SEEN_LOOKBEHIND) r->reganch |= ROPT_LOOKBEHIND_SEEN; if (RExC_seen & REG_SEEN_EVAL) r->reganch |= ROPT_EVAL_SEEN; + if (RExC_seen & REG_SEEN_CANY) + r->reganch |= ROPT_CANY_SEEN; Newz(1002, r->startp, RExC_npar, I32); Newz(1002, r->endp, RExC_npar, I32); + PL_regdata = r->data; /* for regprop() */ DEBUG_r(regdump(r)); return(r); } @@ -1955,21 +2058,32 @@ STATIC regnode * S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ { - dTHR; register regnode *ret; /* Will be the head of the group. */ register regnode *br; register regnode *lastbr; register regnode *ender = 0; register I32 parno = 0; I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0; + + /* for (?g), (?gc), and (?o) warnings; warning + about (?c) will warn about (?g) -- japhy */ + + I32 wastedflags = 0x00, + wasted_o = 0x01, + wasted_g = 0x02, + wasted_gc = 0x02 | 0x04, + wasted_c = 0x04; + + char * parse_start = RExC_parse; /* MJD */ char *oregcomp_parse = RExC_parse; char c; *flagp = 0; /* Tentatively. */ + /* Make an OPEN node, if parenthesized. */ if (paren) { - if (*RExC_parse == '?') { + if (*RExC_parse == '?') { /* (?...) */ U16 posflags = 0, negflags = 0; U16 *flagsp = &posflags; int logical = 0; @@ -1979,24 +2093,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) paren = *RExC_parse++; ret = NULL; /* For look-ahead/behind. */ switch (paren) { - case '<': + case '<': /* (?<...) */ RExC_seen |= REG_SEEN_LOOKBEHIND; - if (*RExC_parse == '!') + if (*RExC_parse == '!') paren = ','; - if (*RExC_parse != '=' && *RExC_parse != '!') + if (*RExC_parse != '=' && *RExC_parse != '!') goto unknown; RExC_parse++; - case '=': - case '!': + case '=': /* (?=...) */ + case '!': /* (?!...) */ RExC_seen_zerolen++; - case ':': - case '>': + case ':': /* (?:...) */ + case '>': /* (?>...) */ break; - case '$': - case '@': + case '$': /* (?$...) */ + case '@': /* (?@...) */ vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; - case '#': + case '#': /* (?#...) */ while (*RExC_parse && *RExC_parse != ')') RExC_parse++; if (*RExC_parse != ')') @@ -2004,17 +2118,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; - case 'p': - if (SIZE_ONLY) - vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})"); + case 'p': /* (?p...) */ + if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) + vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})"); /* FALL THROUGH*/ - case '?': + case '?': /* (??...) */ logical = 1; paren = *RExC_parse++; /* FALL THROUGH */ - case '{': + case '{': /* (?{...}) */ { - dTHR; I32 count = 1, n = 0; char c; char *s = RExC_parse; @@ -2026,21 +2139,21 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) while (count && (c = *RExC_parse)) { if (c == '\\' && RExC_parse[1]) RExC_parse++; - else if (c == '{') + else if (c == '{') count++; - else if (c == '}') + else if (c == '}') count--; RExC_parse++; } if (*RExC_parse != ')') { - RExC_parse = s; + RExC_parse = s; vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); } if (!SIZE_ONLY) { AV *av; - - if (RExC_parse - 1 - s) + + if (RExC_parse - 1 - s) sv = newSVpvn(s, RExC_parse - 1 - s); else sv = newSVpvn("", 0); @@ -2048,6 +2161,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) ENTER; Perl_save_re_context(aTHX); rop = sv_compile_2op(sv, &sop, "re", &av); + sop->op_private |= OPpREFCOUNTED; + /* re_dup will OpREFCNT_inc */ + OpREFCNT_set(sop, 1); LEAVE; n = add_data(pRExC_state, 3, "nop"); @@ -2062,7 +2178,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* No compiled RE interpolated, has runtime components ===> unsafe. */ FAIL("Eval-group not allowed at runtime, use re 'eval'"); - if (PL_tainted) + if (PL_tainting && PL_tainted) FAIL("Eval-group in insecure regular expression"); } @@ -2072,15 +2188,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) if (!SIZE_ONLY) ret->flags = 2; regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); + /* deal with the length of this later - MJD */ return ret; } return reganode(pRExC_state, EVAL, n); } - case '(': + case '(': /* (?(?{...})...) and (?(?=...)...) */ { - if (RExC_parse[0] == '?') { - if (RExC_parse[1] == '=' || RExC_parse[1] == '!' - || RExC_parse[1] == '<' + if (RExC_parse[0] == '?') { /* (?(?...)) */ + if (RExC_parse[1] == '=' || RExC_parse[1] == '!' + || RExC_parse[1] == '<' || RExC_parse[1] == '{') { /* Lookahead or eval. */ I32 flag; @@ -2089,14 +2206,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) ret->flags = 1; regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag)); goto insert_if; - } + } } else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + /* (?(1)...) */ parno = atoi(RExC_parse++); while (isDIGIT(*RExC_parse)) RExC_parse++; - ret = reganode(pRExC_state, GROUPP, parno); + ret = reganode(pRExC_state, GROUPP, parno); + if ((c = *nextchar(pRExC_state)) != ')') vFAIL("Switch condition not recognized"); insert_if: @@ -2141,14 +2260,47 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) break; default: --RExC_parse; - parse_flags: + parse_flags: /* (?i) */ while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { - if (*RExC_parse != 'o') - pmflag(flagsp, *RExC_parse); + /* (?g), (?gc) and (?o) are useless here + and must be globally applied -- japhy */ + + if (*RExC_parse == 'o' || *RExC_parse == 'g') { + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g; + if (! (wastedflags & wflagbit) ) { + wastedflags |= wflagbit; + vWARN5( + RExC_parse + 1, + "Useless (%s%c) - %suse /%c modifier", + flagsp == &negflags ? "?-" : "?", + *RExC_parse, + flagsp == &negflags ? "don't " : "", + *RExC_parse + ); + } + } + } + else if (*RExC_parse == 'c') { + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (! (wastedflags & wasted_c) ) { + wastedflags |= wasted_gc; + vWARN3( + RExC_parse + 1, + "Useless (%sc) - %suse /gc modifier", + flagsp == &negflags ? "?-" : "?", + flagsp == &negflags ? "don't " : "" + ); + } + } + } + else { pmflag(flagsp, *RExC_parse); } + ++RExC_parse; } if (*RExC_parse == '-') { flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ ++RExC_parse; goto parse_flags; } @@ -2169,26 +2321,34 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) return NULL; } } - else { + else { /* (...) */ parno = RExC_npar; RExC_npar++; ret = reganode(pRExC_state, OPEN, parno); + Set_Node_Length(ret, 1); /* MJD */ + Set_Node_Offset(ret, RExC_parse); /* MJD */ open = 1; } } - else + else /* ! paren */ ret = NULL; /* Pick up the branches, linking them together. */ + parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1); + /* branch_len = (paren != 0); */ + if (br == NULL) return(NULL); if (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { reginsert(pRExC_state, BRANCHJ, br); } - else + else { /* MJD */ reginsert(pRExC_state, BRANCH, br); + Set_Node_Length(br, paren != 0); + Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); + } have_branch = 1; if (SIZE_ONLY) RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ @@ -2214,6 +2374,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) RExC_extralen += 2; /* Account for LONGJMP. */ nextchar(pRExC_state); br = regbranch(pRExC_state, &flags, 0); + if (br == NULL) return(NULL); regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ @@ -2231,6 +2392,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) break; case 1: ender = reganode(pRExC_state, CLOSE, parno); + Set_Node_Offset(ender,RExC_parse+1); /* MJD */ + Set_Node_Length(ender,1); /* MJD */ break; case '<': case ',': @@ -2300,24 +2463,25 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) STATIC regnode * S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first) { - dTHR; register regnode *ret; register regnode *chain = NULL; register regnode *latest; I32 flags = 0, c = 0; - if (first) + if (first) ret = NULL; else { - if (!SIZE_ONLY && RExC_extralen) + if (!SIZE_ONLY && RExC_extralen) ret = reganode(pRExC_state, BRANCHJ,0); - else + else { ret = reg_node(pRExC_state, BRANCH); + Set_Node_Length(ret, 1); + } } - if (!first && SIZE_ONLY) + if (!first && SIZE_ONLY) RExC_extralen += 1; /* BRANCHJ */ - + *flagp = WORST; /* Tentatively. */ RExC_parse--; @@ -2366,7 +2530,6 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first) STATIC regnode * S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) { - dTHR; register regnode *ret; register char op; register char *next; @@ -2375,6 +2538,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) char *maxpos; I32 min; I32 max = REG_INFTY; + char *parse_start; ret = regatom(pRExC_state, &flags); if (ret == NULL) { @@ -2386,6 +2550,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) op = *RExC_parse; if (op == '{' && regcurly(RExC_parse)) { + parse_start = RExC_parse; /* MJD */ next = RExC_parse + 1; maxpos = Nullch; while (isDIGIT(*next) || *next == ',') { @@ -2418,6 +2583,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) if ((flags&SIMPLE)) { RExC_naughty += 2 + RExC_naughty / 2; reginsert(pRExC_state, CURLY, ret); + Set_Node_Offset(ret, parse_start+1); /* MJD */ + Set_Node_Cur_Length(ret); } else { regnode *w = reg_node(pRExC_state, WHILEM); @@ -2430,6 +2597,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ } reginsert(pRExC_state, CURLYX,ret); + /* MJD hk */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Length(ret, + op == '{' ? (RExC_parse - parse_start) : 1); + if (!SIZE_ONLY && RExC_extralen) NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); @@ -2473,8 +2645,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) if (!(flags&HASWIDTH) && op != '?') vFAIL("Regexp *+ operand could be empty"); -#endif +#endif + parse_start = RExC_parse; nextchar(pRExC_state); *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); @@ -2534,9 +2707,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) STATIC regnode * S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) { - dTHR; register regnode *ret = 0; I32 flags; + char *parse_start = 0; *flagp = WORST; /* Tentatively. */ @@ -2551,10 +2724,11 @@ tryagain: ret = reg_node(pRExC_state, SBOL); else ret = reg_node(pRExC_state, BOL); + Set_Node_Length(ret, 1); /* MJD */ break; case '$': nextchar(pRExC_state); - if (*RExC_parse) + if (*RExC_parse) RExC_seen_zerolen++; if (RExC_flags16 & PMf_MULTILINE) ret = reg_node(pRExC_state, MEOL); @@ -2562,35 +2736,29 @@ tryagain: ret = reg_node(pRExC_state, SEOL); else ret = reg_node(pRExC_state, EOL); + Set_Node_Length(ret, 1); /* MJD */ break; case '.': nextchar(pRExC_state); - if (UTF) { - if (RExC_flags16 & PMf_SINGLELINE) - ret = reg_node(pRExC_state, SANYUTF8); - else - ret = reg_node(pRExC_state, ANYUTF8); - *flagp |= HASWIDTH; - } - else { - if (RExC_flags16 & PMf_SINGLELINE) - ret = reg_node(pRExC_state, SANY); - else - ret = reg_node(pRExC_state, REG_ANY); - *flagp |= HASWIDTH|SIMPLE; - } + if (RExC_flags16 & PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; + Set_Node_Length(ret, 1); /* MJD */ break; case '[': { char *oregcomp_parse = ++RExC_parse; - ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state)); + ret = regclass(pRExC_state); if (*RExC_parse != ']') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ["); } nextchar(pRExC_state); *flagp |= HASWIDTH|SIMPLE; + Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ break; } case '(': @@ -2637,16 +2805,19 @@ tryagain: ret = reg_node(pRExC_state, SBOL); *flagp |= SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'G': ret = reg_node(pRExC_state, GPOS); RExC_seen |= REG_SEEN_GPOS; *flagp |= SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'Z': ret = reg_node(pRExC_state, SEOL); *flagp |= SIMPLE; + RExC_seen_zerolen++; /* Do not optimize RE away */ nextchar(pRExC_state); break; case 'z': @@ -2654,108 +2825,87 @@ tryagain: *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'C': - ret = reg_node(pRExC_state, SANY); + ret = reg_node(pRExC_state, CANY); + RExC_seen |= REG_SEEN_CANY; *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'X': ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; nextchar(pRExC_state); - if (UTF && !PL_utf8_mark) - is_utf8_mark((U8*)"~"); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 'w': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? ALNUMLUTF8 : ALNUMUTF8) - : (LOC ? ALNUML : ALNUM)); + ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_alnum) - is_utf8_alnum((U8*)"a"); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 'W': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NALNUMLUTF8 : NALNUMUTF8) - : (LOC ? NALNUML : NALNUM)); + ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_alnum) - is_utf8_alnum((U8*)"a"); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, - UTF - ? (LOC ? BOUNDLUTF8 : BOUNDUTF8) - : (LOC ? BOUNDL : BOUND)); + ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_alnum) - is_utf8_alnum((U8*)"a"); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8) - : (LOC ? NBOUNDL : NBOUND)); + ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_alnum) - is_utf8_alnum((U8*)"a"); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 's': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? SPACELUTF8 : SPACEUTF8) - : (LOC ? SPACEL : SPACE)); + ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_space) - is_utf8_space((U8*)" "); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 'S': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NSPACELUTF8 : NSPACEUTF8) - : (LOC ? NSPACEL : NSPACE)); + ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_space) - is_utf8_space((U8*)" "); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 'd': - ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT); + ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_digit) - is_utf8_digit((U8*)"1"); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 'D': - ret = reg_node(pRExC_state, UTF ? NDIGITUTF8 : NDIGIT); + ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); - if (UTF && !PL_utf8_digit) - is_utf8_digit((U8*)"1"); /* preload table */ + Set_Node_Length(ret, 2); /* MJD */ break; case 'p': case 'P': - { /* a lovely hack--pretend we saw [\pX] instead */ + { char* oldregxend = RExC_end; + char* parse_start = RExC_parse; if (RExC_parse[1] == '{') { + /* a lovely hack--pretend we saw [\pX] instead */ RExC_end = strchr(RExC_parse, '}'); if (!RExC_end) { + U8 c = (U8)*RExC_parse; RExC_parse += 2; RExC_end = oldregxend; - vFAIL("Missing right brace on \\p{}"); + vFAIL2("Missing right brace on \\%c{}", c); } RExC_end++; } @@ -2763,10 +2913,11 @@ tryagain: RExC_end = RExC_parse + 2; RExC_parse--; - ret = regclassutf8(pRExC_state); + ret = regclass(pRExC_state); RExC_end = oldregxend; RExC_parse--; + Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); *flagp |= HASWIDTH|SIMPLE; } @@ -2789,6 +2940,7 @@ tryagain: if (num > 9 && num >= RExC_npar) goto defchar; else { + char * parse_start = RExC_parse - 1; /* MJD */ while (isDIGIT(*RExC_parse)) RExC_parse++; @@ -2799,6 +2951,10 @@ tryagain: ? (LOC ? REFFL : REFF) : REF, num); *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret); /* MJD */ RExC_parse--; nextchar(pRExC_state); } @@ -2829,6 +2985,10 @@ tryagain: register char *p; char *oldp, *s; STRLEN numlen; + STRLEN ulen; + U8 tmpbuf[UTF8_MAXLEN_UCLC+1]; + + parse_start = RExC_parse - 1; RExC_parse++; @@ -2857,6 +3017,8 @@ tryagain: case '\\': switch (*++p) { case 'A': + case 'C': + case 'X': case 'G': case 'Z': case 'z': @@ -2889,32 +3051,28 @@ tryagain: p++; break; case 'e': -#ifdef ASCIIish - ender = '\033'; -#else - ender = '\047'; -#endif + ender = ASCII_TO_NATIVE('\033'); p++; break; case 'a': -#ifdef ASCIIish - ender = '\007'; -#else - ender = '\057'; -#endif + ender = ASCII_TO_NATIVE('\007'); p++; break; case 'x': if (*++p == '{') { char* e = strchr(p, '}'); - + if (!e) { RExC_parse = p + 1; vFAIL("Missing right brace on \\x{}"); } else { - numlen = 1; /* allow underscores */ - ender = (UV)scan_hex(p + 1, e - p - 1, &numlen); + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX; + numlen = e - p - 1; + ender = grok_hex(p + 1, &numlen, &flags, NULL); + if (ender > 0xff) + RExC_utf8 = 1; /* numlen is generous */ if (numlen + len >= 127) { p--; @@ -2924,8 +3082,9 @@ tryagain: } } else { - numlen = 0; /* disallow underscores */ - ender = (UV)scan_hex(p, 2, &numlen); + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + numlen = 2; + ender = grok_hex(p, &numlen, &flags, NULL); p += numlen; } break; @@ -2938,8 +3097,9 @@ tryagain: case '5': case '6': case '7': case '8':case '9': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { - numlen = 0; /* disallow underscores */ - ender = (UV)scan_oct(p, 3, &numlen); + I32 flags = 0; + numlen = 3; + ender = grok_oct(p, &numlen, &flags, NULL); p += numlen; } else { @@ -2953,14 +3113,14 @@ tryagain: /* FALL THROUGH */ default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) - vWARN2(p +1, "Unrecognized escape \\%c passed through", *p); + vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); goto normal_default; } break; default: normal_default: - if ((*p & 0xc0) == 0xc0 && UTF) { - ender = utf8_to_uv((U8*)p, RExC_end - p, + if (UTF8_IS_START(*p) && UTF) { + ender = utf8n_to_uvchr((U8*)p, RExC_end - p, &numlen, 0); p += numlen; } @@ -2971,15 +3131,13 @@ tryagain: if (RExC_flags16 & PMf_EXTENDED) p = regwhite(p, RExC_end); if (UTF && FOLD) { - if (LOC) - ender = toLOWER_LC_uni(ender); - else - ender = toLOWER_uni(ender); + toLOWER_uni(ender, tmpbuf, &ulen); + ender = utf8_to_uvchr(tmpbuf, 0); } if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (ender >= 0x80 && UTF) { + else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { reguni(pRExC_state, ender, s, &numlen); s += numlen; len += numlen; @@ -2990,7 +3148,7 @@ tryagain: } break; } - if (ender >= 0x80 && UTF) { + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) { reguni(pRExC_state, ender, s, &numlen); s += numlen; len += numlen - 1; @@ -3000,6 +3158,7 @@ tryagain: } loopdone: RExC_parse = p - 1; + Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); { /* len is STRLEN which is unsigned, need to copy to signed */ @@ -3021,6 +3180,22 @@ tryagain: break; } + if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT && !RExC_utf8) { + STRLEN oldlen = STR_LEN(ret); + SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); + char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding); + STRLEN newlen = SvCUR(sv); + if (!SIZE_ONLY) { + DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n", + (int)oldlen, STRING(ret), (int)newlen, s)); + Copy(s, STRING(ret), newlen, char); + STR_LEN(ret) += newlen - oldlen; + RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen); + } else + RExC_size += STR_SZ(newlen) - STR_SZ(oldlen); + RExC_utf8 = 1; + } + return(ret); } @@ -3045,23 +3220,25 @@ S_regwhite(pTHX_ char *p, char *e) Character classes ([:foo:]) can also be negated ([:^foo:]). Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, - but trigger warnings because they are currently unimplemented. */ + but trigger failures because they are currently unimplemented. */ + +#define POSIXCC_DONE(c) ((c) == ':') +#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') +#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) + STATIC I32 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) { - dTHR; char *posixcc = 0; I32 namedclass = OOB_NAMEDCLASS; if (value == '[' && RExC_parse + 1 < RExC_end && /* I smell either [: or [= or [. -- POSIX has been here, right? */ - (*RExC_parse == ':' || - *RExC_parse == '=' || - *RExC_parse == '.')) { - char c = *RExC_parse; + POSIXCC(UCHARAT(RExC_parse))) { + char c = UCHARAT(RExC_parse); char* s = RExC_parse++; - - while (RExC_parse < RExC_end && *RExC_parse != c) + + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) RExC_parse++; if (RExC_parse == RExC_end) /* Grandfather lone [:, [=, [. */ @@ -3069,7 +3246,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) else { char* t = RExC_parse++; /* skip over the c */ - if (*RExC_parse == ']') { + if (UCHARAT(RExC_parse) == ']') { RExC_parse++; /* skip over the ending ] */ posixcc = s + 1; if (*s == ':') { @@ -3158,7 +3335,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) /* adjust RExC_parse so the warning shows after the class closes */ - while (*RExC_parse && *RExC_parse != ']') + while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') RExC_parse++; Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); } @@ -3177,9 +3354,7 @@ STATIC void S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && - (*RExC_parse == ':' || - *RExC_parse == '=' || - *RExC_parse == '.')) { + POSIXCC(UCHARAT(RExC_parse))) { char *s = RExC_parse; char c = *s++; @@ -3189,11 +3364,10 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c); /* [[=foo=]] and [[.foo.]] are still future. */ - if (c == '=' || c == '.') - { + if (POSIXCC_NOTYET(c)) { /* adjust RExC_parse so the error shows after the class closes */ - while (*RExC_parse && *RExC_parse++ != ']') + while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') ; Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); } @@ -3204,75 +3378,156 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state) { - dTHR; - register U32 value; - register I32 lastvalue = OOB_CHAR8; - register I32 range = 0; + register UV value; + register UV nextvalue; + register IV prevvalue = OOB_UNICODE; + register IV range = 0; register regnode *ret; STRLEN numlen; - I32 namedclass; - char *rangebegin; + IV namedclass; + char *rangebegin = 0; bool need_class = 0; + SV *listsv = Nullsv; + register char *e; + UV n; + bool optimize_invert = TRUE; + + ret = reganode(pRExC_state, ANYOF, 0); + + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) = 0; + + if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ + RExC_naughty++; + RExC_parse++; + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } - ret = reg_node(pRExC_state, ANYOF); if (SIZE_ONLY) RExC_size += ANYOF_SKIP; else { - ret->flags = 0; - ANYOF_BITMAP_ZERO(ret); RExC_emit += ANYOF_SKIP; if (FOLD) ANYOF_FLAGS(ret) |= ANYOF_FOLD; if (LOC) ANYOF_FLAGS(ret) |= ANYOF_LOCALE; + ANYOF_BITMAP_ZERO(ret); + listsv = newSVpvn("# comment\n", 10); } - if (*RExC_parse == '^') { /* Complement of range. */ - RExC_naughty++; - RExC_parse++; - if (!SIZE_ONLY) - ANYOF_FLAGS(ret) |= ANYOF_INVERT; - } - if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) + nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; + + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue)) checkposixcc(pRExC_state); - if (*RExC_parse == ']' || *RExC_parse == '-') - goto skipcond; /* allow 1st char to be ] or - */ - while (RExC_parse < RExC_end && *RExC_parse != ']') { - skipcond: - namedclass = OOB_NAMEDCLASS; + if (UCHARAT(RExC_parse) == ']' || UCHARAT(RExC_parse) == '-') + goto charclassloop; /* allow 1st char to be ] or - */ + + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + if (!range) rangebegin = RExC_parse; - value = UCHARAT(RExC_parse++); - if (value == '[') + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, 0); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; + if (value == '[' && POSIXCC(nextvalue)) namedclass = regpposixcc(pRExC_state, value); else if (value == '\\') { - value = UCHARAT(RExC_parse++); + if (UTF) { + value = utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, 0); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); /* Some compilers cannot handle switching on 64-bit integer - * values, therefore the 'value' cannot be an UV. --jhi */ - switch (value) { + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. + * A similar issue a little bit later when switching on + * namedclass. --jhi */ + switch ((I32)value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; case 's': namedclass = ANYOF_SPACE; break; case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'p': + case 'P': + if (*RExC_parse == '{') { + U8 c = (U8)value; + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL2("Missing right brace on \\%c{}", c); + while (isSPACE(UCHARAT(RExC_parse))) + RExC_parse++; + if (e == RExC_parse) + vFAIL2("Empty \\%c{}", c); + n = e - RExC_parse; + while (isSPACE(UCHARAT(RExC_parse + n - 1))) + n--; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + n--; + value = value == 'p' ? 'P' : 'p'; /* toggle */ + while (isSPACE(UCHARAT(RExC_parse))) { + RExC_parse++; + n--; + } + } + if (value == 'p') + Perl_sv_catpvf(aTHX_ listsv, + "+utf8::%.*s\n", (int)n, RExC_parse); + else + Perl_sv_catpvf(aTHX_ listsv, + "!utf8::%.*s\n", (int)n, RExC_parse); + } + RExC_parse = e + 1; + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + continue; case 'n': value = '\n'; break; case 'r': value = '\r'; break; case 't': value = '\t'; break; case 'f': value = '\f'; break; case 'b': value = '\b'; break; -#ifdef ASCIIish - case 'e': value = '\033'; break; - case 'a': value = '\007'; break; -#else - case 'e': value = '\047'; break; - case 'a': value = '\057'; break; -#endif + case 'e': value = ASCII_TO_NATIVE('\033');break; + case 'a': value = ASCII_TO_NATIVE('\007');break; case 'x': - numlen = 0; /* disallow underscores */ - value = (UV)scan_hex(RExC_parse, 2, &numlen); - RExC_parse += numlen; + if (*RExC_parse == '{') { + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX; + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL("Missing right brace on \\x{}"); + + numlen = e - RExC_parse; + value = grok_hex(RExC_parse, &numlen, &flags, NULL); + RExC_parse = e + 1; + } + else { + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + numlen = 2; + value = grok_hex(RExC_parse, &numlen, &flags, NULL); + RExC_parse += numlen; + } break; case 'c': value = UCHARAT(RExC_parse++); @@ -3280,22 +3535,31 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - numlen = 0; /* disallow underscores */ - value = (UV)scan_oct(--RExC_parse, 3, &numlen); + { + I32 flags = 0; + numlen = 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); RExC_parse += numlen; break; + } default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) - - vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value); + vWARN2(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } - } - if (namedclass > OOB_NAMEDCLASS) { - if (!need_class && !SIZE_ONLY) + } /* end of \blah */ + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + + if (!SIZE_ONLY && !need_class) ANYOF_CLASS_ZERO(ret); + need_class = 1; - if (range) { /* a-\d, a-[:digit:] */ + + /* a bad range like a-\d, a-[:digit:] ? */ + if (range) { if (!SIZE_ONLY) { if (ckWARN(WARN_REGEXP)) vWARN4(RExC_parse, @@ -3303,13 +3567,28 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) RExC_parse - rangebegin, RExC_parse - rangebegin, rangebegin); - ANYOF_BITMAP_SET(ret, lastvalue); - ANYOF_BITMAP_SET(ret, '-'); + if (prevvalue < 256) { + ANYOF_BITMAP_SET(ret, prevvalue); + ANYOF_BITMAP_SET(ret, '-'); + } + else { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + Perl_sv_catpvf(aTHX_ listsv, + "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); + } } - range = 0; /* this is not a true range */ + + range = 0; /* this was not a true range */ } + if (!SIZE_ONLY) { - switch (namedclass) { + if (namedclass > OOB_NAMEDCLASS) + optimize_invert = FALSE; + /* Possible truncation here but in some 64-bit environments + * the compiler gets heartburn about switch on 64-bit values. + * A similar issue a little earlier when switching on value. + * --jhi */ + switch ((I32)namedclass) { case ANYOF_ALNUM: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ALNUM); @@ -3318,6 +3597,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; case ANYOF_NALNUM: if (LOC) @@ -3327,42 +3607,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; - case ANYOF_SPACE: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_SPACE); - else { - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_SET(ret, value); - } - break; - case ANYOF_NSPACE: + case ANYOF_ALNUMC: if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NSPACE); + ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); else { for (value = 0; value < 256; value++) - if (!isSPACE(value)) + if (isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - break; - case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); - else { - for (value = '0'; value <= '9'; value++) - ANYOF_BITMAP_SET(ret, value); - } - break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); - else { - for (value = 0; value < '0'; value++) - ANYOF_BITMAP_SET(ret, value); - for (value = '9' + 1; value < 256; value++) - ANYOF_BITMAP_SET(ret, value); - } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; case ANYOF_NALNUMC: if (LOC) @@ -3372,15 +3627,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - break; - case ANYOF_ALNUMC: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); - else { - for (value = 0; value < 256; value++) - if (isALNUMC(value)) - ANYOF_BITMAP_SET(ret, value); - } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; case ANYOF_ALPHA: if (LOC) @@ -3390,6 +3637,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; case ANYOF_NALPHA: if (LOC) @@ -3399,34 +3647,39 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; case ANYOF_ASCII: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ASCII); else { -#ifdef ASCIIish +#ifndef EBCDIC for (value = 0; value < 128; value++) ANYOF_BITMAP_SET(ret, value); #else /* EBCDIC */ - for (value = 0; value < 256; value++) + for (value = 0; value < 256; value++) { if (isASCII(value)) - ANYOF_BITMAP_SET(ret, value); + ANYOF_BITMAP_SET(ret, value); + } #endif /* EBCDIC */ } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; case ANYOF_NASCII: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_NASCII); else { -#ifdef ASCIIish +#ifndef EBCDIC for (value = 128; value < 256; value++) ANYOF_BITMAP_SET(ret, value); #else /* EBCDIC */ - for (value = 0; value < 256; value++) + for (value = 0; value < 256; value++) { if (!isASCII(value)) - ANYOF_BITMAP_SET(ret, value); + ANYOF_BITMAP_SET(ret, value); + } #endif /* EBCDIC */ } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; case ANYOF_BLANK: if (LOC) @@ -3436,6 +3689,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; case ANYOF_NBLANK: if (LOC) @@ -3445,6 +3699,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; case ANYOF_CNTRL: if (LOC) @@ -3454,7 +3709,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } - lastvalue = OOB_CHAR8; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; case ANYOF_NCNTRL: if (LOC) @@ -3464,6 +3719,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); + break; + case ANYOF_DIGIT: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_DIGIT); + else { + /* consecutive digits assumed */ + for (value = '0'; value <= '9'; value++) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); + break; + case ANYOF_NDIGIT: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + else { + /* consecutive digits assumed */ + for (value = 0; value < '0'; value++) + ANYOF_BITMAP_SET(ret, value); + for (value = '9' + 1; value < 256; value++) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; case ANYOF_GRAPH: if (LOC) @@ -3473,6 +3751,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; case ANYOF_NGRAPH: if (LOC) @@ -3482,6 +3761,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; case ANYOF_LOWER: if (LOC) @@ -3491,6 +3771,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; case ANYOF_NLOWER: if (LOC) @@ -3500,6 +3781,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; case ANYOF_PRINT: if (LOC) @@ -3509,6 +3791,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; case ANYOF_NPRINT: if (LOC) @@ -3518,6 +3801,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; case ANYOF_PSXSPC: if (LOC) @@ -3527,6 +3811,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; case ANYOF_NPSXSPC: if (LOC) @@ -3536,6 +3821,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_PUNCT: if (LOC) @@ -3545,6 +3831,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; case ANYOF_NPUNCT: if (LOC) @@ -3554,6 +3841,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); + break; + case ANYOF_SPACE: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_SPACE); + else { + for (value = 0; value < 256; value++) + if (isSPACE(value)) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n"); + break; + case ANYOF_NSPACE: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NSPACE); + else { + for (value = 0; value < 256; value++) + if (!isSPACE(value)) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n"); break; case ANYOF_UPPER: if (LOC) @@ -3563,6 +3871,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; case ANYOF_NUPPER: if (LOC) @@ -3572,6 +3881,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; case ANYOF_XDIGIT: if (LOC) @@ -3581,6 +3891,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; case ANYOF_NXDIGIT: if (LOC) @@ -3590,6 +3901,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; default: vFAIL("Invalid [::] class"); @@ -3599,22 +3911,25 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_FLAGS(ret) |= ANYOF_CLASS; continue; } - } + } /* end of namedclass \blah */ + if (range) { - if (lastvalue > value) /* b-a */ { + if (prevvalue > value) /* b-a */ { Simple_vFAIL4("Invalid [] range \"%*.*s\"", RExC_parse - rangebegin, RExC_parse - rangebegin, rangebegin); + range = 0; /* not a valid range */ } - range = 0; } else { - lastvalue = value; + prevvalue = value; /* save the beginning of the range */ if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && RExC_parse[1] != ']') { RExC_parse++; - if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ + + /* a bad range like \w-, [:word:]- ? */ + if (namedclass > OOB_NAMEDCLASS) { if (ckWARN(WARN_REGEXP)) vWARN4(RExC_parse, "False [] range \"%*.*s\"", @@ -3624,319 +3939,94 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!SIZE_ONLY) ANYOF_BITMAP_SET(ret, '-'); } else - range = 1; - continue; /* do it next time */ + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ } } + /* now is the next time */ if (!SIZE_ONLY) { -#ifndef ASCIIish /* EBCDIC, for example. */ - if ((isLOWER(lastvalue) && isLOWER(value)) || - (isUPPER(lastvalue) && isUPPER(value))) - { - I32 i; - if (isLOWER(lastvalue)) { - for (i = lastvalue; i <= value; i++) - if (isLOWER(i)) - ANYOF_BITMAP_SET(ret, i); - } else { - for (i = lastvalue; i <= value; i++) - if (isUPPER(i)) - ANYOF_BITMAP_SET(ret, i); + IV i; + + if (prevvalue < 256) { + IV ceilvalue = value < 256 ? value : 255; + +#ifdef EBCDIC + if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || + (isUPPER(prevvalue) && isUPPER(ceilvalue))) + { + if (isLOWER(prevvalue)) { + for (i = prevvalue; i <= ceilvalue; i++) + if (isLOWER(i)) + ANYOF_BITMAP_SET(ret, i); + } else { + for (i = prevvalue; i <= ceilvalue; i++) + if (isUPPER(i)) + ANYOF_BITMAP_SET(ret, i); + } } - } - else + else #endif - for ( ; lastvalue <= value; lastvalue++) - ANYOF_BITMAP_SET(ret, lastvalue); + for (i = prevvalue; i <= ceilvalue; i++) + ANYOF_BITMAP_SET(ret, i); + } + if (value > 255 || UTF) { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + if (prevvalue < value) + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", + (UV)prevvalue, (UV)value); + else if (prevvalue == value) + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", + (UV)value); + } } - range = 0; + + range = 0; /* this range (if it was one) is done now */ } + if (need_class) { + ANYOF_FLAGS(ret) |= ANYOF_LARGE; if (SIZE_ONLY) RExC_size += ANYOF_CLASS_ADD_SKIP; else RExC_emit += ANYOF_CLASS_ADD_SKIP; } + /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ if (!SIZE_ONLY && - (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) { + /* If the only flag is folding (plus possibly inversion). */ + ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) + ) { for (value = 0; value < 256; ++value) { if (ANYOF_BITMAP_TEST(ret, value)) { - I32 cf = PL_fold[value]; - ANYOF_BITMAP_SET(ret, cf); + IV fold = PL_fold[value]; + + if (fold != value) + ANYOF_BITMAP_SET(ret, fold); } } ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; } + /* optimize inverted simple patterns (e.g. [^a-z]) */ - if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { + if (!SIZE_ONLY && optimize_invert && + /* If the only flag is inversion. */ + (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; - ANYOF_FLAGS(ret) = 0; + ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; } - return ret; -} - -STATIC regnode * -S_regclassutf8(pTHX_ RExC_state_t *pRExC_state) -{ - dTHR; - register char *e; - register U32 value; - register U32 lastvalue = OOB_UTF8; - register I32 range = 0; - register regnode *ret; - STRLEN numlen; - I32 n; - SV *listsv; - U8 flags = 0; - I32 namedclass; - char *rangebegin; - if (*RExC_parse == '^') { /* Complement of range. */ - RExC_naughty++; - RExC_parse++; - if (!SIZE_ONLY) - flags |= ANYOF_INVERT; - } if (!SIZE_ONLY) { - if (FOLD) - flags |= ANYOF_FOLD; - if (LOC) - flags |= ANYOF_LOCALE; - listsv = newSVpvn("# comment\n", 10); - } - - if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) - checkposixcc(pRExC_state); - - if (*RExC_parse == ']' || *RExC_parse == '-') - goto skipcond; /* allow 1st char to be ] or - */ + AV *av = newAV(); + SV *rv; - while (RExC_parse < RExC_end && *RExC_parse != ']') { - skipcond: - namedclass = OOB_NAMEDCLASS; - if (!range) - rangebegin = RExC_parse; - value = utf8_to_uv((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, 0); - RExC_parse += numlen; - if (value == '[') - namedclass = regpposixcc(pRExC_state, value); - else if (value == '\\') { - value = (U32)utf8_to_uv((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, 0); - RExC_parse += numlen; - /* Some compilers cannot handle switching on 64-bit integer - * values, therefore value cannot be an UV. Yes, this will - * be a problem later if we want switch on Unicode. --jhi */ - switch (value) { - case 'w': namedclass = ANYOF_ALNUM; break; - case 'W': namedclass = ANYOF_NALNUM; break; - case 's': namedclass = ANYOF_SPACE; break; - case 'S': namedclass = ANYOF_NSPACE; break; - case 'd': namedclass = ANYOF_DIGIT; break; - case 'D': namedclass = ANYOF_NDIGIT; break; - case 'p': - case 'P': - if (*RExC_parse == '{') { - e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\p{}"); - n = e - RExC_parse; - } - else { - e = RExC_parse; - n = 1; - } - if (!SIZE_ONLY) { - if (value == 'p') - Perl_sv_catpvf(aTHX_ listsv, - "+utf8::%.*s\n", (int)n, RExC_parse); - else - Perl_sv_catpvf(aTHX_ listsv, - "!utf8::%.*s\n", (int)n, RExC_parse); - } - RExC_parse = e + 1; - lastvalue = OOB_UTF8; - continue; - case 'n': value = '\n'; break; - case 'r': value = '\r'; break; - case 't': value = '\t'; break; - case 'f': value = '\f'; break; - case 'b': value = '\b'; break; -#ifdef ASCIIish - case 'e': value = '\033'; break; - case 'a': value = '\007'; break; -#else - case 'e': value = '\047'; break; - case 'a': value = '\057'; break; -#endif - case 'x': - if (*RExC_parse == '{') { - e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\x{}"); - numlen = 1; /* allow underscores */ - value = (UV)scan_hex(RExC_parse, - e - RExC_parse, - &numlen); - RExC_parse = e + 1; - } - else { - numlen = 0; /* disallow underscores */ - value = (UV)scan_hex(RExC_parse, 2, &numlen); - RExC_parse += numlen; - } - break; - case 'c': - value = UCHARAT(RExC_parse++); - value = toCTRL(value); - break; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - numlen = 0; /* disallow underscores */ - value = (UV)scan_oct(--RExC_parse, 3, &numlen); - RExC_parse += numlen; - break; - default: - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) - vWARN2(RExC_parse, - "Unrecognized escape \\%c in character class passed through", - (int)value); - break; - } - } - if (namedclass > OOB_NAMEDCLASS) { - if (range) { /* a-\d, a-[:digit:] */ - if (!SIZE_ONLY) { - if (ckWARN(WARN_REGEXP)) - vWARN4(RExC_parse, - "False [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - Perl_sv_catpvf(aTHX_ listsv, - /* 0x002D is Unicode for '-' */ - "%04"UVxf"\n002D\n", (UV)lastvalue); - } - range = 0; - } - if (!SIZE_ONLY) { - switch (namedclass) { - case ANYOF_ALNUM: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; - case ANYOF_NALNUM: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; - case ANYOF_ALNUMC: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; - case ANYOF_NALNUMC: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; - case ANYOF_ALPHA: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; - case ANYOF_NALPHA: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; - case ANYOF_ASCII: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; - case ANYOF_NASCII: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; - case ANYOF_CNTRL: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; - case ANYOF_NCNTRL: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; - case ANYOF_GRAPH: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; - case ANYOF_NGRAPH: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; - case ANYOF_DIGIT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; - case ANYOF_NDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; - case ANYOF_LOWER: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; - case ANYOF_NLOWER: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; - case ANYOF_PRINT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; - case ANYOF_NPRINT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; - case ANYOF_PUNCT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; - case ANYOF_NPUNCT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; - case ANYOF_SPACE: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break; - case ANYOF_NSPACE: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break; - case ANYOF_BLANK: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; - case ANYOF_NBLANK: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; - case ANYOF_PSXSPC: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; - case ANYOF_NPSXSPC: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; - case ANYOF_UPPER: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; - case ANYOF_NUPPER: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; - case ANYOF_XDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; - case ANYOF_NXDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; - } - continue; - } - } - if (range) { - if (lastvalue > value) { /* b-a */ - Simple_vFAIL4("Invalid [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - } - range = 0; - } - else { - lastvalue = value; - if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && - RExC_parse[1] != ']') { - RExC_parse++; - if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_REGEXP)) - vWARN4(RExC_parse, - "False [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, - /* 0x002D is Unicode for '-' */ - "002D\n"); - } else - range = 1; - continue; /* do it next time */ - } - } - /* now is the next time */ - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)lastvalue, (UV)value); - range = 0; - } - - ret = reganode(pRExC_state, ANYOFUTF8, 0); - - if (!SIZE_ONLY) { - SV *rv = swash_init("utf8", "", listsv, 1, 0); - SvREFCNT_dec(listsv); - n = add_data(pRExC_state, 1,"s"); + av_store(av, 0, listsv); + av_store(av, 1, NULL); + rv = newRV_noinc((SV*)av); + n = add_data(pRExC_state, 1, "s"); RExC_rx->data->data[n] = (void*)rv; - ARG1_SET(ret, flags); - ARG2_SET(ret, n); + ARG_SET(ret, n); } return ret; @@ -3945,7 +4035,6 @@ S_regclassutf8(pTHX_ RExC_state_t *pRExC_state) STATIC char* S_nextchar(pTHX_ RExC_state_t *pRExC_state) { - dTHR; char* retval = RExC_parse++; for (;;) { @@ -3978,7 +4067,6 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) STATIC regnode * /* Location. */ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { - dTHR; register regnode *ret; register regnode *ptr; @@ -3992,6 +4080,18 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", + "reg_node", __LINE__, + reg_name[op], + RExC_emit - RExC_emit_start > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + RExC_emit - RExC_emit_start, + RExC_parse - RExC_start, + RExC_offsets[0])); + Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); + } + RExC_emit = ptr; return(ret); @@ -4003,7 +4103,6 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) STATIC regnode * /* Location. */ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { - dTHR; register regnode *ret; register regnode *ptr; @@ -4017,6 +4116,17 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", + "reganode", + RExC_emit - RExC_emit_start > RExC_offsets[0] ? + "Overwriting end of array!\n" : "OK", + RExC_emit - RExC_emit_start, + RExC_parse - RExC_start, + RExC_offsets[0])); + Set_Cur_Node_Offset; + } + RExC_emit = ptr; return(ret); @@ -4028,8 +4138,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) STATIC void S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp) { - dTHR; - *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s); + *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); } /* @@ -4040,12 +4149,11 @@ S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp) STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) { - dTHR; register regnode *src; register regnode *dst; register regnode *place; register int offset = regarglen[(U8)op]; - + /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ if (SIZE_ONLY) { @@ -4056,10 +4164,33 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) src = RExC_emit; RExC_emit += NODE_STEP_REGNODE + offset; dst = RExC_emit; - while (src > opnd) + while (src > opnd) { StructCopy(--src, --dst, regnode); + if (RExC_offsets) { /* MJD 20010112 */ + MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", + "reg_insert", + dst - RExC_emit_start > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + src - RExC_emit_start, + dst - RExC_emit_start, + RExC_offsets[0])); + Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); + Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); + } + } + place = opnd; /* Op node, where operand used to be. */ + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", + "reginsert", + place - RExC_emit_start > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + place - RExC_emit_start, + RExC_parse - RExC_start, + RExC_offsets[0])); + Set_Node_Offset(place, RExC_parse); + } src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -4071,7 +4202,6 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) STATIC void S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) { - dTHR; register regnode *scan; register regnode *temp; @@ -4101,7 +4231,6 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) STATIC void S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) { - dTHR; /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p == NULL || SIZE_ONLY) return; @@ -4137,10 +4266,11 @@ S_regcurly(pTHX_ register char *s) } +#ifdef DEBUGGING + STATIC regnode * S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) { -#ifdef DEBUGGING register U8 op = EXACT; /* Arbitrary non-END op. */ register regnode *next; @@ -4160,13 +4290,13 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) (int)(2*l + 1), "", SvPVX(sv)); if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, "(0)"); - else + else PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start)); (void)PerlIO_putc(Perl_debug_log, '\n'); after_print: if (PL_regkind[(U8)op] == BRANCHJ) { - register regnode *nnode = (OP(next) == LONGJMP - ? regnext(next) + register regnode *nnode = (OP(next) == LONGJMP + ? regnext(next) : next); if (last && nnode > last) nnode = last; @@ -4187,8 +4317,10 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1); } else if (op == ANYOF) { + /* arglen 1 + class block */ + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE) + ? ANYOF_CLASS_SKIP : ANYOF_SKIP); node = NEXTOPER(node); - node += ANYOF_SKIP; } else if (PL_regkind[(U8)op] == EXACT) { /* Literal string, where present. */ @@ -4204,10 +4336,11 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) else if (op == WHILEM) l--; } -#endif /* DEBUGGING */ return node; } +#endif /* DEBUGGING */ + /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ @@ -4215,7 +4348,6 @@ void Perl_regdump(pTHX_ regexp *r) { #ifdef DEBUGGING - dTHR; SV *sv = sv_newmortal(); (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0); @@ -4223,25 +4355,25 @@ Perl_regdump(pTHX_ regexp *r) /* Header fields of interest. */ if (r->anchored_substr) PerlIO_printf(Perl_debug_log, - "anchored `%s%.*s%s'%s at %"IVdf" ", + "anchored `%s%.*s%s'%s at %"IVdf" ", PL_colors[0], (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)), - SvPVX(r->anchored_substr), + SvPVX(r->anchored_substr), PL_colors[1], SvTAIL(r->anchored_substr) ? "$" : "", (IV)r->anchored_offset); if (r->float_substr) PerlIO_printf(Perl_debug_log, - "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", + "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ", PL_colors[0], - (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)), + (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)), SvPVX(r->float_substr), PL_colors[1], SvTAIL(r->float_substr) ? "$" : "", (IV)r->float_min_offset, (UV)r->float_max_offset); if (r->check_substr) - PerlIO_printf(Perl_debug_log, - r->check_substr == r->float_substr + PerlIO_printf(Perl_debug_log, + r->check_substr == r->float_substr ? "(checking floating" : "(checking anchored"); if (r->reganch & ROPT_NOSCAN) PerlIO_printf(Perl_debug_log, " noscan"); @@ -4276,13 +4408,25 @@ Perl_regdump(pTHX_ regexp *r) if (r->reganch & ROPT_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + if (r->offsets) { + U32 i; + U32 len = r->offsets[0]; + PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]); + for (i = 1; i <= len; i++) + PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", + (UV)r->offsets[i*2-1], + (UV)r->offsets[i*2]); + PerlIO_printf(Perl_debug_log, "\n"); + } #endif /* DEBUGGING */ } +#ifdef DEBUGGING + STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (c <= ' ' || c == 127 || c == 255) + if (isCNTRL(c) || c == 255 || !isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); else if (c == '-' || c == ']' || c == '\\' || c == '^') Perl_sv_catpvf(aTHX_ sv, "\\%c", c); @@ -4290,6 +4434,8 @@ S_put_byte(pTHX_ SV *sv, int c) Perl_sv_catpvf(aTHX_ sv, "%c", c); } +#endif /* DEBUGGING */ + /* - regprop - printable representation of opcode */ @@ -4297,7 +4443,6 @@ void Perl_regprop(pTHX_ SV *sv, regnode *o) { #ifdef DEBUGGING - dTHR; register int k; sv_setpvn(sv, "", 0); @@ -4309,9 +4454,20 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) k = PL_regkind[(U8)OP(o)]; - if (k == EXACT) - Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0], - STR_LEN(o), STRING(o), PL_colors[1]); + if (k == EXACT) { + SV *dsv = sv_2mortal(newSVpvn("", 0)); + bool do_utf8 = DO_UTF8(sv); + char *s = do_utf8 ? + pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60, 0) : + STRING(o); + int len = do_utf8 ? + strlen(s) : + STR_LEN(o); + Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", + PL_colors[0], + len, s, + PL_colors[1]); + } else if (k == CURLY) { if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ @@ -4325,8 +4481,9 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { int i, rangestart = -1; - const char * const out[] = { /* Should be syncronized with - ANYOF_ #xdefines in regcomp.h */ + U8 flags = ANYOF_FLAGS(o); + const char * const anyofs[] = { /* Should be syncronized with + * ANYOF_ #xdefines in regcomp.h */ "\\w", "\\W", "\\s", @@ -4359,38 +4516,102 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) "[:^blank:]" }; - if (o->flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE) sv_catpv(sv, "{loc}"); - if (o->flags & ANYOF_FOLD) + if (flags & ANYOF_FOLD) sv_catpv(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (o->flags & ANYOF_INVERT) + if (flags & ANYOF_INVERT) sv_catpv(sv, "^"); - if (OP(o) == ANYOF) { - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { + for (i = 0; i <= 256; i++) { + if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) put_byte(sv, rangestart); - sv_catpv(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; + else { + put_byte(sv, rangestart); + sv_catpv(sv, "-"); + put_byte(sv, i - 1); } + rangestart = -1; } - if (o->flags & ANYOF_CLASS) - for (i = 0; i < sizeof(out)/sizeof(char*); i++) - if (ANYOF_CLASS_TEST(o,i)) - sv_catpv(sv, out[i]); } - else { - sv_catpv(sv, "{ANYOFUTF8}"); /* TODO: full decode */ + + if (o->flags & ANYOF_CLASS) + for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) + if (ANYOF_CLASS_TEST(o,i)) + sv_catpv(sv, anyofs[i]); + + if (flags & ANYOF_UNICODE) + sv_catpv(sv, "{unicode}"); + else if (flags & ANYOF_UNICODE_ALL) + sv_catpv(sv, "{unicode_all}"); + + { + SV *lv; + SV *sw = regclass_swash(o, FALSE, &lv); + + if (lv) { + if (sw) { + UV i; + U8 s[UTF8_MAXLEN+1]; + + for (i = 0; i <= 256; i++) { /* just the first 256 */ + U8 *e = uvchr_to_utf8(s, i); + + if (i < 256 && swash_fetch(sw, s, TRUE)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + U8 *p; + + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) { + for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + } + else { + for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + sv_catpv(sv, "-"); + for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++) + put_byte(sv, *p); + } + rangestart = -1; + } + } + + sv_catpv(sv, "..."); /* et cetera */ + } + + { + char *s = savepv(SvPVX(lv)); + char *origs = s; + + while(*s && *s != '\n') s++; + + if (*s == '\n') { + char *t = ++s; + + while (*s) { + if (*s == '\n') + *s = ' '; + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); + } + + Safefree(origs); + } + } } + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -4420,7 +4641,6 @@ Perl_re_intuit_string(pTHX_ regexp *prog) void Perl_pregfree(pTHX_ struct regexp *r) { - dTHR; DEBUG_r(if (!PL_colorset) reginitcolors()); if (!r || (--r->refcnt > 0)) @@ -4434,6 +4654,8 @@ Perl_pregfree(pTHX_ struct regexp *r) if (r->precomp) Safefree(r->precomp); + if (r->offsets) /* 20010421 MJD */ + Safefree(r->offsets); if (RX_MATCH_COPIED(r)) Safefree(r->subbeg); if (r->substrs) { @@ -4450,6 +4672,7 @@ Perl_pregfree(pTHX_ struct regexp *r) SV** old_curpad; while (--n >= 0) { + /* If you add a ->what type here, update the comment in regcomp.h */ switch (r->data->what[n]) { case 's': SvREFCNT_dec((SV*)r->data->data[n]); @@ -4472,7 +4695,11 @@ Perl_pregfree(pTHX_ struct regexp *r) } else PL_curpad = NULL; - op_free((OP_4tree*)r->data->data[n]); + + if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) { + op_free((OP_4tree*)r->data->data[n]); + } + PL_comppad = old_comppad; PL_curpad = old_curpad; SvREFCNT_dec((SV*)new_comppad); @@ -4501,7 +4728,6 @@ Perl_pregfree(pTHX_ struct regexp *r) regnode * Perl_regnext(pTHX_ register regnode *p) { - dTHR; register I32 offset; if (p == &PL_regdummy) @@ -4552,9 +4778,7 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) void Perl_save_re_context(pTHX) -{ - dTHR; - +{ #if 0 SAVEPPTR(RExC_precomp); /* uncompiled string. */ SAVEI32(RExC_npar); /* () count. */ @@ -4578,10 +4802,8 @@ Perl_save_re_context(pTHX) SAVEVPTR(PL_regendp); /* Ditto for endp. */ SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */ SAVEPPTR(PL_regtill); /* How far we are required to go. */ - SAVEI8(PL_regprev); /* char before regbol, \n if none */ - SAVEVPTR(PL_reg_start_tmp); /* from regexec.c */ + SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */ PL_reg_start_tmp = 0; - SAVEFREEPV(PL_reg_start_tmp); SAVEI32(PL_reg_start_tmpl); /* from regexec.c */ PL_reg_start_tmpl = 0; SAVEVPTR(PL_regdata); @@ -4595,24 +4817,20 @@ Perl_save_re_context(pTHX) SAVEVPTR(PL_reg_re); /* from regexec.c */ SAVEPPTR(PL_reg_ganch); /* from regexec.c */ SAVESPTR(PL_reg_sv); /* from regexec.c */ + SAVEI8(PL_reg_match_utf8); /* from regexec.c */ SAVEVPTR(PL_reg_magic); /* from regexec.c */ SAVEI32(PL_reg_oldpos); /* from regexec.c */ SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ SAVEVPTR(PL_reg_curpm); /* from regexec.c */ SAVEI32(PL_regnpar); /* () count. */ + SAVEI32(PL_regsize); /* from regexec.c */ #ifdef DEBUGGING - SAVEPPTR(PL_reg_starttry); /* from regexec.c */ + SAVEPPTR(PL_reg_starttry); /* from regexec.c */ #endif } -#ifdef PERL_OBJECT -#include "XSUB.h" -#undef this -#define this pPerl -#endif - static void -clear_re(pTHXo_ void *r) +clear_re(pTHX_ void *r) { ReREFCNT_dec((regexp *)r); }