X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.c;h=dd48f3cf99f8873fae09b65a9d54396c59d23ee7;hb=b6455c53c26be8a62e12a3f2a24a3a5086dd2c7b;hp=ae9efbf53a8d70b07e4c964ef69a206f2f954332;hpb=c737faaf63999e5a68ef536d362cea408ab990b7;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.c b/regcomp.c index ae9efbf..dd48f3c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1356,7 +1356,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store unicode characters. We use the + (trie->charmap) and we use a an HV* to store Unicode characters. We use the native representation of the character value as the key and IV's for the coded index. @@ -1405,7 +1405,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* store the codepoint in the bitmap, and if its ascii also store its folded equivelent. */ TRIE_BITMAP_SET(trie,uvc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + codepoints in the 127 < uvc < 256 range */ + if (127 < uvc && uvc < 192) { + TRIE_BITMAP_SET(trie,194); + } else if (191 < uvc ) { + TRIE_BITMAP_SET(trie,195); + /* && uvc < 256 -- we know uvc is < 256 already */ + } + } set_bit = 0; /* We've done our bit :-) */ } } else { @@ -1966,7 +1979,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( count == 1 ) { SV **tmp = av_fetch( revcharmap, idx, 0); - char *ch = SvPV_nolen( *tmp ); + STRLEN len; + char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); PerlIO_printf( Perl_debug_log, @@ -1985,11 +1999,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs str=STRING(convert); STR_LEN(convert)=0; } - while (*ch) { + STR_LEN(convert) += len; + while (len--) *str++ = *ch++; - STR_LEN(convert)++; - } - } else { #ifdef DEBUGGING if (state>1) @@ -2400,6 +2412,34 @@ typedef struct scan_frame { #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) +#define CASE_SYNST_FNC(nAmE) \ +case nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break; \ +case N ## nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break + + + STATIC I32 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, @@ -3270,7 +3310,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str); + mg->mg_len += CHR_SVLEN(last_str) - l; } data->last_end += l * (mincount - 1); } @@ -3330,6 +3370,46 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; } } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + int value = 0; + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + if (flags & SCF_DO_STCLASS_AND) { + for (value = 0; value < 256; value++) + if (!is_VERTWS_cp(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + for (value = 0; value < 256; value++) + if (is_VERTWS_cp(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + min += 1; + delta += 1; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + + } + else if (OP(scan) == FOLDCHAR) { + int d = ARG(scan)==0xDF ? 1 : 2; + flags &= ~SCF_DO_STCLASS; + min += 1; + delta += d; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += d; + data->longest = &(data->longest_float); + } + } else if (strchr((const char*)PL_simple,OP(scan))) { int value = 0; @@ -3524,6 +3604,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } break; + CASE_SYNST_FNC(VERTWS); + CASE_SYNST_FNC(HORIZWS); + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -3894,6 +3977,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ + /* Else: zero-length, ignore. */ scan = regnext(scan); } @@ -4016,8 +4100,8 @@ extern const struct regexp_engine my_reg_engine; #endif #ifndef PERL_IN_XSUB_RE -regexp * -Perl_pregcomp(pTHX_ char *exp, char *xend, U32 pm_flags) +REGEXP * +Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) { dVAR; HV * const table = GvHV(PL_hintgv); @@ -4032,21 +4116,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, U32 pm_flags) PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", SvIV(*ptr)); }); - return CALLREGCOMP_ENG(eng, exp, xend, pm_flags); + return CALLREGCOMP_ENG(eng, pattern, flags); } } - return Perl_re_compile(aTHX_ exp, xend, pm_flags); + return Perl_re_compile(aTHX_ pattern, flags); } #endif -regexp * -Perl_re_compile(pTHX_ char *exp, char *xend, U32 pm_flags) +REGEXP * +Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) { dVAR; - register regexp *r; + register REGEXP *r; register regexp_internal *ri; + STRLEN plen; + char* exp = SvPV((SV*)pattern, plen); + char* xend = exp + plen; regnode *scan; - regnode *first; I32 flags; I32 minlen = 0; I32 sawplus = 0; @@ -4060,16 +4146,13 @@ Perl_re_compile(pTHX_ char *exp, char *xend, U32 pm_flags) #endif GET_RE_DEBUG_FLAGS_DECL; DEBUG_r(if (!PL_colorset) reginitcolors()); - - if (exp == NULL) - FAIL("NULL regexp argument"); RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RExC_utf8, - dsv, exp, (xend - exp), 60); + dsv, exp, plen, 60); PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", PL_colors[4],PL_colors[5],s); }); @@ -4115,7 +4198,7 @@ redo_first_pass: return(NULL); } if (RExC_utf8 && !RExC_orig_utf8) { - /* It's possible to write a regexp in ascii that represents unicode + /* It's possible to write a regexp in ascii that represents Unicode codepoints outside of the byte range, such as via \x{100}. If we detect such a sequence we have to convert the entire pattern to utf8 and then recompile, as our sizing calculation will have been based @@ -4124,7 +4207,7 @@ redo_first_pass: thing. XXX: somehow figure out how to make this less expensive... -- dmq */ - STRLEN len = xend-exp; + STRLEN len = plen; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); @@ -4170,24 +4253,24 @@ redo_first_pass: RXi_SET( r, ri ); r->engine= RE_ENGINE_PTR; r->refcnt = 1; - r->prelen = xend - exp; + r->prelen = plen; r->extflags = pm_flags; { - bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - r->wraplen = r->prelen + has_minus + has_k + has_runon + r->wraplen = r->prelen + has_minus + has_p + has_runon + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); Newx(r->wrapped, r->wraplen + 1, char ); p = r->wrapped; *p++='('; *p++='?'; - if (has_k) - *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ { char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; char *colon = r + 1; @@ -4251,9 +4334,10 @@ redo_first_pass: /* Store the count of eval-groups for security checks: */ RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); - if (reg(pRExC_state, 0, &flags,1) == NULL) + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(r); return(NULL); - + } /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -4291,7 +4375,7 @@ reStudy: #endif /* Dig out information for optimizations. */ - r->extflags = pm_flags; /* Again? */ + r->extflags = RExC_flags; /* was pm_op */ /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ if (UTF) @@ -4309,18 +4393,20 @@ reStudy: struct regnode_charclass_class ch_class; /* pointed to by data */ int stclass_flag; I32 last_close = 0; /* pointed to by data */ - - first = scan; + regnode *first= scan; + regnode *first_next= regnext(first); + /* Skip introductions and multiplicators >= 1. */ while ((OP(first) == OPEN && (sawopen = 1)) || /* An OR of *one* alternative - should not happen now. */ - (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || + (OP(first) == BRANCH && OP(first_next) != BRANCH) || /* for now we can't handle lookbehind IFMATCH*/ (OP(first) == IFMATCH && !first->flags) || (OP(first) == PLUS) || (OP(first) == MINMOD) || /* An {n,m} with n>0 */ - (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { if (OP(first) == PLUS) @@ -4332,6 +4418,7 @@ reStudy: first += EXTRA_STEP_2ARGS; } else /* XXX possible optimisation for /(?=)/ */ first = NEXTOPER(first); + first_next= regnext(first); } /* Starting-point info. */ @@ -4677,11 +4764,34 @@ reStudy: r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); else r->paren_names = NULL; - if (r->prelen == 3 && strEQ("\\s+", r->precomp)) - r->extflags |= RXf_WHITE; + +#ifdef STUPID_PATTERN_CHECKS + if (r->prelen == 0) + r->extflags |= RXf_NULL; + if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3)) + r->extflags |= RXf_WHITE; else if (r->prelen == 1 && r->precomp[0] == '^') r->extflags |= RXf_START_ONLY; - +#else + if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else { + regnode *first = ri->program + 1; + U8 fop = OP(first); + U8 nop = OP(NEXTOPER(first)); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) + r->extflags |= RXf_WHITE; + } +#endif #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, 1, "p" ); @@ -4724,11 +4834,52 @@ reStudy: SV* -Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags) +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak(aTHX_ PL_no_modify); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags) { AV *retarray = NULL; SV *ret; - if (flags & 1) + if (flags & RXapif_ALL) retarray=newAV(); if (rx && rx->paren_names) { @@ -4738,48 +4889,164 @@ Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags) SV* sv_dat=HeVAL(he_str); I32 *nums=(I32*)SvPVX(sv_dat); for ( i=0; inparens) >= nums[i] - && rx->offs[nums[i]].start != -1 - && rx->offs[nums[i]].end != -1) + if ((I32)(rx->nparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) { - ret = CALLREG_NUMBUF(rx,nums[i],NULL); + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(rx,nums[i],ret); if (!retarray) return ret; } else { ret = newSVsv(&PL_sv_undef); } if (retarray) { - SvREFCNT_inc(ret); + SvREFCNT_inc_simple_void(ret); av_push(retarray, ret); } } if (retarray) - return (SV*)retarray; + return newRV((SV*)retarray); } } return NULL; } +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, + const U32 flags) +{ + if (rx && rx->paren_names) { + if (flags & RXapif_ALL) { + return hv_exists_ent(rx->paren_names, key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags); + if (sv) { + SvREFCNT_dec(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + SV* -Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags) +{ + (void)hv_iterinit(rx->paren_names); + + return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY); +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) +{ + if (rx && rx->paren_names) { + HV *hv = rx->paren_names; + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + STRLEN len; + char *pv = HePV(temphe, len); + return newSVpvn(pv,len); + } + } + } + return NULL; +} + +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) +{ + SV *ret; + AV *av; + I32 length; + + if (rx && rx->paren_names) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(rx->paren_names)); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES)); + av = (AV*)SvRV(ret); + length = av_len(av); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) +{ + AV *av = newAV(); + + if (rx && rx->paren_names) { + HV *hv= rx->paren_names; + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + STRLEN len; + char *pv = HePV(temphe, len); + av_push(av, newSVpvn(pv,len)); + } + } + } + + return newRV((SV*)av); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv) { char *s = NULL; I32 i = 0; I32 s1, t1; - SV *sv = usesv ? usesv : newSVpvs(""); if (!rx->subbeg) { sv_setsv(sv,&PL_sv_undef); - return sv; + return; } else - if (paren == -2 && rx->offs[0].start != -1) { + if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { /* $` */ i = rx->offs[0].start; s = rx->subbeg; } else - if (paren == -1 && rx->offs[0].end != -1) { + if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { /* $' */ s = rx->subbeg + rx->offs[0].end; i = rx->sublen - rx->offs[0].end; @@ -4794,7 +5061,7 @@ Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) s = rx->subbeg + s1; } else { sv_setsv(sv,&PL_sv_undef); - return sv; + return; } assert(rx->sublen >= (s - rx->subbeg) + i ); if (i >= 0) { @@ -4832,10 +5099,86 @@ Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) } } else { sv_setsv(sv,&PL_sv_undef); + return; } - return sv; } +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak(aTHX_ PL_no_modify); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, + const I32 paren) +{ + I32 i; + I32 s1, t1; + + /* Some of this code was originally in C in F */ + switch (paren) { + /* $` / ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + /* $' / ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + /* $& / ${^MATCH}, $1, $2, ... */ + default: + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((SV*)sv); + return 0; + } + } + getlen: + if (i > 0 && RX_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_UNUSED_ARG(rx); + return newSVpvs("Regexp"); +} /* Scans the name of a named buffer from the pattern. * If flags is REG_RSN_RETURN_NULL returns null. @@ -4965,7 +5308,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) register regnode *ender = NULL; register I32 parno = 0; I32 flags; - const I32 oregflags = RExC_flags; + U32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; I32 freeze_paren = 0; @@ -5125,7 +5468,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; ret = reganode(pRExC_state, @@ -5460,7 +5803,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } ret = reganode(pRExC_state,NGROUPP,num); goto insert_if_check_paren; @@ -5564,8 +5907,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) and must be globally applied -- japhy */ switch (*RExC_parse) { CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); - case 'o': - case 'g': + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5582,7 +5925,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; - case 'c': + case CONTINUE_PAT_MOD: /* 'c' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5595,10 +5938,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } break; - case 'k': + case KEEPCOPY_PAT_MOD: /* 'p' */ if (flagsp == &negflags) { if (SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse + 1,"Useless use of (?-k)"); + vWARN(RExC_parse + 1,"Useless use of (?-p)"); } else { *flagsp |= RXf_PMf_KEEPCOPY; } @@ -5618,6 +5961,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case ')': RExC_flags |= posflags; RExC_flags &= ~negflags; + if (paren != ':') { + oregflags |= posflags; + oregflags &= ~negflags; + } nextchar(pRExC_state); if (paren != ':') { *flagp = TRYAGAIN; @@ -6132,7 +6479,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) /* RExC_parse points at the beginning brace, endbrace points at the last */ if ( name[0]=='U' && name[1]=='+' ) { - /* its a "unicode hex" notation {U+89AB} */ + /* its a "Unicode hex" notation {U+89AB} */ I32 fl = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); @@ -6356,8 +6703,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) { STRLEN numlen = 1; SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); - const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp) - : SvPVX(sv); + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; @@ -6368,8 +6714,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) if (!newlen || numlen != newlen) { uv = UNICODE_REPLACEMENT; - if (encp) - *encp = NULL; + *encp = NULL; } return uv; } @@ -6410,7 +6755,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) tryagain: - switch (*RExC_parse) { + switch ((U8)*RExC_parse) { case '^': RExC_seen_zerolen++; nextchar(pRExC_state); @@ -6494,6 +6839,22 @@ tryagain: RExC_parse++; vFAIL("Quantifier follows nothing"); break; + case 0xDF: + case 0xC3: + case 0xCE: + if (!LOC && FOLD) { + U32 len,cp; + len=0; /* silence a spurious compiler warning */ + if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { + *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ + RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ + ret = reganode(pRExC_state, FOLDCHAR, cp); + Set_Node_Length(ret, 1); /* MJD */ + nextchar(pRExC_state); /* kill whitespace under /x */ + return ret; + } + } + goto outer_default; case '\\': /* Special Escapes @@ -6579,15 +6940,25 @@ tryagain: ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'h': + ret = reg_node(pRExC_state, HORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'H': + ret = reg_node(pRExC_state, NHORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; case 'v': - ret = reganode(pRExC_state, PRUNE, 0); - ret->flags = 1; - *flagp |= SIMPLE; + ret = reg_node(pRExC_state, VERTWS); + *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'V': - ret = reganode(pRExC_state, SKIP, 0); - ret->flags = 1; - *flagp |= SIMPLE; + ret = reg_node(pRExC_state, NVERTWS); + *flagp |= HASWIDTH|SIMPLE; finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ @@ -6658,7 +7029,7 @@ tryagain: if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; @@ -6699,6 +7070,8 @@ tryagain: goto parse_named_seq; } } num = atoi(RExC_parse); + if (isg && num == 0) + vFAIL("Reference to invalid group 0"); if (isrel) { num = RExC_npar - num; if (num < 1) @@ -6754,7 +7127,8 @@ tryagain: } /* FALL THROUGH */ - default: { + default: + outer_default:{ register STRLEN len; register UV ender; register char *p; @@ -6779,7 +7153,12 @@ tryagain: if (RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - switch (*p) { + switch ((U8)*p) { + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case '^': case '$': case '.': @@ -6809,11 +7188,13 @@ tryagain: case 'C': /* Single char !DANGEROUS! */ case 'd': case 'D': /* digit class */ case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ case 'k': case 'K': /* named backref, keep marker */ case 'N': /* named char sequence */ - case 'p': case 'P': /* unicode property */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ case 's': case 'S': /* space class */ - case 'v': case 'V': /* (*PRUNE) and (*SKIP) */ + case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ case 'X': /* eXtended Unicode "combining character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ @@ -7236,6 +7617,21 @@ case ANYOF_N##NAME: \ what = WORD; \ break +#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ +ANYOF_##NAME: \ + for (value = 0; value < 256; value++) \ + if (TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '+'; \ + what = WORD; \ + break; \ +case ANYOF_N##NAME: \ + for (value = 0; value < 256; value++) \ + if (!TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '!'; \ + what = WORD; \ + break /* parse a class specification and produce either an ANYOF node that @@ -7248,10 +7644,10 @@ STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) { dVAR; - register UV value = 0; register UV nextvalue; register IV prevvalue = OOB_UNICODE; register IV range = 0; + UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ register regnode *ret; STRLEN numlen; IV namedclass; @@ -7354,6 +7750,10 @@ parseit: case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { /* We only pay attention to the first char of @@ -7532,6 +7932,8 @@ parseit: case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); case _C_C_T_(UPPER, isUPPER(value), "Upper"); case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); + case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); + case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); case ANYOF_ASCII: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ASCII); @@ -7653,12 +8055,16 @@ parseit: { if (isLOWER(prevvalue)) { for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i)) + if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } else { for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i)) + if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } } else @@ -8255,6 +8661,27 @@ S_regcurly(register const char *s) /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ +#ifdef DEBUGGING +void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { + int bit; + int set=0; + for (bit=0; bit<32; bit++) { + if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); @@ -8336,6 +8764,7 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); @@ -8375,7 +8804,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) pv_pretty(dsv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_PRETTY_ELIPSES | + PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT ); Perl_sv_catpvf(aTHX_ sv, " %s", s ); @@ -8466,6 +8895,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SVfARG((SV*)progi->data->data[ ARG( o ) ])); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + else if (k == FOLDCHAR) + Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -8616,7 +9047,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } SV * -Perl_re_intuit_string(pTHX_ regexp *prog) +Perl_re_intuit_string(pTHX_ REGEXP * const prog) { /* Assume that RE_INTUIT is set */ dVAR; GET_RE_DEBUG_FLAGS_DECL; @@ -8732,10 +9163,7 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { } RX_MATCH_COPIED_off(ret); #ifdef PERL_OLD_COPY_ON_WRITE - /* this is broken. */ - assert(0); - if (ret->saved_copy) - ret->saved_copy=NULL; + ret->saved_copy = NULL; #endif ret->mother_re = r; ret->swap = NULL; @@ -8757,7 +9185,7 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { */ void -Perl_regfree_internal(pTHX_ struct regexp *r) +Perl_regfree_internal(pTHX_ REGEXP * const r) { dVAR; RXi_GET_DECL(r,ri); @@ -8983,7 +9411,7 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) */ void * -Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) +Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) { dVAR; regexp_internal *reti; @@ -9344,7 +9772,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, PL_colors[0], PL_colors[1], (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELIPSES | + PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT ) : "???"