From: Nicholas Clark Date: Sat, 5 Jan 2008 13:54:38 +0000 (+0000) Subject: Add RX_UTF8(), which is effectively SvUTF8() but for regexps. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=efd26800e76b6f876fd6abe3a3f7e3e4128150a9;p=p5sagit%2Fp5-mst-13.2.git Add RX_UTF8(), which is effectively SvUTF8() but for regexps. Remove RXp_PRECOMP() and RXp_WRAPPED(). Change the parameter of S_debug_start_match() from regexp to REGEXP. Change its callers [the only part wrong for 5.10.x] p4raw-id: //depot/perl@32840 --- diff --git a/embed.fnc b/embed.fnc index b11cd6e..85fca2a 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1454,7 +1454,9 @@ ERs |I32 |reg_check_named_buff_matched |NN const regexp *rex|NN const regnode *p # ifdef DEBUGGING Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|NN const char *loc_regeol\ |NN const char *loc_bostr|NN const char *loc_reg_starttry|const bool do_utf8 -Es |void |debug_start_match|NN const regexp *prog|const bool do_utf8|NN const char *start|NN const char *end|NN const char *blurb +Es |void |debug_start_match|NN const REGEXP *prog|const bool do_utf8\ + |NN const char *start|NN const char *end\ + |NN const char *blurb # endif #endif diff --git a/proto.h b/proto.h index 5bbb593..d24ba1e 100644 --- a/proto.h +++ b/proto.h @@ -3886,7 +3886,7 @@ STATIC void S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, con __attribute__nonnull__(pTHX_4) __attribute__nonnull__(pTHX_5); -STATIC void S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8, const char *start, const char *end, const char *blurb) +STATIC void S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, const char *start, const char *end, const char *blurb) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_3) __attribute__nonnull__(pTHX_4) diff --git a/regcomp.c b/regcomp.c index b7fd317..33ed6fc 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4294,8 +4294,8 @@ redo_first_pass: + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - Newx(RXp_WRAPPED(r), RXp_WRAPLEN(r) + 1, char ); - p = RXp_WRAPPED(r); + Newx(RX_WRAPPED(rx), RXp_WRAPLEN(r) + 1, char ); + p = RX_WRAPPED(rx); *p++='('; *p++='?'; if (has_p) *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ @@ -4319,8 +4319,8 @@ redo_first_pass: *p++ = ':'; Copy(RExC_precomp, p, plen, char); - assert ((RXp_WRAPPED(r) - p) < 16); - r->pre_prefix = p - RXp_WRAPPED(r); + assert ((RX_WRAPPED(rx) - p) < 16); + r->pre_prefix = p - RX_WRAPPED(rx); p += plen; if (has_runon) *p++ = '\n'; @@ -4798,7 +4798,7 @@ reStudy: #ifdef STUPID_PATTERN_CHECKS if (RX_PRELEN(r) == 0) r->extflags |= RXf_NULL; - if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == ' ') + if (r->extflags & RXf_SPLIT && RX_PRELEN(r) == 1 && RX_PRECOMP(rx)[0] == ' ') /* XXX: this should happen BEFORE we compile */ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); else if (RX_PRELEN(r) == 3 && memEQ("\\s+", RXp_PRECOMP(r), 3)) @@ -4806,7 +4806,7 @@ reStudy: else if (RX_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == '^') r->extflags |= RXf_START_ONLY; #else - if (r->extflags & RXf_SPLIT && RXp_PRELEN(r) == 1 && RXp_PRECOMP(r)[0] == ' ') + if (r->extflags & RXf_SPLIT && RXp_PRELEN(r) == 1 && RX_PRECOMP(rx)[0] == ' ') /* XXX: this should happen BEFORE we compile */ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); else { @@ -9159,7 +9159,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx) CALLREGFREE_PVT(rx); /* free the private data */ if (r->paren_names) SvREFCNT_dec(r->paren_names); - Safefree(RXp_WRAPPED(r)); + Safefree(RX_WRAPPED(rx)); } if (r->substrs) { if (r->anchored_substr) @@ -9258,7 +9258,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) { SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), - dsv, RXp_PRECOMP(r), RXp_PRELEN(r), 60); + dsv, RX_PRECOMP(rx), RXp_PRELEN(r), 60); PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } diff --git a/regexec.c b/regexec.c index 2b7ae4a..2fb1fbe 100644 --- a/regexec.c +++ b/regexec.c @@ -401,7 +401,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, PL_reg_flags |= RF_utf8; } DEBUG_EXECUTE_r( - debug_start_match(prog, do_utf8, strpos, strend, + debug_start_match(rx, do_utf8, strpos, strend, sv ? "Guessing start of match in sv for" : "Guessing start of match in string for"); ); @@ -1784,7 +1784,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre RX_MATCH_UTF8_set(rx, do_utf8); DEBUG_EXECUTE_r( - debug_start_match(prog, do_utf8, startpos, strend, + debug_start_match(rx, do_utf8, startpos, strend, "Matching"); ); @@ -2553,15 +2553,15 @@ regmatch(), slabs allocated since entry are freed. #ifdef DEBUGGING STATIC void -S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8, +S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, const char *start, const char *end, const char *blurb) { - const bool utf8_pat= prog->extflags & RXf_UTF8 ? 1 : 0; + const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; if (!PL_colorset) reginitcolors(); { RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), - RXp_PRECOMP(prog), RXp_PRELEN(prog), 60); + RX_PRECOMP(prog), RX_PRELEN(prog), 60); RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1), start, end - start, 60); @@ -3775,7 +3775,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) re->sublen = rex->sublen; rei = RXi_GET(re); DEBUG_EXECUTE_r( - debug_start_match(re, do_utf8, locinput, PL_regeol, + debug_start_match(re_sv, do_utf8, locinput, PL_regeol, "Matching embedded"); ); startpoint = rei->program + 1; diff --git a/regexp.h b/regexp.h index 1a2b17c..f76a8ea 100644 --- a/regexp.h +++ b/regexp.h @@ -358,19 +358,17 @@ and check for NULL. ? RX_MATCH_COPIED_on(prog) \ : RX_MATCH_COPIED_off(prog)) -#define RXp_PRECOMP(rx) ((rx)->wrapped + (rx)->pre_prefix) /* FIXME? Are we hardcoding too much here and constraining plugin extension writers? Specifically, the value 1 assumes that the wrapped version always has exactly one character at the end, a ')'. Will that always be true? */ #define RXp_PRELEN(rx) ((rx)->wraplen - (rx)->pre_prefix - 1) -#define RXp_WRAPPED(rx) ((rx)->wrapped) #define RXp_WRAPLEN(rx) ((rx)->wraplen) #define RXp_EXTFLAGS(rx) ((rx)->extflags) /* For source compatibility. We used to store these explicitly. */ -#define RX_PRECOMP(prog) RXp_PRECOMP((struct regexp *)SvANY(prog)) +#define RX_PRECOMP(prog) (((struct regexp *)SvANY(prog))->wrapped + ((struct regexp *)SvANY(prog))->pre_prefix) #define RX_PRELEN(prog) RXp_PRELEN((struct regexp *)SvANY(prog)) -#define RX_WRAPPED(prog) RXp_WRAPPED((struct regexp *)SvANY(prog)) +#define RX_WRAPPED(prog) (((struct regexp *)SvANY(prog))->wrapped) #define RX_WRAPLEN(prog) RXp_WRAPLEN((struct regexp *)SvANY(prog)) #define RX_CHECK_SUBSTR(prog) (((struct regexp *)SvANY(prog))->check_substr) #define RX_EXTFLAGS(prog) RXp_EXTFLAGS((struct regexp *)SvANY(prog)) @@ -417,6 +415,9 @@ and check for NULL. #define RX_MATCH_UTF8_set(prog, t) ((t) \ ? (RX_MATCH_UTF8_on(prog), (PL_reg_match_utf8 = 1)) \ : (RX_MATCH_UTF8_off(prog), (PL_reg_match_utf8 = 0))) + +/* Whether the pattern stored at RX_WRAPPED is in UTF-8 */ +#define RX_UTF8(prog) (RX_EXTFLAGS(prog) & RXf_UTF8) #define REXEC_COPY_STR 0x01 /* Need to copy the string. */ #define REXEC_CHECKED 0x02 /* check_substr already checked. */