From: Yves Orton Date: Thu, 30 Nov 2006 22:06:38 +0000 (+0100) Subject: Continue split of perl internal regexp structures from ones that are engine specific. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=f81494553e6bd219aa5863d70777bd6f2b02d37f;p=p5sagit%2Fp5-mst-13.2.git Continue split of perl internal regexp structures from ones that are engine specific. Message-ID: <9b18b3110611301306p5cad5deal4aa55559b8c8defd@mail.gmail.com> p4raw-id: //depot/perl@29430 --- diff --git a/embed.fnc b/embed.fnc index 9d5c7e2..3b1a68e 100644 --- a/embed.fnc +++ b/embed.fnc @@ -668,9 +668,10 @@ Ap |I32 |pregexec |NN regexp* prog|NN char* stringarg \ |NN char* strend|NN char* strbeg|I32 minend \ |NN SV* screamer|U32 nosave Ap |void |pregfree |NULLOK struct regexp* r -p |char * |reg_stringify |NN MAGIC *mg|NULLOK STRLEN *lp|NULLOK U32 *flags|NULLOK I32 *haseval +Ap |void |regfree_internal|NULLOK struct regexp* r +Ap |char * |reg_stringify |NN MAGIC *mg|NULLOK STRLEN *lp|NULLOK U32 *flags|NULLOK I32 *haseval #if defined(USE_ITHREADS) -Ap |regexp*|regdupe |NN const regexp* r|NN CLONE_PARAMS* param +Ap |void* |regdupe_internal|NN const regexp* r|NN CLONE_PARAMS* param #endif Ap |regexp*|pregcomp |NN char* exp|NN char* xend|NN PMOP* pm Ap |char* |re_intuit_start|NN regexp* prog|NULLOK SV* sv|NN char* strpos \ diff --git a/embed.h b/embed.h index 9714956..618166f 100644 --- a/embed.h +++ b/embed.h @@ -680,11 +680,10 @@ #define regclass_swash Perl_regclass_swash #define pregexec Perl_pregexec #define pregfree Perl_pregfree -#ifdef PERL_CORE +#define regfree_internal Perl_regfree_internal #define reg_stringify Perl_reg_stringify -#endif #if defined(USE_ITHREADS) -#define regdupe Perl_regdupe +#define regdupe_internal Perl_regdupe_internal #endif #define pregcomp Perl_pregcomp #define re_intuit_start Perl_re_intuit_start @@ -2891,11 +2890,10 @@ #define regclass_swash(a,b,c,d,e) Perl_regclass_swash(aTHX_ a,b,c,d,e) #define pregexec(a,b,c,d,e,f,g) Perl_pregexec(aTHX_ a,b,c,d,e,f,g) #define pregfree(a) Perl_pregfree(aTHX_ a) -#ifdef PERL_CORE +#define regfree_internal(a) Perl_regfree_internal(aTHX_ a) #define reg_stringify(a,b,c,d) Perl_reg_stringify(aTHX_ a,b,c,d) -#endif #if defined(USE_ITHREADS) -#define regdupe(a,b) Perl_regdupe(aTHX_ a,b) +#define regdupe_internal(a,b) Perl_regdupe_internal(aTHX_ a,b) #endif #define pregcomp(a,b,c) Perl_pregcomp(aTHX_ a,b,c) #define re_intuit_start(a,b,c,d,e,f) Perl_re_intuit_start(aTHX_ a,b,c,d,e,f) diff --git a/ext/re/re.xs b/ext/re/re.xs index 8c6fbc1..8847901 100644 --- a/ext/re/re.xs +++ b/ext/re/re.xs @@ -15,13 +15,13 @@ extern regexp* my_regcomp (pTHX_ char* exp, char* xend, PMOP* pm); extern I32 my_regexec (pTHX_ regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, void* data, U32 flags); -extern void my_regfree (pTHX_ struct regexp* r); + extern char* my_re_intuit_start (pTHX_ regexp *prog, SV *sv, char *strpos, char *strend, U32 flags, struct re_scream_pos_data_s *data); extern SV* my_re_intuit_string (pTHX_ regexp *prog); -extern char* my_reg_stringify (pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval); +extern void my_regfree (pTHX_ struct regexp* r); #if defined(USE_ITHREADS) extern regexp* my_regdupe (pTHX_ const regexp *r, CLONE_PARAMS *param); #endif @@ -36,7 +36,6 @@ const struct regexp_engine my_reg_engine = { my_re_intuit_start, my_re_intuit_string, my_regfree, - my_reg_stringify, #if defined(USE_ITHREADS) my_regdupe #endif diff --git a/ext/re/re_top.h b/ext/re/re_top.h index 39b7fd1..178c433 100644 --- a/ext/re/re_top.h +++ b/ext/re/re_top.h @@ -13,10 +13,9 @@ #define Perl_regprop my_regprop #define Perl_re_intuit_start my_re_intuit_start #define Perl_pregcomp my_regcomp -#define Perl_pregfree my_regfree +#define Perl_regfree_internal my_regfree #define Perl_re_intuit_string my_re_intuit_string -#define Perl_regdupe my_regdupe -#define Perl_reg_stringify my_reg_stringify +#define Perl_regdupe_internal my_regdupe #define PERL_NO_GET_CONTEXT diff --git a/global.sym b/global.sym index b33fded..e69d181 100644 --- a/global.sym +++ b/global.sym @@ -386,7 +386,9 @@ Perl_regdump Perl_regclass_swash Perl_pregexec Perl_pregfree -Perl_regdupe +Perl_regfree_internal +Perl_reg_stringify +Perl_regdupe_internal Perl_pregcomp Perl_re_intuit_start Perl_re_intuit_string diff --git a/perl.h b/perl.h index f5b8037..12be192 100644 --- a/perl.h +++ b/perl.h @@ -196,7 +196,7 @@ #define CALLRUNOPS CALL_FPTR(PL_runops) -#define CALLREGCOMP(exp, xend, pm) Perl_pregcomp(aTHX_ exp,xend,pm) +#define CALLREGCOMP(exp, xend, pm) Perl_pregcomp(aTHX_ (exp),(xend),(pm)) #define CALLREGCOMP_ENG(prog, exp, xend, pm) \ CALL_FPTR(((prog)->comp))(aTHX_ exp, xend, pm) @@ -208,13 +208,22 @@ (strend),(flags),(data)) #define CALLREG_INTUIT_STRING(prog) \ CALL_FPTR((prog)->engine->checkstr)(aTHX_ (prog)) -#define CALLREGFREE(prog) \ - if(prog) CALL_FPTR((prog)->engine->free)(aTHX_ (prog)) + #define CALLREG_AS_STR(mg,lp,flags,haseval) \ - CALL_FPTR(((regexp *)((mg)->mg_obj))->engine->as_str)(aTHX_ (mg), (lp), (flags), (haseval)) + Perl_reg_stringify(aTHX_ (mg), (lp), (flags), (haseval)) #define CALLREG_STRINGIFY(mg,lp,flags) CALLREG_AS_STR(mg,lp,flags,0) + +#define CALLREGFREE(prog) \ + Perl_pregfree(aTHX_ (prog)) + +#define CALLREGFREE_PVT(prog) \ + if(prog) CALL_FPTR((prog)->engine->free)(aTHX_ (prog)) + #if defined(USE_ITHREADS) #define CALLREGDUPE(prog,param) \ + Perl_re_dup(aTHX_ (prog),(param)) + +#define CALLREGDUPE_PVT(prog,param) \ (prog ? CALL_FPTR((prog)->engine->dupe)(aTHX_ (prog),(param)) \ : (REGEXP *)NULL) #endif diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index aa54bfc..5ad10cd 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -12,13 +12,13 @@ author's experience, comments in the source code, other papers on the regex engine, feedback on the perl5-porters mail list, and no doubt other places as well. -B It should be clearly understood that this document -represents the state of the regex engine as the author understands it at -the time of writing. It is B an API definition; it is purely an -internals guide for those who want to hack the regex engine, or +B It should be clearly understood that this document represents +the state of the regex engine as the author understands it at the time of +writing. Unless stated otherwise it is B an API definition; it is +purely an internals guide for those who want to hack the regex engine, or understand how the regex engine works. Readers of this document are -expected to understand perl's regex syntax and its usage in detail. If -you want to learn about the basics of Perl's regular expressions, see +expected to understand perl's regex syntax and its usage in detail. If you +want to learn about the basics of Perl's regular expressions, see L. =head1 OVERVIEW @@ -740,113 +740,104 @@ tricky this can be: A sequence of valid UTF-8 bytes cannot be a subsequence of another valid sequence of UTF-8 bytes. -=head2 Base Struct -F contains the base structure definition: +=head2 Base Structures - typedef struct regexp { - I32 *startp; - I32 *endp; - regexp_paren_ofs *swap; - regnode *regstclass; - struct reg_substr_data *substrs; - char *precomp; /* pre-compilation regular expression */ - struct reg_data *data; /* Additional data. */ - char *subbeg; /* saved or original string - so \digit works forever. */ - #ifdef PERL_OLD_COPY_ON_WRITE - SV *saved_copy; /* If non-NULL, SV which is COW from original */ - #endif - U32 *offsets; /* offset annotations 20001228 MJD */ - I32 sublen; /* Length of string pointed by subbeg */ - I32 refcnt; - I32 minlen; /* mininum length of string to match */ - I32 minlenret; /* mininum possible length of $& */ - I32 prelen; /* length of precomp */ - U32 nparens; /* number of parentheses */ - U32 lastparen; /* last paren matched */ - U32 lastcloseparen; /* last paren matched */ - U32 reganch; /* Internal use only + - Tainted information used by regexec? */ - HV *paren_names; /* Paren names */ - const struct regexp_engine* engine; - regnode program[1]; /* Unwarranted chumminess with compiler. */ - } regexp; +There are two structures used to store a compiled regular expression. +One, the regexp structure is considered to be perl's property, and the +other is considered to be the property of the regex engine which +compiled the regular expression; in the case of the stock engine this +structure is called regexp_internal. -=over 5 - -=item C +The regexp structure contains all the data that perl needs to be aware of +to properly work with the regular expression. It includes data about +optimisations that perl can use to determine if the regex engine should +really be used, and various other control info that is needed to properly +execute patterns in various contexts such as is the pattern anchored in +some way, or what flags were used during the compile, or whether the +program contains special constructs that perl needs to be aware of. -Compiled program. Inlined into the structure so the entire struct can be -treated as a single blob. +In addition it contains two fields that are intended for the private use +of the regex engine that compiled the pattern. These are the C +and pprivate members. The C is a void pointer to an arbitrary +structure whose use and management is the responsibility of the compiling +engine. perl will never modify either of these values. -=item C +As mentioned earlier, in the case of the default engines, the C +will be a pointer to a regexp_internal structure which holds the compiled +program and any additional data that is private to the regex engine +implementation. -This field points at a reg_data structure, which is defined as follows +=head3 Perl Inspectable Data About Pattern - struct reg_data { - U32 count; - U8 *what; - void* data[1]; - }; +F contains the "public" structure definition. All regex engines +must be able to correctly build a regexp structure. -This structure is used for handling data structures that the regex engine -needs to handle specially during a clone or free operation on the compiled -product. Each element in the data array has a corresponding element in the -what array. During compilation regops that need special structures stored -will add an element to each array using the add_data() routine and then store -the index in the regop. - -=item C, C, and C + typedef struct regexp { + /* what engine created this regexp? */ + const struct regexp_engine* engine; + + /* Information about the match that the perl core uses to manage things */ + U32 extflags; /* Flags used both externally and internally */ + I32 minlen; /* mininum possible length of string to match */ + I32 minlenret; /* mininum possible length of $& */ + U32 gofs; /* chars left of pos that we search from */ + struct reg_substr_data *substrs; /* substring data about strings that must appear + in the final match, used for optimisations */ + U32 nparens; /* number of capture buffers */ + + /* private engine specific data */ + U32 intflags; /* Engine Specific Internal flags */ + void *pprivate; /* Data private to the regex engine which + created this object. */ + + /* Data about the last/current match. These are modified during matching*/ + U32 lastparen; /* last open paren matched */ + U32 lastcloseparen; /* last close paren matched */ + I32 *startp; /* Array of offsets from start of string (@-) */ + I32 *endp; /* Array of offsets from start of string (@+) */ + char *subbeg; /* saved or original string + so \digit works forever. */ + I32 sublen; /* Length of string pointed by subbeg */ + SV_SAVED_COPY /* If non-NULL, SV which is COW from original */ + + + /* Information about the match that isn't often used */ + char *precomp; /* pre-compilation regular expression */ + I32 prelen; /* length of precomp */ + I32 seen_evals; /* number of eval groups in the pattern - for security checks */ + HV *paren_names; /* Optional hash of paren names */ + + /* Refcount of this regexp */ + I32 refcnt; /* Refcount of this regexp */ + } regexp; -These fields are used to keep track of how many paren groups could be matched -in the pattern, which was the last open paren to be entered, and which was -the last close paren to be entered. +The fields are discussed in more detail below: -=item C, C, C +=over 5 -These fields store arrays that are used to hold the offsets of the begining -and end of each capture group that has matched. -1 is used to indicate no match. -C is an extra set of startp/endp stored in a C -struct. This is used when the last successful match was from same pattern -as the current pattern, so that a partial match doesn't overwrite the -previous match's results. When this field is data filled the matching -engine will swap buffers before every match attempt. If the match fails, -then it swaps them back. If it's successful it leaves them. This field -is populated on demand and is by default null. +=item C -These are the source for @- and @+. +The number of times the structure is referenced. When this falls to 0 +the regexp is automatically freed by a call to pregfree. -=item C C C +=item C -These are used during execution phase for managing search and replace -patterns. +This field points at a regexp_engine structure which contains pointers +to the subroutine that are to be used for performing a match. It +is the compiling routines responsibility to populate this field before +returning the regexp object. -=item C C C +=item C C Used for debugging purposes. C holds a copy of the pattern -that was compiled, offsets holds a mapping of offset in the C -to offset in the C string. This is only used by ActiveStates -visual regex debugger. +that was compiled. -=item C - -Holds information on the longest string that must occur at a fixed -offset from the start of the pattern, and the longest string that must -occur at a floating offset from the start of the pattern. Used to do -Fast-Boyer-Moore searches on the string to find out if its worth using -the regex engine at all, and if so where in the string to search. +=item C -=item C - -Special regop that is used by C to check if a pattern -can match at a certain position. For instance if the regex engine knows -that the pattern must start with a 'Z' then it can scan the string until -it finds one and then launch the regex engine from there. The routine -that handles this is called C. Sometimes this field -points at a regop embedded in the program, and sometimes it points at -an independent synthetic regop that has been constructed by the optimiser. +This is used to store various flags about the pattern, such as whether it +contains a \G or a ^ or $ symbol. =item C C @@ -871,10 +862,15 @@ content. This distinction is particularly important as the substitution logic uses the C to tell whether it can do in-place substition which can result in considerable speedup. -=item C +=item C -This is used to store various flags about the pattern, such as whether it -contains a \G or a ^ or $ symbol. +Left offset from pos() to start match at. + +=item C, C, and C + +These fields are used to keep track of how many paren groups could be matched +in the pattern, which was the last open paren to be entered, and which was +the last close paren to be entered. =item C @@ -885,17 +881,102 @@ pv being an embedded array of I32. The values may also be contained independently in the data array in cases where named backreferences are used. -=item C +=item C -The number of times the structure is referenced. When this falls to 0 -the regexp is automatically freed by a call to pregfree. +Holds information on the longest string that must occur at a fixed +offset from the start of the pattern, and the longest string that must +occur at a floating offset from the start of the pattern. Used to do +Fast-Boyer-Moore searches on the string to find out if its worth using +the regex engine at all, and if so where in the string to search. -=item C +=item C, C, -This field points at a regexp_engine structure which contains pointers -to the subroutine that are to be used for performing a match. It -is the compiling routines responsibility to populate this field before -returning the regexp object. +These fields store arrays that are used to hold the offsets of the begining +and end of each capture group that has matched. -1 is used to indicate no match. + +These are the source for @- and @+. + +=item C C C + +These are used during execution phase for managing search and replace +patterns. + +=item C + +This stores the number of eval groups in the pattern. This is used +for security purposes when embedding compiled regexes into larger +patterns. + +=back + +=head3 Engine Private Data About Pattern + +Additionally regexp.h contains the following "private" definition which is perl +specific and is only of curiosity value to other engine implementations. + + typedef struct regexp_internal { + regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */ + U32 *offsets; /* offset annotations 20001228 MJD + data about mapping the program to the + string*/ + regnode *regstclass; /* Optional startclass as identified or constructed + by the optimiser */ + struct reg_data *data; /* Additional miscellaneous data used by the program. + Used to make it easier to clone and free arbitrary + data that the regops need. Often the ARG field of + a regop is an index into this structure */ + regnode program[1]; /* Unwarranted chumminess with compiler. */ + } regexp_internal; + +=over 5 + +=item C + +C is an extra set of startp/endp stored in a C +struct. This is used when the last successful match was from same pattern +as the current pattern, so that a partial match doesn't overwrite the +previous match's results. When this field is data filled the matching +engine will swap buffers before every match attempt. If the match fails, +then it swaps them back. If it's successful it leaves them. This field +is populated on demand and is by default null. + +=item C + +Offsets holds a mapping of offset in the C +to offset in the C string. This is only used by ActiveStates +visual regex debugger. + +=item C + +Special regop that is used by C to check if a pattern +can match at a certain position. For instance if the regex engine knows +that the pattern must start with a 'Z' then it can scan the string until +it finds one and then launch the regex engine from there. The routine +that handles this is called C. Sometimes this field +points at a regop embedded in the program, and sometimes it points at +an independent synthetic regop that has been constructed by the optimiser. + +=item C + +This field points at a reg_data structure, which is defined as follows + + struct reg_data { + U32 count; + U8 *what; + void* data[1]; + }; + +This structure is used for handling data structures that the regex engine +needs to handle specially during a clone or free operation on the compiled +product. Each element in the data array has a corresponding element in the +what array. During compilation regops that need special structures stored +will add an element to each array using the add_data() routine and then store +the index in the regop. + +=item C + +Compiled program. Inlined into the structure so the entire struct can be +treated as a single blob. =back @@ -907,21 +988,21 @@ a constant structure of the following format: typedef struct regexp_engine { regexp* (*comp) (pTHX_ char* exp, char* xend, PMOP* pm); - I32 (*exec) (pTHX_ regexp* prog, char* stringarg, char* strend, - char* strbeg, I32 minend, SV* screamer, - void* data, U32 flags); + I32 (*exec) (pTHX_ regexp* prog, char* stringarg, char* strend, + char* strbeg, I32 minend, SV* screamer, + void* data, U32 flags); char* (*intuit) (pTHX_ regexp *prog, SV *sv, char *strpos, - char *strend, U32 flags, - struct re_scream_pos_data_s *data); - SV* (*checkstr) (pTHX_ regexp *prog); + char *strend, U32 flags, + struct re_scream_pos_data_s *data); + SV* (*checkstr) (pTHX_ regexp *prog); void (*free) (pTHX_ struct regexp* r); #ifdef USE_ITHREADS - regexp* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param); - #endif + void* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param); + #endif } regexp_engine; When a regexp is compiled its C field is then set to point at -the appropriate structure so that when it needs to be used it can find +the appropriate structure so that when it needs to be used Perl can find the right routines to do so. In order to install a new regexp handler, C<$^H{regcomp}> is set @@ -964,7 +1045,9 @@ Execute a regexp. Find the start position where a regex match should be attempted, or possibly whether the regex engine should not be run because the -pattern can't match. +pattern can't match. This is called as appropriate by the core +depending on the values of the extflags member of the regexp +structure. =item checkstr @@ -977,16 +1060,28 @@ for optimising matches. void free(regexp *prog); -Release any resources allocated to store this pattern. After this -call prog is an invalid pointer. +Called by perl when it is freeing a regexp pattern so that the engine +can release any resources pointed to by the C member of the +regexp structure. This is only responsible for freeing private data, +perl will handle releasing anything else contained in the regexp structure. =item dupe - regexp* dupe(const regexp *r, CLONE_PARAMS *param); + void* dupe(const regexp *r, CLONE_PARAMS *param); On threaded builds a regexp may need to be duplicated so that the pattern can be used by mutiple threads. This routine is expected to handle the -duplication. On unthreaded builds this field doesnt exist. +duplication of any private data pointed to by the C member of +the regexp structure. It will be called with the preconstructed new +regexp structure as an argument, the C member will point at +the B private structue, and it is this routines responsibility to +construct a copy and return a pointer to it (which perl will then use to +overwrite the field as passed to this routine.) + +This allows the engine to dupe its private data but also if necessary +modify the final structure if it really must. + +On unthreaded builds this field doesn't exist. =back diff --git a/proto.h b/proto.h index 9134296..9add7e0 100644 --- a/proto.h +++ b/proto.h @@ -1833,11 +1833,12 @@ PERL_CALLCONV I32 Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* stren __attribute__nonnull__(pTHX_6); PERL_CALLCONV void Perl_pregfree(pTHX_ struct regexp* r); +PERL_CALLCONV void Perl_regfree_internal(pTHX_ struct regexp* r); PERL_CALLCONV char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval) __attribute__nonnull__(pTHX_1); #if defined(USE_ITHREADS) -PERL_CALLCONV regexp* Perl_regdupe(pTHX_ const regexp* r, CLONE_PARAMS* param) +PERL_CALLCONV void* Perl_regdupe_internal(pTHX_ const regexp* r, CLONE_PARAMS* param) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); diff --git a/regcomp.c b/regcomp.c index 30b0660..0e506e7 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3896,6 +3896,7 @@ S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s) return count; } +/*XXX: todo make this not included in a non debugging perl */ #ifndef PERL_IN_XSUB_RE void Perl_reginitcolors(pTHX) @@ -4133,7 +4134,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) ri->program[RExC_size].type = 255; #endif /* Store the count of eval-groups for security checks: */ - RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals; + RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) return(NULL); @@ -8461,36 +8462,32 @@ Perl_re_intuit_string(pTHX_ regexp *prog) } /* - pregfree - free a regexp + pregfree() - See regdupe below if you change anything here. + handles refcounting and freeing the perl core regexp structure. When + it is necessary to actually free the structure the first thing it + does is call the 'free' method of the regexp_engine associated to to + the regexp, allowing the handling of the void *pprivate; member + first. (This routine is not overridable by extensions, which is why + the extensions free is called first.) + + See regdupe and regdupe_internal if you change anything here. */ - +#ifndef PERL_IN_XSUB_RE void Perl_pregfree(pTHX_ struct regexp *r) { dVAR; - RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; if (!r || (--r->refcnt > 0)) return; - DEBUG_COMPILE_r({ - if (!PL_colorset) - reginitcolors(); - { - SV *dsv= sv_newmortal(); - RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), - dsv, r->precomp, r->prelen, 60); - PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", - PL_colors[4],PL_colors[5],s); - } - }); - + + CALLREGFREE_PVT(r); /* free the private data */ + /* gcov results gave these as non-null 100% of the time, so there's no optimisation in checking them before calling Safefree */ Safefree(r->precomp); - Safefree(ri->offsets); /* 20010421 MJD */ RX_MATCH_COPY_FREE(r); #ifdef PERL_OLD_COPY_ON_WRITE if (r->saved_copy) @@ -8509,6 +8506,45 @@ Perl_pregfree(pTHX_ struct regexp *r) } if (r->paren_names) SvREFCNT_dec(r->paren_names); + + Safefree(r->startp); + Safefree(r->endp); + Safefree(r); +} +#endif + +/* regfree_internal() + + Free the private data in a regexp. This is overloadable by + extensions. Perl takes care of the regexp structure in pregfree(), + this covers the *pprivate pointer which technically perldoesnt + know about, however of course we have to handle the + regexp_internal structure when no extension is in use. + + Note this is called before freeing anything in the regexp + structure. + */ + +void +Perl_regfree_internal(pTHX_ struct regexp *r) +{ + dVAR; + RXi_GET_DECL(r,ri); + GET_RE_DEBUG_FLAGS_DECL; + + DEBUG_COMPILE_r({ + if (!PL_colorset) + reginitcolors(); + { + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8), + dsv, r->precomp, r->prelen, 60); + PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + PL_colors[4],PL_colors[5],s); + } + }); + + Safefree(ri->offsets); /* 20010421 MJD */ if (ri->data) { int n = ri->data->count; PAD* new_comppad = NULL; @@ -8597,15 +8633,12 @@ Perl_pregfree(pTHX_ struct regexp *r) Safefree(ri->data->what); Safefree(ri->data); } - Safefree(r->startp); - Safefree(r->endp); if (ri->swap) { Safefree(ri->swap->startp); Safefree(ri->swap->endp); Safefree(ri->swap); } Safefree(ri); - Safefree(r); } #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) @@ -8620,16 +8653,21 @@ Perl_pregfree(pTHX_ struct regexp *r) given regexp structure. It is a no-op when not under USE_ITHREADS. (Originally this *was* re_dup() for change history see sv.c) - See pregfree() above if you change anything here. + After all of the core data stored in struct regexp is duplicated + the regexp_engine.dupe method is used to copy any private data + stored in the *pprivate pointer. This allows extensions to handle + any duplication it needs to do. + + See pregfree() and regfree_internal() if you change anything here. */ #if defined(USE_ITHREADS) +#ifndef PERL_IN_XSUB_RE regexp * -Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) +Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) { dVAR; regexp *ret; - regexp_internal *reti; - int i, len, npar; + int i, npar; struct reg_substr_datum *s; RXi_GET_DECL(r,ri); @@ -8639,26 +8677,13 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r))) return ret; - len = ri->offsets[0]; + npar = r->nparens+1; - Newxz(ret, 1, regexp); - Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); - RXi_SET(ret,reti); - Copy(ri->program, reti->program, len+1, regnode); - Newx(ret->startp, npar, I32); Copy(r->startp, ret->startp, npar, I32); Newx(ret->endp, npar, I32); - Copy(r->startp, ret->startp, npar, I32); - if(ri->swap) { - Newx(reti->swap, 1, regexp_paren_ofs); - /* no need to copy these */ - Newx(reti->swap->startp, npar, I32); - Newx(reti->swap->endp, npar, I32); - } else { - reti->swap = NULL; - } + Copy(r->endp, ret->endp, npar, I32); Newx(ret->substrs, 1, struct reg_substr_data); for (s = ret->substrs->data, i = 0; i < 3; i++, s++) { @@ -8668,6 +8693,78 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) s->substr = sv_dup_inc(r->substrs->data[i].substr, param); s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param); } + + + ret->precomp = SAVEPVN(r->precomp, r->prelen); + ret->refcnt = r->refcnt; + ret->minlen = r->minlen; + ret->minlenret = r->minlenret; + ret->prelen = r->prelen; + ret->nparens = r->nparens; + ret->lastparen = r->lastparen; + ret->lastcloseparen = r->lastcloseparen; + ret->intflags = r->intflags; + ret->extflags = r->extflags; + + ret->sublen = r->sublen; + + ret->engine = r->engine; + + ret->paren_names = hv_dup_inc(r->paren_names, param); + + if (RX_MATCH_COPIED(ret)) + ret->subbeg = SAVEPVN(r->subbeg, r->sublen); + else + ret->subbeg = NULL; +#ifdef PERL_OLD_COPY_ON_WRITE + ret->saved_copy = NULL; +#endif + + ret->pprivate = r->pprivate; + RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); + + ptr_table_store(PL_ptr_table, r, ret); + return ret; +} +#endif /* PERL_IN_XSUB_RE */ + +/* + regdupe_internal() + + This is the internal complement to regdupe() which is used to copy + the structure pointed to by the *pprivate pointer in the regexp. + This is the core version of the extension overridable cloning hook. + The regexp structure being duplicated will be copied by perl prior + to this and will be provided as the regexp *r argument, however + with the /old/ structures pprivate pointer value. Thus this routine + may override any copying normally done by perl. + + It returns a pointer to the new regexp_internal structure. +*/ + +void * +Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) +{ + dVAR; + regexp_internal *reti; + int len, npar; + RXi_GET_DECL(r,ri); + + npar = r->nparens+1; + len = ri->offsets[0]; + + Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); + Copy(ri->program, reti->program, len+1, regnode); + + if(ri->swap) { + Newx(reti->swap, 1, regexp_paren_ofs); + /* no need to copy these */ + Newx(reti->swap->startp, npar, I32); + Newx(reti->swap->endp, npar, I32); + } else { + reti->swap = NULL; + } + reti->regstclass = NULL; if (ri->data) { @@ -8732,36 +8829,11 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) Newx(reti->offsets, 2*len+1, U32); Copy(ri->offsets, reti->offsets, 2*len+1, U32); - - ret->precomp = SAVEPVN(r->precomp, r->prelen); - ret->refcnt = r->refcnt; - ret->minlen = r->minlen; - ret->minlenret = r->minlenret; - ret->prelen = r->prelen; - ret->nparens = r->nparens; - ret->lastparen = r->lastparen; - ret->lastcloseparen = r->lastcloseparen; - ret->intflags = r->intflags; - ret->extflags = r->extflags; - - ret->sublen = r->sublen; - - ret->engine = r->engine; - ret->paren_names = hv_dup_inc(r->paren_names, param); - - if (RX_MATCH_COPIED(ret)) - ret->subbeg = SAVEPVN(r->subbeg, r->sublen); - else - ret->subbeg = NULL; -#ifdef PERL_OLD_COPY_ON_WRITE - ret->saved_copy = NULL; -#endif - - ptr_table_store(PL_ptr_table, r, ret); - return ret; + return (void*)reti; } -#endif + +#endif /* USE_ITHREADS */ /* reg_stringify() @@ -8774,29 +8846,28 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) resulting string If flags is nonnull and the returned string contains UTF8 then - (flags & 1) will be true. + (*flags & 1) will be true. If haseval is nonnull then it is used to return whether the pattern contains evals. Normally called via macro: - CALLREG_STRINGIFY(mg,0,0); + CALLREG_STRINGIFY(mg,&len,&utf8); And internally with - CALLREG_AS_STR(mg,lp,flags,haseval) + CALLREG_AS_STR(mg,&lp,&flags,&haseval) See sv_2pv_flags() in sv.c for an example of internal usage. */ - +#ifndef PERL_IN_XSUB_RE char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { dVAR; const regexp * const re = (regexp *)mg->mg_obj; - RXi_GET_DECL(re,ri); - + if (!mg->mg_ptr) { const char *fptr = "msix"; char reflags[6]; @@ -8859,7 +8930,7 @@ Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { mg->mg_ptr[mg->mg_len] = 0; } if (haseval) - *haseval = ri->program[0].next_off; + *haseval = re->seen_evals; if (flags) *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); @@ -8868,8 +8939,6 @@ Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { return mg->mg_ptr; } - -#ifndef PERL_IN_XSUB_RE /* - regnext - dig the "next" pointer out of a node */ diff --git a/regcomp.h b/regcomp.h index 167b2c0..9cf03b4 100644 --- a/regcomp.h +++ b/regcomp.h @@ -408,10 +408,9 @@ EXTCONST regexp_engine PL_core_reg_engine = { Perl_regexec_flags, Perl_re_intuit_start, Perl_re_intuit_string, - Perl_pregfree, - Perl_reg_stringify, + Perl_regfree_internal, #if defined(USE_ITHREADS) - Perl_regdupe + Perl_regdupe_internal #endif }; #endif /* DOINIT */ diff --git a/regexp.h b/regexp.h index f2c9705..a0ba5c7 100644 --- a/regexp.h +++ b/regexp.h @@ -42,46 +42,45 @@ typedef struct regexp_paren_ofs { #else #define SV_SAVED_COPY #endif - +/* this is ordered such that the most commonly used + fields are at the start of the struct */ typedef struct regexp { - /* Generic details */ - const struct regexp_engine* engine; /* what created this regexp? */ - I32 refcnt; /* Refcount of this regexp */ - - /* The original string as passed to the compilation routine */ - char *precomp; /* pre-compilation regular expression */ - I32 prelen; /* length of precomp */ - - /* Used for generic optimisations by the perl core. - All engines are expected to provide this information. */ + /* what engine created this regexp? */ + const struct regexp_engine* engine; + + /* Information about the match that the perl core uses to manage things */ U32 extflags; /* Flags used both externally and internally */ I32 minlen; /* mininum possible length of string to match */ I32 minlenret; /* mininum possible length of $& */ U32 gofs; /* chars left of pos that we search from */ - U32 nparens; /* number of capture buffers */ - HV *paren_names; /* Optional hash of paren names */ - struct reg_substr_data *substrs; /* substring data about strings that must appear + struct reg_substr_data *substrs; /* substring data about strings that must appear in the final match, used for optimisations */ + U32 nparens; /* number of capture buffers */ - /* Data about the last/current match. Used by the core and therefore - must be populated by all engines. */ + /* private engine specific data */ + U32 intflags; /* Engine Specific Internal flags */ + void *pprivate; /* Data private to the regex engine which + created this object. */ + + /* Data about the last/current match. These are modified during matching*/ + U32 lastparen; /* last open paren matched */ + U32 lastcloseparen; /* last close paren matched */ + I32 *startp; /* Array of offsets from start of string (@-) */ + I32 *endp; /* Array of offsets from start of string (@+) */ char *subbeg; /* saved or original string so \digit works forever. */ I32 sublen; /* Length of string pointed by subbeg */ - I32 *startp; /* Array of offsets from start of string (@-) */ - I32 *endp; /* Array of offsets from start of string (@+) */ - SV_SAVED_COPY /* If non-NULL, SV which is COW from original */ - U32 lastparen; /* last open paren matched */ - U32 lastcloseparen; /* last close paren matched */ - - /* Perl Regex Engine specific data. Other engines shouldn't need - to touch this. Should be refactored out into a different structure - and accessed via the *pprivate field. (except intflags) */ - U32 intflags; /* Internal flags */ - void *pprivate; /* Data private to the regex engine which - created this object. Perl will never mess with - this member at all. */ + + + /* Information about the match that isn't often used */ + char *precomp; /* pre-compilation regular expression */ + I32 prelen; /* length of precomp */ + I32 seen_evals; /* number of eval groups in the pattern - for security checks */ + HV *paren_names; /* Optional hash of paren names */ + + /* Refcount of this regexp */ + I32 refcnt; /* Refcount of this regexp */ } regexp; @@ -119,9 +118,8 @@ typedef struct regexp_engine { struct re_scream_pos_data_s *data); SV* (*checkstr) (pTHX_ regexp *prog); void (*free) (pTHX_ struct regexp* r); - char* (*as_str) (pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval); #ifdef USE_ITHREADS - regexp* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param); + void* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param); #endif } regexp_engine; diff --git a/sv.c b/sv.c index c7de314..a675df2 100644 --- a/sv.c +++ b/sv.c @@ -9472,15 +9472,6 @@ ptr_table_* functions. #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) -/* Duplicate a regexp. Required reading: pregcomp() and pregfree() in - regcomp.c. AMS 20010712 */ - -REGEXP * -Perl_re_dup(pTHX_ const REGEXP *r, CLONE_PARAMS *param) -{ - return CALLREGDUPE(r,param); -} - /* duplicate a file handle */ PerlIO * @@ -9575,7 +9566,7 @@ Perl_mg_dup(pTHX_ MAGIC *mg, CLONE_PARAMS* param) nmg->mg_type = mg->mg_type; nmg->mg_flags = mg->mg_flags; if (mg->mg_type == PERL_MAGIC_qr) { - nmg->mg_obj = (SV*)re_dup((REGEXP*)mg->mg_obj, param); + nmg->mg_obj = (SV*)CALLREGDUPE((REGEXP*)mg->mg_obj, param); } else if(mg->mg_type == PERL_MAGIC_backref) { /* The backref AV has its reference count deliberately bumped by @@ -10935,7 +10926,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, SvREPADTMP(regex) ? sv_dup_inc(regex, param) : SvREFCNT_inc( - newSViv(PTR2IV(re_dup( + newSViv(PTR2IV(CALLREGDUPE( INT2PTR(REGEXP *, SvIVX(regex)), param)))) ; av_push(PL_regex_padav, sv);