*/
#ifdef PERL_EXT_RE_BUILD
-/* need to replace pregcomp et al, so enable that */
-# ifndef PERL_IN_XSUB_RE
-# define PERL_IN_XSUB_RE
-# endif
-/* need access to debugger hooks */
-# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
-# define DEBUGGING
-# endif
+#include "re_top.h"
#endif
-#ifdef PERL_IN_XSUB_RE
-/* We *really* need to overwrite these symbols: */
-# define Perl_pregcomp my_regcomp
-# define Perl_regdump my_regdump
-# define Perl_regprop my_regprop
-# define Perl_pregfree my_regfree
-# define Perl_re_intuit_string my_re_intuit_string
-/* *These* symbols are masked to allow static link. */
-# define Perl_regnext my_regnext
-# define Perl_save_re_context my_save_re_context
-# define Perl_reginitcolors my_reginitcolors
-
-# define PERL_NO_GET_CONTEXT
-#endif
-
-/*SUPPRESS 112*/
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
**** Alterations to Henry's code are...
****
**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
- **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
+ **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
#endif
#define REG_COMP_C
-#include "regcomp.h"
+#ifdef PERL_IN_XSUB_RE
+# include "re_comp.h"
+#else
+# include "regcomp.h"
+#endif
#ifdef op
#undef op
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#endif
+#ifdef DEBUGGING
+ char *lastparse;
+ I32 lastnum;
+#define RExC_lastparse (pRExC_state->lastparse)
+#define RExC_lastnum (pRExC_state->lastnum)
+#endif
} RExC_state_t;
#define RExC_flags (pRExC_state->flags)
#define SPSTART 0x4 /* Starts with * or +. */
#define TRYAGAIN 0x8 /* Weeded out a declaration. */
+#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
+
/* Length of a variant. */
typedef struct scan_data_t {
} STMT_END
/*
- * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
- * args. Show regex, up to a maximum length. If it's too long, chop and add
- * "...".
- */
-#define FAIL2(pat,msg) STMT_START { \
- const char *ellipses = ""; \
- IV len = RExC_end - RExC_precomp; \
- \
- if (!SIZE_ONLY) \
- SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
- if (len > RegexLengthToShowInErrorMessages) { \
- /* chop 10 shorter than the max, to ensure meaning of "..." */ \
- len = RegexLengthToShowInErrorMessages - 10; \
- ellipses = "..."; \
- } \
- S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
- msg, (int)len, RExC_precomp, ellipses); \
-} STMT_END
-
-
-/*
* Simple_vFAIL -- like FAIL, but marks the current location in the scan
*/
#define Simple_vFAIL(m) STMT_START { \
- IV offset = RExC_parse - RExC_precomp; \
+ const IV offset = RExC_parse - RExC_precomp; \
Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
* Like Simple_vFAIL(), but accepts two arguments.
*/
#define Simple_vFAIL2(m,a1) STMT_START { \
- IV offset = RExC_parse - RExC_precomp; \
+ const IV offset = RExC_parse - RExC_precomp; \
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
* Like Simple_vFAIL(), but accepts three arguments.
*/
#define Simple_vFAIL3(m, a1, a2) STMT_START { \
- IV offset = RExC_parse - RExC_precomp; \
+ const IV offset = RExC_parse - RExC_precomp; \
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
* Like Simple_vFAIL(), but accepts four arguments.
*/
#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
- IV offset = RExC_parse - RExC_precomp; \
+ const IV offset = RExC_parse - RExC_precomp; \
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
-/*
- * Like Simple_vFAIL(), but accepts five arguments.
- */
-#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
- IV offset = RExC_parse - RExC_precomp; \
- S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
- (int)offset, RExC_precomp, RExC_precomp + offset); \
-} STMT_END
-
-
#define vWARN(loc,m) STMT_START { \
- IV offset = loc - RExC_precomp; \
+ const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARNdep(loc,m) STMT_START { \
- IV offset = loc - RExC_precomp; \
+ const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
"%s" REPORT_LOCATION, \
m, (int)offset, RExC_precomp, RExC_precomp + offset); \
#define vWARN2(loc, m, a1) STMT_START { \
- IV offset = loc - RExC_precomp; \
+ const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN3(loc, m, a1, a2) STMT_START { \
- IV offset = loc - RExC_precomp; \
+ const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
- IV offset = loc - RExC_precomp; \
+ const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
- IV offset = loc - RExC_precomp; \
+ const IV offset = loc - RExC_precomp; \
Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
__LINE__, (node), (byte))); \
if((node) < 0) { \
- Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
+ Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
} else { \
RExC_offsets[2*(node)-1] = (byte); \
} \
#define Set_Node_Length_To_R(node,len) STMT_START { \
if (! SIZE_ONLY) { \
MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
- __LINE__, (node), (len))); \
+ __LINE__, (int)(node), (int)(len))); \
if((node) < 0) { \
- Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
+ Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
} else { \
RExC_offsets[2*(node)] = (len); \
} \
floating substrings if needed. */
STATIC void
-S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
+S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
{
const STRLEN l = CHR_SVLEN(data->last_found);
const STRLEN old_l = CHR_SVLEN(*data->longest);
}
SvCUR_set(data->last_found, 0);
{
- SV * sv = data->last_found;
- MAGIC *mg =
- SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
- if (mg && mg->mg_len > 0)
- mg->mg_len = 0;
+ SV * const sv = data->last_found;
+ if (SvUTF8(sv) && SvMAGICAL(sv)) {
+ MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
+ if (mg)
+ mg->mg_len = 0;
+ }
}
data->last_end = -1;
data->flags &= ~SF_BEFORE_EOL;
/* Can match anything (initialization) */
STATIC void
-S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
+S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
ANYOF_CLASS_ZERO(cl);
ANYOF_BITMAP_SETALL(cl);
/* Can match anything (initialization) */
STATIC int
-S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
+S_cl_is_anything(const struct regnode_charclass_class *cl)
{
int value;
/* Can match anything (initialization) */
STATIC void
-S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
+S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
Zero(cl, 1, struct regnode_charclass_class);
cl->type = ANYOF;
}
STATIC void
-S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
+S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
Zero(cl, 1, struct regnode_charclass_class);
cl->type = ANYOF;
/* 'And' a given class with another one. Can create false positives */
/* We assume that cl is not inverted */
STATIC void
-S_cl_and(pTHX_ struct regnode_charclass_class *cl,
- struct regnode_charclass_class *and_with)
+S_cl_and(struct regnode_charclass_class *cl,
+ const struct regnode_charclass_class *and_with)
{
if (!(and_with->flags & ANYOF_CLASS)
&& !(cl->flags & ANYOF_CLASS)
/* 'OR' a given class with another one. Can create false positives */
/* We assume that cl is not inverted */
STATIC void
-S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
+S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
{
if (or_with->flags & ANYOF_INVERT) {
/* We do not use
/*
- make_trie(startbranch,first,last,tail,flags)
+ make_trie(startbranch,first,last,tail,flags,depth)
startbranch: the first branch in the whole branch sequence
first : start branch of sequence of branch-exact nodes.
May be the same as startbranch
May be the same as tail.
tail : item following the branch sequence
flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
+ depth : indent depth
Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
8: EXACT <baz>(10)
10: END(0)
+ d = uvuni_to_utf8_flags(d, uv, 0);
+
+is the recommended Unicode-aware way of saying
+
+ *(d++) = uv;
*/
-#define TRIE_DEBUG_CHAR \
- DEBUG_TRIE_COMPILE_r({ \
- SV *tmp; \
- if ( UTF ) { \
- tmp = newSVpv( "", 0 ); \
- pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
- } else { \
- tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
- } \
- av_push( trie->revcharmap, tmp ); \
- })
+#define TRIE_STORE_REVCHAR \
+ STMT_START { \
+ SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
+ av_push( TRIE_REVCHARMAP(trie), tmp ); \
+ } STMT_END
#define TRIE_READ_CHAR STMT_START { \
if ( UTF ) { \
} STMT_END
#define TRIE_LIST_NEW(state) STMT_START { \
- Newz( 1023, trie->states[ state ].trans.list, \
+ Newxz( trie->states[ state ].trans.list, \
4, reg_trie_trans_le ); \
TRIE_LIST_CUR( state ) = 1; \
TRIE_LIST_LEN( state ) = 4; \
} STMT_END
+#define TRIE_HANDLE_WORD(state) STMT_START { \
+ if ( !trie->states[ state ].wordnum ) { \
+ /* we havent inserted this word into the structure yet. */\
+ trie->states[ state ].wordnum = ++curword; \
+ DEBUG_r({ \
+ /* store the word for dumping */ \
+ SV* tmp; \
+ if (OP(noper) != NOTHING ) \
+ tmp=newSVpvn( STRING( noper ), STR_LEN( noper ) );\
+ else \
+ tmp=newSVpvn( "", 0 ); \
+ if ( UTF ) SvUTF8_on( tmp ); \
+ av_push( trie->words, tmp ); \
+ }); \
+ } else { \
+ NOOP; /* It's a dupe. So ignore it. */ \
+ } \
+} STMT_END
+
+#ifdef DEBUGGING
+/*
+ dump_trie(trie)
+ dump_trie_interim_list(trie,next_alloc)
+ dump_trie_interim_table(trie,next_alloc)
+
+ These routines dump out a trie in a somewhat readable format.
+ The _interim_ variants are used for debugging the interim
+ tables that are used to generate the final compressed
+ representation which is what dump_trie expects.
+
+ Part of the reason for their existance is to provide a form
+ of documentation as to how the different representations function.
+
+*/
+
+/*
+ dump_trie(trie)
+ Dumps the final compressed table form of the trie to Perl_debug_log.
+ Used for debugging make_trie().
+*/
+
+STATIC void
+S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
+{
+ U32 state;
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
+ (int)depth * 2 + 2,"",
+ "Match","Base","Ofs" );
+
+ for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
+ SV **tmp = av_fetch( trie->revcharmap, state, 0);
+ if ( tmp ) {
+ PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
+ }
+ }
+ PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
+ (int)depth * 2 + 2,"");
+
+ for( state = 0 ; state < trie->uniquecharcount ; state++ )
+ PerlIO_printf( Perl_debug_log, "-----");
+ PerlIO_printf( Perl_debug_log, "\n");
+
+ for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
+ const U32 base = trie->states[ state ].trans.base;
+
+ PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
+
+ if ( trie->states[ state ].wordnum ) {
+ PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
+ } else {
+ PerlIO_printf( Perl_debug_log, "%6s", "" );
+ }
+
+ PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
+
+ if ( base ) {
+ U32 ofs = 0;
+
+ while( ( base + ofs < trie->uniquecharcount ) ||
+ ( base + ofs - trie->uniquecharcount < trie->lasttrans
+ && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
+ ofs++;
+
+ PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
+
+ for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
+ if ( ( base + ofs >= trie->uniquecharcount ) &&
+ ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
+ trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
+ {
+ PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
+ (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
+ } else {
+ PerlIO_printf( Perl_debug_log, "%4s "," ." );
+ }
+ }
+
+ PerlIO_printf( Perl_debug_log, "]");
+
+ }
+ PerlIO_printf( Perl_debug_log, "\n" );
+ }
+}
+/*
+ dump_trie_interim_list(trie,next_alloc)
+ Dumps a fully constructed but uncompressed trie in list form.
+ List tries normally only are used for construction when the number of
+ possible chars (trie->uniquecharcount) is very high.
+ Used for debugging make_trie().
+*/
+STATIC void
+S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
+{
+ U32 state;
+ GET_RE_DEBUG_FLAGS_DECL;
+ /* print out the table precompression. */
+ PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
+ (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
+ PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
+
+ for( state=1 ; state < next_alloc ; state ++ ) {
+ U16 charid;
+
+ PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
+ (int)depth * 2 + 2,"", (UV)state );
+ if ( ! trie->states[ state ].wordnum ) {
+ PerlIO_printf( Perl_debug_log, "%5s| ","");
+ } else {
+ PerlIO_printf( Perl_debug_log, "W%4x| ",
+ trie->states[ state ].wordnum
+ );
+ }
+ for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
+ SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
+ PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
+ SvPV_nolen_const( *tmp ),
+ TRIE_LIST_ITEM(state,charid).forid,
+ (UV)TRIE_LIST_ITEM(state,charid).newstate
+ );
+ }
+
+ }
+}
+
+/*
+ dump_trie_interim_table(trie,next_alloc)
+ Dumps a fully constructed but uncompressed trie in table form.
+ This is the normal DFA style state transition table, with a few
+ twists to facilitate compression later.
+ Used for debugging make_trie().
+*/
+STATIC void
+S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
+{
+ U32 state;
+ U16 charid;
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ /*
+ print out the table precompression so that we can do a visual check
+ that they are identical.
+ */
+
+ PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
+
+ for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
+ SV **tmp = av_fetch( trie->revcharmap, charid, 0);
+ if ( tmp ) {
+ PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
+ }
+ }
+
+ PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
+
+ for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
+ PerlIO_printf( Perl_debug_log, "%4s-", "----" );
+ }
+
+ PerlIO_printf( Perl_debug_log, "\n" );
+
+ for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
+
+ PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
+ (int)depth * 2 + 2,"",
+ (UV)TRIE_NODENUM( state ) );
+
+ for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
+ PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
+ (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
+ }
+ if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
+ PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
+ } else {
+ PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
+ trie->states[ TRIE_NODENUM( state ) ].wordnum );
+ }
+ }
+}
+
+#endif
+
+
+
+
+
STATIC I32
-S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
+S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
{
dVAR;
/* first pass, loop through and scan words */
reg_trie_data *trie;
regnode *cur;
- const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
+ const U32 uniflags = UTF8_ALLOW_DEFAULT;
STRLEN len = 0;
UV uvc = 0;
U16 curword = 0;
const U32 data_slot = add_data( pRExC_state, 1, "t" );
SV *re_trie_maxbuff;
-
+#ifndef DEBUGGING
+ /* these are only used during construction but are useful during
+ debugging so we store them in the struct when debugging.
+ Wordcount is actually superfluous in debugging as we have
+ (AV*)trie->words to use for it, but thats not available when
+ not debugging... We could make the macro use the AV during
+ debugging tho...
+ */
+ U16 trie_wordcount=0;
+ STRLEN trie_charcount=0;
+ U32 trie_laststate=0;
+ AV *trie_revcharmap;
+#endif
GET_RE_DEBUG_FLAGS_DECL;
- Newz( 848200, trie, 1, reg_trie_data );
+ Newxz( trie, 1, reg_trie_data );
trie->refcount = 1;
+ trie->startstate = 1;
RExC_rx->data->data[ data_slot ] = (void*)trie;
- Newz( 848201, trie->charmap, 256, U16 );
+ Newxz( trie->charmap, 256, U16 );
+ if (!(UTF && folder))
+ Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
DEBUG_r({
trie->words = newAV();
- trie->revcharmap = newAV();
});
-
+ TRIE_REVCHARMAP(trie) = newAV();
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
if (!SvIOK(re_trie_maxbuff)) {
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
}
-
+ DEBUG_OPTIMISE_r({
+ PerlIO_printf( Perl_debug_log,
+ "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
+ (int)depth * 2 + 2, "",
+ REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
+ REG_NODE_NUM(last), REG_NODE_NUM(tail));
+ });
/* -- First loop and Setup --
We first traverse the branches and scan each word to determine if it
*/
-
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode *noper = NEXTOPER( cur );
+ regnode * const noper = NEXTOPER( cur );
const U8 *uc = (U8*)STRING( noper );
- const U8 *e = uc + STR_LEN( noper );
+ const U8 * const e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
const U8 *scan = (U8*)NULL;
+ STRLEN chars=0;
+ TRIE_WORDCOUNT(trie)++;
+ if (OP(noper) == NOTHING) {
+ trie->minlen= 0;
+ continue;
+ }
+ if (trie->bitmap) {
+ TRIE_BITMAP_SET(trie,*uc);
+ if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
+ }
for ( ; uc < e ; uc += len ) {
- trie->charcount++;
+ TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
+ chars++;
if ( uvc < 256 ) {
if ( !trie->charmap[ uvc ] ) {
trie->charmap[ uvc ]=( ++trie->uniquecharcount );
if ( folder )
trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
- TRIE_DEBUG_CHAR;
+ TRIE_STORE_REVCHAR;
}
} else {
SV** svpp;
if ( !SvTRUE( *svpp ) ) {
sv_setiv( *svpp, ++trie->uniquecharcount );
- TRIE_DEBUG_CHAR;
+ TRIE_STORE_REVCHAR;
}
}
}
- trie->wordcount++;
+ if( cur == first ) {
+ trie->minlen=chars;
+ trie->maxlen=chars;
+ } else if (chars < trie->minlen) {
+ trie->minlen=chars;
+ } else if (chars > trie->maxlen) {
+ trie->maxlen=chars;
+ }
+
} /* end first pass */
DEBUG_TRIE_COMPILE_r(
- PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
- ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
- (int)trie->charcount, trie->uniquecharcount )
+ PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
+ (int)depth * 2 + 2,"",
+ ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
+ (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, trie->minlen, trie->maxlen )
);
*/
- if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
+ if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
/*
Second Pass -- Array Of Lists Representation
We build the initial structure using the lists, and then convert
it into the compressed table form which allows faster lookups
(but cant be modified once converted).
-
-
*/
-
STRLEN transcount = 1;
- Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
+ Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
TRIE_LIST_NEW(1);
next_alloc = 2;
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode *noper = NEXTOPER( cur );
- U8 *uc = (U8*)STRING( noper );
- U8 *e = uc + STR_LEN( noper );
- U32 state = 1; /* required init */
- U16 charid = 0; /* sanity init */
- U8 *scan = (U8*)NULL; /* sanity init */
- STRLEN foldlen = 0; /* required init */
- U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-
-
- for ( ; uc < e ; uc += len ) {
-
- TRIE_READ_CHAR;
-
- if ( uvc < 256 ) {
- charid = trie->charmap[ uvc ];
- } else {
- SV** svpp=(SV**)NULL;
- svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
- if ( !svpp ) {
- charid = 0;
- } else {
- charid=(U16)SvIV( *svpp );
- }
- }
- if ( charid ) {
-
- U16 check;
- U32 newstate = 0;
-
- charid--;
- if ( !trie->states[ state ].trans.list ) {
- TRIE_LIST_NEW( state );
- }
- for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
- if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
- newstate = TRIE_LIST_ITEM( state, check ).newstate;
- break;
- }
- }
- if ( ! newstate ) {
- newstate = next_alloc++;
- TRIE_LIST_PUSH( state, charid, newstate );
- transcount++;
- }
- state = newstate;
-
- } else {
- Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
- }
- /* charid is now 0 if we dont know the char read, or nonzero if we do */
- }
-
- if ( !trie->states[ state ].wordnum ) {
- /* we havent inserted this word into the structure yet. */
- trie->states[ state ].wordnum = ++curword;
+ regnode * const noper = NEXTOPER( cur );
+ U8 *uc = (U8*)STRING( noper );
+ const U8 * const e = uc + STR_LEN( noper );
+ U32 state = 1; /* required init */
+ U16 charid = 0; /* sanity init */
+ U8 *scan = (U8*)NULL; /* sanity init */
+ STRLEN foldlen = 0; /* required init */
+ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+
+ if (OP(noper) != NOTHING) {
+ for ( ; uc < e ; uc += len ) {
+
+ TRIE_READ_CHAR;
+
+ if ( uvc < 256 ) {
+ charid = trie->charmap[ uvc ];
+ } else {
+ SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
+ if ( !svpp ) {
+ charid = 0;
+ } else {
+ charid=(U16)SvIV( *svpp );
+ }
+ }
+ if ( charid ) {
- DEBUG_r({
- /* store the word for dumping */
- SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
- if ( UTF ) SvUTF8_on( tmp );
- av_push( trie->words, tmp );
- });
+ U16 check;
+ U32 newstate = 0;
- } else {
- /* Its a dupe. So ignore it. */
- }
+ charid--;
+ if ( !trie->states[ state ].trans.list ) {
+ TRIE_LIST_NEW( state );
+ }
+ for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
+ if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
+ newstate = TRIE_LIST_ITEM( state, check ).newstate;
+ break;
+ }
+ }
+ if ( ! newstate ) {
+ newstate = next_alloc++;
+ TRIE_LIST_PUSH( state, charid, newstate );
+ transcount++;
+ }
+ state = newstate;
+ } else {
+ Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
+ }
+ /* charid is now 0 if we dont know the char read, or nonzero if we do */
+ }
+ }
+ TRIE_HANDLE_WORD(state);
} /* end second pass */
- trie->laststate = next_alloc;
+ TRIE_LASTSTATE(trie) = next_alloc;
Renew( trie->states, next_alloc, reg_trie_state );
- DEBUG_TRIE_COMPILE_MORE_r({
- U32 state;
- U16 charid;
-
- /*
- print out the table precompression.
- */
-
- PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
- PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
-
- for( state=1 ; state < next_alloc ; state ++ ) {
-
- PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
- if ( ! trie->states[ state ].wordnum ) {
- PerlIO_printf( Perl_debug_log, "%5s| ","");
- } else {
- PerlIO_printf( Perl_debug_log, "W%04x| ",
- trie->states[ state ].wordnum
- );
- }
- for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
- SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
- PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
- SvPV_nolen( *tmp ),
- TRIE_LIST_ITEM(state,charid).forid,
- (UV)TRIE_LIST_ITEM(state,charid).newstate
+ /* and now dump it out before we compress it */
+ DEBUG_TRIE_COMPILE_MORE_r(
+ dump_trie_interim_list(trie,next_alloc,depth+1)
);
- }
-
- }
- PerlIO_printf( Perl_debug_log, "\n\n" );
- });
- Newz( 848203, trie->trans, transcount ,reg_trie_trans );
+ Newxz( trie->trans, transcount ,reg_trie_trans );
{
U32 state;
- U16 idx;
U32 tp = 0;
U32 zp = 0;
if (trie->states[state].trans.list) {
U16 minid=TRIE_LIST_ITEM( state, 1).forid;
U16 maxid=minid;
-
+ U16 idx;
for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
- if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
- minid=TRIE_LIST_ITEM( state, idx).forid;
- } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
- maxid=TRIE_LIST_ITEM( state, idx).forid;
- }
+ const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
+ if ( forid < minid ) {
+ minid=forid;
+ } else if ( forid > maxid ) {
+ maxid=forid;
+ }
}
if ( transcount < tp + maxid - minid + 1) {
transcount *= 2;
}
} else {
for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
- U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
+ const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
trie->trans[ tid ].check = state;
}
*/
- Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
+
+ Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
reg_trie_trans );
- Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
+ Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
next_alloc = trie->uniquecharcount + 1;
+
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode *noper = NEXTOPER( cur );
- U8 *uc = (U8*)STRING( noper );
- U8 *e = uc + STR_LEN( noper );
+ regnode * const noper = NEXTOPER( cur );
+ const U8 *uc = (U8*)STRING( noper );
+ const U8 * const e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
STRLEN foldlen = 0; /* required init */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-
+ if ( OP(noper) != NOTHING ) {
for ( ; uc < e ; uc += len ) {
TRIE_READ_CHAR;
if ( uvc < 256 ) {
charid = trie->charmap[ uvc ];
} else {
- SV** svpp=(SV**)NULL;
- svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
- if ( !svpp ) {
- charid = 0;
- } else {
- charid=(U16)SvIV( *svpp );
- }
+ SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
+ charid = svpp ? (U16)SvIV(*svpp) : 0;
}
if ( charid ) {
charid--;
}
/* charid is now 0 if we dont know the char read, or nonzero if we do */
}
-
- accept_state = TRIE_NODENUM( state );
- if ( !trie->states[ accept_state ].wordnum ) {
- /* we havent inserted this word into the structure yet. */
- trie->states[ accept_state ].wordnum = ++curword;
-
- DEBUG_r({
- /* store the word for dumping */
- SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
- if ( UTF ) SvUTF8_on( tmp );
- av_push( trie->words, tmp );
- });
-
- } else {
- /* Its a dupe. So ignore it. */
}
+ accept_state = TRIE_NODENUM( state );
+ TRIE_HANDLE_WORD(accept_state);
} /* end second pass */
- DEBUG_TRIE_COMPILE_MORE_r({
- /*
- print out the table precompression so that we can do a visual check
- that they are identical.
- */
- U32 state;
- U16 charid;
- PerlIO_printf( Perl_debug_log, "\nChar : " );
-
- for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
- SV **tmp = av_fetch( trie->revcharmap, charid, 0);
- if ( tmp ) {
- PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
- }
- }
-
- PerlIO_printf( Perl_debug_log, "\nState+-" );
-
- for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
- PerlIO_printf( Perl_debug_log, "%4s-", "----" );
- }
-
- PerlIO_printf( Perl_debug_log, "\n" );
-
- for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
-
- PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
+ /* and now dump it out before we compress it */
+ DEBUG_TRIE_COMPILE_MORE_r(
+ dump_trie_interim_table(trie,next_alloc,depth+1)
+ );
- for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
- PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
- (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
- }
- if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
- PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
- } else {
- PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
- trie->states[ TRIE_NODENUM( state ) ].wordnum );
- }
- }
- PerlIO_printf( Perl_debug_log, "\n\n" );
- });
{
/*
* Inplace compress the table.*
demq
*/
const U32 laststate = TRIE_NODENUM( next_alloc );
- U32 used , state, charid;
+ U32 state, charid;
U32 pos = 0, zp=0;
- trie->laststate = laststate;
+ TRIE_LASTSTATE(trie) = laststate;
for ( state = 1 ; state < laststate ; state++ ) {
U8 flag = 0;
- U32 stateidx = TRIE_NODEIDX( state );
- U32 o_used=trie->trans[ stateidx ].check;
- used = trie->trans[ stateidx ].check;
+ const U32 stateidx = TRIE_NODEIDX( state );
+ const U32 o_used = trie->trans[ stateidx ].check;
+ U32 used = trie->trans[ stateidx ].check;
trie->trans[ stateidx ].check = 0;
for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
Renew( trie->states, laststate + 1, reg_trie_state);
DEBUG_TRIE_COMPILE_MORE_r(
PerlIO_printf( Perl_debug_log,
- " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
- (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
+ "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
+ (int)depth * 2 + 2,"",
+ (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
(IV)next_alloc,
(IV)pos,
( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
/* resize the trans array to remove unused space */
Renew( trie->trans, trie->lasttrans, reg_trie_trans);
- DEBUG_TRIE_COMPILE_r({
- U32 state;
+ /* and now dump out the compressed format */
+ DEBUG_TRIE_COMPILE_r(
+ dump_trie(trie,depth+1)
+ );
+
+ { /* Modify the program and insert the new TRIE node*/
+ regnode *convert;
+ U8 nodetype =(U8)(flags & 0xFF);
+ char *str=NULL;
/*
- Now we print it out again, in a slightly different form as there is additional
- info we want to be able to see when its compressed. They are close enough for
- visual comparison though.
+ This means we convert either the first branch or the first Exact,
+ depending on whether the thing following (in 'last') is a branch
+ or not and whther first is the startbranch (ie is it a sub part of
+ the alternation or is it the whole thing.)
+ Assuming its a sub part we conver the EXACT otherwise we convert
+ the whole branch sequence, including the first.
*/
- PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
-
- for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
- SV **tmp = av_fetch( trie->revcharmap, state, 0);
- if ( tmp ) {
- PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
- }
- }
- PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
-
- for( state = 0 ; state < trie->uniquecharcount ; state++ )
- PerlIO_printf( Perl_debug_log, "-----");
- PerlIO_printf( Perl_debug_log, "\n");
-
- for( state = 1 ; state < trie->laststate ; state++ ) {
- U32 base = trie->states[ state ].trans.base;
-
- PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
-
- if ( trie->states[ state ].wordnum ) {
- PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
+ /* Find the node we are going to overwrite */
+ if ( first == startbranch && OP( last ) != BRANCH ) {
+ convert = first;
} else {
- PerlIO_printf( Perl_debug_log, "%6s", "" );
+ convert = NEXTOPER( first );
+ NEXT_OFF( first ) = (U16)(last - first);
}
- PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
-
- if ( base ) {
+ /* But first we check to see if there is a common prefix we can
+ split out as an EXACT and put in front of the TRIE node. */
+ trie->startstate= 1;
+ if ( trie->bitmap && !trie->widecharmap ) {
+ U32 state;
+ DEBUG_OPTIMISE_r(
+ PerlIO_printf( Perl_debug_log,"%*sLaststate:%d\n",
+ (int)depth * 2 + 2,"",
+ TRIE_LASTSTATE(trie)));
+ for( state= 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
U32 ofs = 0;
+ I32 idx= -1;
+ U32 count= 0;
+ const U32 base= trie->states[ state ].trans.base;
- while( ( base + ofs < trie->uniquecharcount ) ||
- ( base + ofs - trie->uniquecharcount < trie->lasttrans
- && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
- ofs++;
+ if ( trie->states[state].wordnum )
+ count =1;
- PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
+ for ( ofs= 0 ; ofs < trie->uniquecharcount ; ofs++ )
+ {
- for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
if ( ( base + ofs >= trie->uniquecharcount ) &&
( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
{
- PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
- (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
- } else {
- PerlIO_printf( Perl_debug_log, "%4s "," 0" );
+ if ( ++count > 1 ) {
+ SV **tmp= av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
+ const char *ch= SvPV_nolen_const( *tmp );
+ if (state==1) break;
+ if ( count == 2 ) {
+ Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
+ DEBUG_OPTIMISE_r(
+ PerlIO_printf( Perl_debug_log,"%*sNew Start State=%d Class: [",
+ (int)depth * 2 + 2,"",
+ state));
+ if (idx>-1) {
+ SV **tmp= av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
+ const char *ch= SvPV_nolen_const( *tmp );
+
+ TRIE_BITMAP_SET(trie,*ch);
+ if ( folder )
+ TRIE_BITMAP_SET(trie,folder[ *ch ]);
+ DEBUG_OPTIMISE_r(
+ PerlIO_printf( Perl_debug_log,"%s", ch)
+ );
}
}
-
- PerlIO_printf( Perl_debug_log, "]");
-
+ TRIE_BITMAP_SET(trie,*ch);
+ if ( folder ) TRIE_BITMAP_SET(trie,folder[ *ch ]);
+ DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
}
- PerlIO_printf( Perl_debug_log, "\n" );
+ idx= ofs;
}
- });
-
- {
- /* now finally we "stitch in" the new TRIE node
- This means we convert either the first branch or the first Exact,
- depending on whether the thing following (in 'last') is a branch
- or not and whther first is the startbranch (ie is it a sub part of
- the alternation or is it the whole thing.)
- Assuming its a sub part we conver the EXACT otherwise we convert
- the whole branch sequence, including the first.
- */
- regnode *convert;
-
-
-
+ }
+ if ( count == 1 ) {
+ SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
+ const char *ch= SvPV_nolen_const( *tmp );
+ DEBUG_OPTIMISE_r(
+ PerlIO_printf( Perl_debug_log,"%*sPrefix State: %d Idx:%d Char='%s'\n",
+ (int)depth * 2 + 2,"",
+ state, idx, ch)
+ );
+ if ( state==1 ) {
+ OP( convert ) = nodetype;
+ str=STRING(convert);
+ STR_LEN(convert)=0;
+ }
+ *str++=*ch;
+ STR_LEN(convert)++;
- if ( first == startbranch && OP( last ) != BRANCH ) {
- convert = first;
} else {
- convert = NEXTOPER( first );
- NEXT_OFF( first ) = (U16)(last - first);
+ if (state>1)
+ DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
+ break;
}
-
- OP( convert ) = TRIE + (U8)( flags - EXACT );
+ }
+ if (str) {
+ regnode *n= convert+NODE_SZ_STR(convert);
+ NEXT_OFF(convert)= NODE_SZ_STR(convert);
+ trie->startstate= state;
+ trie->minlen-= (state-1);
+ trie->maxlen-= (state-1);
+ if (trie->maxlen)
+ convert= n;
+ else {
+ NEXT_OFF(convert) = (U16)(tail - convert);
+ }
+ }
+ }
+ if ( trie->maxlen ) {
+ OP( convert ) = TRIE;
NEXT_OFF( convert ) = (U16)(tail - convert);
ARG_SET( convert, data_slot );
- /* tells us if we need to handle accept buffers specially */
- convert->flags = ( RExC_seen_evals ? 1 : 0 );
-
-
+ /* store the type in the flags */
+ convert->flags = nodetype;
+ /* XXX We really should free up the resource in trie now, as we wont use them */
+ }
/* needed for dumping*/
DEBUG_r({
regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
return 1;
}
-
-
/*
* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
* These need to be revisited when a newer toolchain becomes available.
/* REx optimizer. Converts nodes into quickier variants "in place".
Finds fixed substrings. */
-/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
+/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
to the position after last scanned or to NULL. */
-
STATIC I32
-S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
+S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
+ regnode *last, scan_data_t *data, U32 flags, U32 depth)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
{
+ dVAR;
I32 min = 0, pars = 0, code;
regnode *scan = *scanp, *next;
I32 delta = 0;
GET_RE_DEBUG_FLAGS_DECL;
+PEEP:
while (scan && OP(scan) != END && scan < last) {
+ #ifdef DEBUGGING
+ int merged=0;
+ #endif
/* Peephole optimizer: */
DEBUG_OPTIMISE_r({
- SV *mysv=sv_newmortal();
- regprop( mysv, scan);
- PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
- (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
+ SV * const mysv=sv_newmortal();
+ regprop(RExC_rx, mysv, scan);
+ PerlIO_printf(Perl_debug_log, "%*s%4s~ %s (%d)\n",
+ (int)depth*2, "",
+ scan==*scanp ? "Peep" : "",
+ SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
});
-
- if (PL_regkind[(U8)OP(scan)] == EXACT) {
+ if (PL_regkind[OP(scan)] == EXACT) {
/* Merge several consecutive EXACTish nodes into one. */
regnode *n = regnext(scan);
U32 stringok = 1;
#ifdef DEBUGGING
regnode *stop = scan;
#endif
-
next = scan + NODE_SZ_STR(scan);
/* Skip NOTHING, merge EXACT*. */
while (n &&
- ( PL_regkind[(U8)OP(n)] == NOTHING ||
+ ( PL_regkind[OP(n)] == NOTHING ||
(stringok && (OP(n) == OP(scan))))
&& NEXT_OFF(n)
&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
if (OP(n) == TAIL || n > next)
stringok = 0;
- if (PL_regkind[(U8)OP(n)] == NOTHING) {
+ if (PL_regkind[OP(n)] == NOTHING) {
+ DEBUG_OPTIMISE_r({
+ SV * const mysv=sv_newmortal();
+ regprop(RExC_rx, mysv, n);
+ PerlIO_printf(Perl_debug_log, "%*sskip: %s (%d)\n",
+ (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
+ });
NEXT_OFF(scan) += NEXT_OFF(n);
next = n + NODE_STEP_REGNODE;
#ifdef DEBUGGING
}
else if (stringok) {
const int oldl = STR_LEN(scan);
- regnode *nnext = regnext(n);
-
+ regnode * const nnext = regnext(n);
+ DEBUG_OPTIMISE_r({
+ SV * const mysv=sv_newmortal();
+ regprop(RExC_rx, mysv, n);
+ PerlIO_printf(Perl_debug_log, "%*s mrg: %s (%d)\n",
+ (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
+ merged++;
+ });
if (oldl + STR_LEN(n) > U8_MAX)
break;
NEXT_OFF(scan) += NEXT_OFF(n);
another valid sequence of UTF-8 bytes.
*/
- char *s0 = STRING(scan), *s, *t;
- char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
- const char *t0 = "\xcc\x88\xcc\x81";
- const char *t1 = t0 + 3;
+ char * const s0 = STRING(scan), *s, *t;
+ char * const s1 = s0 + STR_LEN(scan) - 1;
+ char * const s2 = s1 - 4;
+ const char t0[] = "\xcc\x88\xcc\x81";
+ const char * const t1 = t0 + 3;
for (s = s0 + 2;
s < s2 && (t = ninstr(s, s1, t0, t1));
/* Allow dumping */
n = scan + NODE_SZ_STR(scan);
while (n <= stop) {
- if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
+ if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
OP(n) = OPTIMIZED;
NEXT_OFF(n) = 0;
}
/* Skip NOTHING and LONGJMP. */
while ((n = regnext(n))
- && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
+ && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
|| ((OP(n) == LONGJMP) && (noff = ARG(n))))
&& off + noff < max)
off += noff;
NEXT_OFF(scan) = off;
}
+ DEBUG_OPTIMISE_r({if (merged){
+ SV * const mysv=sv_newmortal();
+ regprop(RExC_rx, mysv, scan);
+ PerlIO_printf(Perl_debug_log, "%*s res: %s (%d)\n",
+ (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
+ }});
+
/* The principal pseudo-switch. Cannot be a switch, since we
look into several different things. */
if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
I32 max1 = 0, min1 = I32_MAX, num = 0;
struct regnode_charclass_class accum;
- regnode *startbranch=scan;
+ regnode * const startbranch=scan;
if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
scan = next;
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
pars++;
- if (data && (data_fake.flags & SF_HAS_EVAL))
+ if (data) {
+ if (data_fake.flags & SF_HAS_EVAL)
data->flags |= SF_HAS_EVAL;
- if (data)
data->whilem_c = data_fake.whilem_c;
+ }
if (flags & SCF_DO_STCLASS)
cl_or(pRExC_state, &accum, &this_class);
if (code == SUSPEND)
*/
if (DO_TRIE) {
+ int made=0;
if (!re_trie_maxbuff) {
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
if (!SvIOK(re_trie_maxbuff))
U32 count=0;
#ifdef DEBUGGING
- SV *mysv = sv_newmortal(); /* for dumping */
+ SV * const mysv = sv_newmortal(); /* for dumping */
#endif
/* var tail is used because there may be a TAIL
regop in the way. Ie, the exacts will point to the
tail = regnext( tail );
}
+
DEBUG_OPTIMISE_r({
- regprop( mysv, tail );
- PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
- (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
- (RExC_seen_evals) ? "[EVAL]" : ""
+ regprop(RExC_rx, mysv, tail );
+ PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
+ (int)depth * 2 + 2, "",
+ "Looking for TRIE'able sequences. Tail node is: ",
+ SvPV_nolen_const( mysv )
);
});
+
/*
step through the branches, cur represents each
/* dont use tail as the end marker for this traverse */
for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
- regnode *noper = NEXTOPER( cur );
- regnode *noper_next = regnext( noper );
+ regnode * const noper = NEXTOPER( cur );
+ regnode * const noper_next = regnext( noper );
DEBUG_OPTIMISE_r({
- regprop( mysv, cur);
- PerlIO_printf( Perl_debug_log, "%*s%s",
- (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
+ regprop(RExC_rx, mysv, cur);
+ PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
+ (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
- regprop( mysv, noper);
+ regprop(RExC_rx, mysv, noper);
PerlIO_printf( Perl_debug_log, " -> %s",
- SvPV_nolen(mysv));
+ SvPV_nolen_const(mysv));
if ( noper_next ) {
- regprop( mysv, noper_next );
+ regprop(RExC_rx, mysv, noper_next );
PerlIO_printf( Perl_debug_log,"\t=> %s\t",
- SvPV_nolen(mysv));
+ SvPV_nolen_const(mysv));
}
- PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
- first, last, cur );
+ PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
+ REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
});
- if ( ( first ? OP( noper ) == optype
- : PL_regkind[ (U8)OP( noper ) ] == EXACT )
+ if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
+ : PL_regkind[ OP( noper ) ] == EXACT )
+ || OP(noper) == NOTHING )
&& noper_next == tail && count<U16_MAX)
{
count++;
- if ( !first ) {
- first = cur;
+ if ( !first || optype == NOTHING ) {
+ if (!first) first = cur;
optype = OP( noper );
} else {
- DEBUG_OPTIMISE_r(
- if (!last ) {
- regprop( mysv, first);
- PerlIO_printf( Perl_debug_log, "%*s%s",
- (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
- regprop( mysv, NEXTOPER(first) );
- PerlIO_printf( Perl_debug_log, " -> %s\n",
- SvPV_nolen( mysv ) );
- }
- );
last = cur;
- DEBUG_OPTIMISE_r({
- regprop( mysv, cur);
- PerlIO_printf( Perl_debug_log, "%*s%s",
- (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
- regprop( mysv, noper );
- PerlIO_printf( Perl_debug_log, " -> %s\n",
- SvPV_nolen( mysv ) );
- });
}
} else {
if ( last ) {
- DEBUG_OPTIMISE_r(
- PerlIO_printf( Perl_debug_log, "%*s%s\n",
- (int)depth * 2 + 2, "E:", "**END**" );
- );
- make_trie( pRExC_state, startbranch, first, cur, tail, optype );
+ made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
}
- if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
+ if ( PL_regkind[ OP( noper ) ] == EXACT
&& noper_next == tail )
{
count = 1;
}
}
DEBUG_OPTIMISE_r({
- regprop( mysv, cur);
+ regprop(RExC_rx, mysv, cur);
PerlIO_printf( Perl_debug_log,
- "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
- " ", SvPV_nolen( mysv ), first, last, cur);
+ "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
+ "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
if ( last ) {
- DEBUG_OPTIMISE_r(
- PerlIO_printf( Perl_debug_log, "%*s%s\n",
- (int)depth * 2 + 2, "E:", "==END==" );
- );
- make_trie( pRExC_state, startbranch, first, scan, tail, optype );
+ made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
+#ifdef TRIE_STUDY_OPT
+ if ( OP(first)!=TRIE && startbranch == first ) {
+
}
+#endif
}
}
+
+ } /* do trie */
}
else if ( code == BRANCHJ ) { /* single branch is optimized. */
scan = NEXTOPER(NEXTOPER(scan));
}
else if (OP(scan) == EXACT) {
I32 l = STR_LEN(scan);
- UV uc = *((U8*)STRING(scan));
+ UV uc;
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
l = utf8_length(s, s + l);
uc = utf8_to_uvchr(s, NULL);
+ } else {
+ uc = *((U8*)STRING(scan));
}
min += l;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
? I32_MAX : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+ if (UTF)
+ SvUTF8_on(data->last_found);
{
- SV * sv = data->last_found;
- MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
+ SV * const sv = data->last_found;
+ MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
mg_find(sv, PERL_MAGIC_utf8) : NULL;
if (mg && mg->mg_len >= 0)
mg->mg_len += utf8_length((U8*)STRING(scan),
(U8*)STRING(scan)+STR_LEN(scan));
}
- if (UTF)
- SvUTF8_on(data->last_found);
data->last_end = data->pos_min + l;
data->pos_min += l; /* As in the first entry. */
data->flags &= ~SF_BEFORE_EOL;
}
flags &= ~SCF_DO_STCLASS;
}
- else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
+ else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
I32 l = STR_LEN(scan);
UV uc = *((U8*)STRING(scan));
/* Search for fixed substrings supports EXACT only. */
- if (flags & SCF_DO_SUBSTR)
+ if (flags & SCF_DO_SUBSTR) {
+ assert(data);
scan_commit(pRExC_state, data);
+ }
if (UTF) {
- U8 *s = (U8 *)STRING(scan);
+ const U8 * const s = (U8 *)STRING(scan);
l = utf8_length(s, s + l);
uc = utf8_to_uvchr(s, NULL);
}
min += l;
- if (data && (flags & SCF_DO_SUBSTR))
+ if (flags & SCF_DO_SUBSTR)
data->pos_min += l;
if (flags & SCF_DO_STCLASS_AND) {
/* Check whether it is compatible with what we know already! */
}
flags &= ~SCF_DO_STCLASS;
}
+#ifdef TRIE_STUDY_OPT
+ else if (OP(scan) == TRIE) {
+ reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
+ min += trie->minlen;
+ flags &= ~SCF_DO_STCLASS; /* xxx */
+ if (flags & SCF_DO_SUBSTR) {
+ scan_commit(pRExC_state,data); /* Cannot expect anything... */
+ data->pos_min += trie->minlen;
+ data->pos_delta+= (trie->maxlen-trie->minlen);
+ }
+ }
+#endif
else if (strchr((const char*)PL_varies,OP(scan))) {
I32 mincount, maxcount, minnext, deltanext, fl = 0;
I32 f = flags, pos_before = 0;
- regnode *oscan = scan;
+ regnode * const oscan = scan;
struct regnode_charclass_class this_class;
struct regnode_charclass_class *oclass = NULL;
I32 next_is_eval = 0;
- switch (PL_regkind[(U8)OP(scan)]) {
+ switch (PL_regkind[OP(scan)]) {
case WHILEM: /* End of (?:...)* . */
scan = NEXTOPER(scan);
goto finish;
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if (ckWARN(WARN_REGEXP)
- /* ? quantifier ok, except for (?{ ... }) */
- && (next_is_eval || !(mincount == 0 && maxcount == 1))
+ if ( /* ? quantifier ok, except for (?{ ... }) */
+ (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
&& data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
- && maxcount <= REG_INFTY/3) /* Complement check for big count */
+ && maxcount <= REG_INFTY/3 /* Complement check for big count */
+ && ckWARN(WARN_REGEXP))
{
vWARN(RExC_parse,
"Quantifier unexpected on zero-length expression");
&& !deltanext && minnext == 1 ) {
/* Try to optimize to CURLYN. */
regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
- regnode *nxt1 = nxt;
+ regnode * const nxt1 = nxt;
#ifdef DEBUGGING
regnode *nxt2;
#endif
/* Skip open. */
nxt = regnext(nxt);
if (!strchr((const char*)PL_simple,OP(nxt))
- && !(PL_regkind[(U8)OP(nxt)] == EXACT
+ && !(PL_regkind[OP(nxt)] == EXACT
&& STR_LEN(nxt) == 1))
goto nogo;
#ifdef DEBUGGING
if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
pars++;
if (flags & SCF_DO_SUBSTR) {
- SV *last_str = Nullsv;
+ SV *last_str = NULL;
int counted = mincount != 0;
if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
#if defined(SPARC64_GCC_WORKAROUND)
I32 b = 0;
STRLEN l = 0;
- char *s = NULL;
+ const char *s = NULL;
I32 old = 0;
if (pos_before >= data->last_start_min)
b = data->last_start_min;
l = 0;
- s = SvPV(data->last_found, l);
+ s = SvPV_const(data->last_found, l);
old = b - data->last_start_min;
#else
I32 b = pos_before >= data->last_start_min
? pos_before : data->last_start_min;
STRLEN l;
- char *s = SvPV(data->last_found, l);
+ const char * const s = SvPV_const(data->last_found, l);
I32 old = b - data->last_start_min;
#endif
if (mincount > 1) {
SvGROW(last_str, (mincount * l) + 1);
repeatcpy(SvPVX(last_str) + l,
- SvPVX(last_str), l, mincount - 1);
+ SvPVX_const(last_str), l, mincount - 1);
SvCUR_set(last_str, SvCUR(last_str) * mincount);
/* Add additional parts. */
SvCUR_set(data->last_found,
the group. */
scan_commit(pRExC_state,data);
if (mincount && last_str) {
- sv_setsv(data->last_found, last_str);
+ SV * const sv = data->last_found;
+ MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
+ mg_find(sv, PERL_MAGIC_utf8) : NULL;
+
+ if (mg)
+ mg->mg_len = -1;
+ sv_setsv(sv, last_str);
data->last_end = data->pos_min;
data->last_start_min =
data->pos_min - CHR_SVLEN(last_str);
data->flags |= SF_HAS_EVAL;
optimize_curly_tail:
if (OP(oscan) != CURLYX) {
- while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
+ while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
&& NEXT_OFF(next))
NEXT_OFF(oscan) += NEXT_OFF(next);
}
/* Some of the logic below assumes that switching
locale on will only add false positives. */
- switch (PL_regkind[(U8)OP(scan)]) {
+ switch (PL_regkind[OP(scan)]) {
case SANY:
default:
do_default:
flags &= ~SCF_DO_STCLASS;
}
}
- else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
+ else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
data->flags |= (OP(scan) == MEOL
? SF_BEFORE_MEOL
: SF_BEFORE_SEOL);
}
- else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
+ else if ( PL_regkind[OP(scan)] == BRANCHJ
/* Lookbehind, or need to calculate parens/evals/stclass: */
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
&& (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
if (data)
data->whilem_c = data_fake.whilem_c;
if (f & SCF_DO_STCLASS_AND) {
- int was = (data->start_class->flags & ANYOF_EOS);
+ const int was = (data->start_class->flags & ANYOF_EOS);
cl_and(data->start_class, &intrnl);
if (was)
}
STATIC I32
-S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
+S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
{
if (RExC_rx->data) {
Renewc(RExC_rx->data,
RExC_rx->data->count += n;
}
else {
- Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
+ Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
char, struct reg_data);
- New(1208, RExC_rx->data->what, n, U8);
+ Newx(RExC_rx->data->what, n, U8);
RExC_rx->data->count = n;
}
Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
return RExC_rx->data->count - n;
}
+#ifndef PERL_IN_XSUB_RE
void
Perl_reginitcolors(pTHX)
{
+ dVAR;
const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
if (s) {
char *t = savepv(s);
}
PL_colorset = 1;
}
-
+#endif
/*
- pregcomp - compile a regular expression into internal code
regexp *
Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
{
+ dVAR;
register regexp *r;
regnode *scan;
regnode *first;
* Clever compilers notice this and complain. --jhi */
REGC((U8)REG_MAGIC, (char*)RExC_emit);
#endif
- if (reg(pRExC_state, 0, &flags) == NULL) {
- RExC_precomp = Nullch;
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
+ if (reg(pRExC_state, 0, &flags,1) == NULL) {
+ RExC_precomp = NULL;
return(NULL);
}
- DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
+ DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
/* Small enough for pointer-storage convention?
If extralen==0, this means that we will not need long jumps. */
RExC_whilem_seen = 15;
/* Allocate space and initialize. */
- Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
+ Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
char, regexp);
if (r == NULL)
FAIL("Regexp out of space");
r->prelen = xend - exp;
r->precomp = savepvn(RExC_precomp, r->prelen);
r->subbeg = NULL;
-#ifdef PERL_COPY_ON_WRITE
- r->saved_copy = Nullsv;
+#ifdef PERL_OLD_COPY_ON_WRITE
+ r->saved_copy = NULL;
#endif
r->reganch = pm->op_pmflags & PMf_COMPILETIME;
r->nparens = RExC_npar - 1; /* set early to validate backrefs */
+ r->lastparen = 0; /* mg.c reads this. */
r->substrs = 0; /* Useful during FAIL. */
r->startp = 0; /* Useful during FAIL. */
r->endp = 0; /* Useful during FAIL. */
- Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
+ Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
if (r->offsets) {
r->offsets[0] = RExC_size;
}
RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
r->data = 0;
- if (reg(pRExC_state, 0, &flags) == NULL)
+ if (reg(pRExC_state, 0, &flags,1) == NULL)
return(NULL);
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
- Newz(1004, r->substrs, 1, struct reg_substr_data);
+ Newxz(r->substrs, 1, struct reg_substr_data);
StructCopy(&zero_scan_data, &data, scan_data_t);
/* XXXX Should not we check for something else? Usually it is OPEN1... */
(OP(first) == PLUS) ||
(OP(first) == MINMOD) ||
/* An {n,m} with n>0 */
- (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
+ (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) {
if (OP(first) == PLUS)
sawplus = 1;
else
- first += regarglen[(U8)OP(first)];
+ first += regarglen[OP(first)];
first = NEXTOPER(first);
}
/* Starting-point info. */
again:
- if (PL_regkind[(U8)OP(first)] == EXACT) {
+ if (PL_regkind[OP(first)] == EXACT) {
if (OP(first) == EXACT)
- ; /* Empty, get anchored substr later. */
+ NOOP; /* Empty, get anchored substr later. */
else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
r->regstclass = first;
}
else if (strchr((const char*)PL_simple,OP(first)))
r->regstclass = first;
- else if (PL_regkind[(U8)OP(first)] == BOUND ||
- PL_regkind[(U8)OP(first)] == NBOUND)
+ else if (PL_regkind[OP(first)] == BOUND ||
+ PL_regkind[OP(first)] == NBOUND)
r->regstclass = first;
- else if (PL_regkind[(U8)OP(first)] == BOL) {
+ else if (PL_regkind[OP(first)] == BOL) {
r->reganch |= (OP(first) == MBOL
? ROPT_ANCH_MBOL
: (OP(first) == SBOL
goto again;
}
else if (!sawopen && (OP(first) == STAR &&
- PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
+ PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
!(r->reganch & ROPT_ANCH) )
{
/* turn .* into ^.* with an implied $*=1 */
*/
minlen = 0;
- data.longest_fixed = newSVpvn("",0);
- data.longest_float = newSVpvn("",0);
- data.last_found = newSVpvn("",0);
+ data.longest_fixed = newSVpvs("");
+ data.longest_float = newSVpvs("");
+ data.last_found = newSVpvs("");
data.longest = &(data.longest_fixed);
first = scan;
if (!r->regstclass) {
if (SvUTF8(data.longest_float)) {
r->float_utf8 = data.longest_float;
- r->float_substr = Nullsv;
+ r->float_substr = NULL;
} else {
r->float_substr = data.longest_float;
- r->float_utf8 = Nullsv;
+ r->float_utf8 = NULL;
}
r->float_min_offset = data.offset_float_min;
r->float_max_offset = data.offset_float_max;
}
else {
remove_float:
- r->float_substr = r->float_utf8 = Nullsv;
+ r->float_substr = r->float_utf8 = NULL;
SvREFCNT_dec(data.longest_float);
longest_float_length = 0;
}
if (SvUTF8(data.longest_fixed)) {
r->anchored_utf8 = data.longest_fixed;
- r->anchored_substr = Nullsv;
+ r->anchored_substr = NULL;
} else {
r->anchored_substr = data.longest_fixed;
- r->anchored_utf8 = Nullsv;
+ r->anchored_utf8 = NULL;
}
r->anchored_offset = data.offset_fixed;
t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
}
else {
- r->anchored_substr = r->anchored_utf8 = Nullsv;
+ r->anchored_substr = r->anchored_utf8 = NULL;
SvREFCNT_dec(data.longest_fixed);
longest_fixed_length = 0;
}
{
const I32 n = add_data(pRExC_state, 1, "f");
- New(1006, RExC_rx->data->data[n], 1,
+ Newx(RExC_rx->data->data[n], 1,
struct regnode_charclass_class);
StructCopy(data.start_class,
(struct regnode_charclass_class*)RExC_rx->data->data[n],
struct regnode_charclass_class);
r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
- PL_regdata = r->data; /* for regprop() */
DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
- regprop(sv, (regnode*)data.start_class);
+ regprop(r, sv, (regnode*)data.start_class);
PerlIO_printf(Perl_debug_log,
- "synthetic stclass `%s'.\n",
- SvPVX(sv));});
+ "synthetic stclass \"%s\".\n",
+ SvPVX_const(sv));});
}
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
data.last_closep = &last_close;
minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
- = r->float_substr = r->float_utf8 = Nullsv;
+ = r->float_substr = r->float_utf8 = NULL;
if (!(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class))
{
const I32 n = add_data(pRExC_state, 1, "f");
- New(1006, RExC_rx->data->data[n], 1,
+ Newx(RExC_rx->data->data[n], 1,
struct regnode_charclass_class);
StructCopy(data.start_class,
(struct regnode_charclass_class*)RExC_rx->data->data[n],
r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
- regprop(sv, (regnode*)data.start_class);
+ regprop(r, sv, (regnode*)data.start_class);
PerlIO_printf(Perl_debug_log,
- "synthetic stclass `%s'.\n",
- SvPVX(sv));});
+ "synthetic stclass \"%s\".\n",
+ SvPVX_const(sv));});
}
}
r->reganch |= ROPT_EVAL_SEEN;
if (RExC_seen & REG_SEEN_CANY)
r->reganch |= ROPT_CANY_SEEN;
- Newz(1002, r->startp, RExC_npar, I32);
- Newz(1002, r->endp, RExC_npar, I32);
- PL_regdata = r->data; /* for regprop() */
- DEBUG_COMPILE_r(regdump(r));
+ Newxz(r->startp, RExC_npar, I32);
+ Newxz(r->endp, RExC_npar, I32);
+ DEBUG_COMPILE_r({
+ if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE))
+ PerlIO_printf(Perl_debug_log,"Final program:\n");
+ regdump(r);
+ });
return(r);
}
+
+#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
+ int rem=(int)(RExC_end - RExC_parse); \
+ int cut; \
+ int num; \
+ int iscut=0; \
+ if (rem>10) { \
+ rem=10; \
+ iscut=1; \
+ } \
+ cut=10-rem; \
+ if (RExC_lastparse!=RExC_parse) \
+ PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
+ rem, RExC_parse, \
+ cut + 4, \
+ iscut ? "..." : "<" \
+ ); \
+ else \
+ PerlIO_printf(Perl_debug_log,"%16s",""); \
+ \
+ if (SIZE_ONLY) \
+ num=RExC_size; \
+ else \
+ num=REG_NODE_NUM(RExC_emit); \
+ if (RExC_lastnum!=num) \
+ PerlIO_printf(Perl_debug_log,"%4d",num); \
+ else \
+ PerlIO_printf(Perl_debug_log,"%4s",""); \
+ PerlIO_printf(Perl_debug_log,"%*s%-4s", \
+ 10+(depth*2),"", \
+ (funcname) \
+ ); \
+ RExC_lastnum=num; \
+ RExC_lastparse=RExC_parse; \
+})
+
+#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
+ DEBUG_PARSE_MSG((funcname)); \
+ PerlIO_printf(Perl_debug_log,"%4s","\n"); \
+})
/*
- reg - regular expression, i.e. main body or parenthesized thing
*
* is a trifle forced, but the need to tie the tails of the branches to what
* follows makes it hard to avoid.
*/
+#define REGTAIL(x,y,z) regtail(x,y,z,depth+1)
+
STATIC regnode *
-S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
+S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
{
dVAR;
register regnode *ret; /* Will be the head of the group. */
register regnode *br;
register regnode *lastbr;
- register regnode *ender = 0;
+ register regnode *ender = NULL;
register I32 parno = 0;
- I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
+ I32 flags;
+ const I32 oregflags = RExC_flags;
+ bool have_branch = 0;
+ bool is_open = 0;
/* for (?g), (?gc), and (?o) warnings; warning
about (?c) will warn about (?g) -- japhy */
- I32 wastedflags = 0x00,
- wasted_o = 0x01,
- wasted_g = 0x02,
- wasted_gc = 0x02 | 0x04,
- wasted_c = 0x04;
+#define WASTED_O 0x01
+#define WASTED_G 0x02
+#define WASTED_C 0x04
+#define WASTED_GC (0x02|0x04)
+ I32 wastedflags = 0x00;
char * parse_start = RExC_parse; /* MJD */
- char *oregcomp_parse = RExC_parse;
- char c;
+ char * const oregcomp_parse = RExC_parse;
+
+ GET_RE_DEBUG_FLAGS_DECL;
+ DEBUG_PARSE("reg ");
+
*flagp = 0; /* Tentatively. */
if (*RExC_parse == '?') { /* (?...) */
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
- int logical = 0;
- char *seqstart = RExC_parse;
+ bool is_logical = 0;
+ const char * const seqstart = RExC_parse;
RExC_parse++;
paren = *RExC_parse++;
vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
/* FALL THROUGH*/
case '?': /* (??...) */
- logical = 1;
+ is_logical = 1;
if (*RExC_parse != '{')
goto unknown;
paren = *RExC_parse++;
I32 count = 1, n = 0;
char c;
char *s = RExC_parse;
- SV *sv;
- OP_4tree *sop, *rop;
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_EVAL;
while (count && (c = *RExC_parse)) {
- if (c == '\\' && RExC_parse[1])
- RExC_parse++;
+ if (c == '\\') {
+ if (RExC_parse[1])
+ RExC_parse++;
+ }
else if (c == '{')
count++;
else if (c == '}')
count--;
RExC_parse++;
}
- if (*RExC_parse != ')')
- {
+ if (*RExC_parse != ')') {
RExC_parse = s;
vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
}
if (!SIZE_ONLY) {
PAD *pad;
-
- if (RExC_parse - 1 - s)
- sv = newSVpvn(s, RExC_parse - 1 - s);
- else
- sv = newSVpvn("", 0);
+ OP_4tree *sop, *rop;
+ SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
ENTER;
Perl_save_re_context(aTHX);
FAIL("Eval-group not allowed at runtime, use re 'eval'");
if (PL_tainting && PL_tainted)
FAIL("Eval-group in insecure regular expression");
+#if PERL_VERSION > 8
if (IN_PERL_COMPILETIME)
PL_cv_has_eval = 1;
+#endif
}
nextchar(pRExC_state);
- if (logical) {
+ if (is_logical) {
ret = reg_node(pRExC_state, LOGICAL);
if (!SIZE_ONLY)
ret->flags = 2;
- regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
+ REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
/* deal with the length of this later - MJD */
return ret;
}
ret = reg_node(pRExC_state, LOGICAL);
if (!SIZE_ONLY)
ret->flags = 1;
- regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
+ REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
goto insert_if;
}
}
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
/* (?(1)...) */
+ char c;
parno = atoi(RExC_parse++);
while (isDIGIT(*RExC_parse))
if ((c = *nextchar(pRExC_state)) != ')')
vFAIL("Switch condition not recognized");
insert_if:
- regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
- br = regbranch(pRExC_state, &flags, 1);
+ REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
+ br = regbranch(pRExC_state, &flags, 1,depth+1);
if (br == NULL)
br = reganode(pRExC_state, LONGJMP, 0);
else
- regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
+ REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
c = *nextchar(pRExC_state);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
if (c == '|') {
lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
- regbranch(pRExC_state, &flags, 1);
- regtail(pRExC_state, ret, lastbr);
+ regbranch(pRExC_state, &flags, 1,depth+1);
+ REGTAIL(pRExC_state, ret, lastbr);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
c = *nextchar(pRExC_state);
if (c != ')')
vFAIL("Switch (?(condition)... contains too many branches");
ender = reg_node(pRExC_state, TAIL);
- regtail(pRExC_state, br, ender);
+ REGTAIL(pRExC_state, br, ender);
if (lastbr) {
- regtail(pRExC_state, lastbr, ender);
- regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
+ REGTAIL(pRExC_state, lastbr, ender);
+ REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
}
else
- regtail(pRExC_state, ret, ender);
+ REGTAIL(pRExC_state, ret, ender);
return ret;
}
else {
if (*RExC_parse == 'o' || *RExC_parse == 'g') {
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
- I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
+ const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
if (! (wastedflags & wflagbit) ) {
wastedflags |= wflagbit;
vWARN5(
}
else if (*RExC_parse == 'c') {
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
- if (! (wastedflags & wasted_c) ) {
- wastedflags |= wasted_gc;
+ if (! (wastedflags & WASTED_C) ) {
+ wastedflags |= WASTED_GC;
vWARN3(
RExC_parse + 1,
"Useless (%sc) - %suse /gc modifier",
ret = reganode(pRExC_state, OPEN, parno);
Set_Node_Length(ret, 1); /* MJD */
Set_Node_Offset(ret, RExC_parse); /* MJD */
- open = 1;
+ is_open = 1;
}
}
else /* ! paren */
/* Pick up the branches, linking them together. */
parse_start = RExC_parse; /* MJD */
- br = regbranch(pRExC_state, &flags, 1);
+ br = regbranch(pRExC_state, &flags, 1,depth+1);
/* branch_len = (paren != 0); */
if (br == NULL)
else if (paren == ':') {
*flagp |= flags&SIMPLE;
}
- if (open) { /* Starts with OPEN. */
- regtail(pRExC_state, ret, br); /* OPEN -> first. */
+ if (is_open) { /* Starts with OPEN. */
+ REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
}
else if (paren != '?') /* Not Conditional */
ret = br;
while (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
ender = reganode(pRExC_state, LONGJMP,0);
- regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
+ REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
}
if (SIZE_ONLY)
RExC_extralen += 2; /* Account for LONGJMP. */
nextchar(pRExC_state);
- br = regbranch(pRExC_state, &flags, 0);
+ br = regbranch(pRExC_state, &flags, 0, depth+1);
if (br == NULL)
return(NULL);
- regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
+ REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
ender = reg_node(pRExC_state, END);
break;
}
- regtail(pRExC_state, lastbr, ender);
+ REGTAIL(pRExC_state, lastbr, ender);
- if (have_branch) {
+ if (have_branch && !SIZE_ONLY) {
/* Hook the tails of the branches to the closing node. */
- for (br = ret; br != NULL; br = regnext(br)) {
- regoptail(pRExC_state, br, ender);
+ U8 exact= PSEUDO;
+ for (br = ret; br; br = regnext(br)) {
+ const U8 op = PL_regkind[OP(br)];
+ U8 exact_ret;
+ if (op == BRANCH) {
+ exact_ret=regtail_study(pRExC_state, NEXTOPER(br), ender,depth+1);
+ }
+ else if (op == BRANCHJ) {
+ exact_ret=regtail_study(pRExC_state, NEXTOPER(NEXTOPER(br)), ender,depth+1);
+ }
+ if ( exact == PSEUDO )
+ exact= exact_ret;
+ else if ( exact != exact_ret )
+ exact= 0;
}
}
}
Set_Node_Cur_Length(ret);
Set_Node_Offset(ret, parse_start + 1);
ret->flags = flag;
- regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
+ REGTAIL(pRExC_state, ret, reg_node(pRExC_state, TAIL));
}
}
* Implements the concatenation operator.
*/
STATIC regnode *
-S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
+S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
{
+ dVAR;
register regnode *ret;
register regnode *chain = NULL;
register regnode *latest;
I32 flags = 0, c = 0;
-
+ GET_RE_DEBUG_FLAGS_DECL;
+ DEBUG_PARSE("brnc");
if (first)
ret = NULL;
else {
nextchar(pRExC_state);
while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
flags &= ~TRYAGAIN;
- latest = regpiece(pRExC_state, &flags);
+ latest = regpiece(pRExC_state, &flags,depth+1);
if (latest == NULL) {
if (flags & TRYAGAIN)
continue;
*flagp |= flags&SPSTART;
else {
RExC_naughty++;
- regtail(pRExC_state, chain, latest);
+ REGTAIL(pRExC_state, chain, latest);
}
chain = latest;
c++;
*flagp |= flags&SIMPLE;
}
- return(ret);
+ return ret;
}
/*
* endmarker role is not redundant.
*/
STATIC regnode *
-S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
+S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
+ dVAR;
register regnode *ret;
register char op;
register char *next;
I32 min;
I32 max = REG_INFTY;
char *parse_start;
+ GET_RE_DEBUG_FLAGS_DECL;
+ DEBUG_PARSE("piec");
- ret = regatom(pRExC_state, &flags);
+ ret = regatom(pRExC_state, &flags,depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN)
*flagp |= TRYAGAIN;
op = *RExC_parse;
if (op == '{' && regcurly(RExC_parse)) {
+ const char *maxpos = NULL;
parse_start = RExC_parse; /* MJD */
next = RExC_parse + 1;
- maxpos = Nullch;
while (isDIGIT(*next) || *next == ',') {
if (*next == ',') {
if (maxpos)
Set_Node_Cur_Length(ret);
}
else {
- regnode *w = reg_node(pRExC_state, WHILEM);
+ regnode * const w = reg_node(pRExC_state, WHILEM);
w->flags = 0;
- regtail(pRExC_state, ret, w);
+ REGTAIL(pRExC_state, ret, w);
if (!SIZE_ONLY && RExC_extralen) {
reginsert(pRExC_state, LONGJMP,ret);
reginsert(pRExC_state, NOTHING,ret);
if (!SIZE_ONLY && RExC_extralen)
NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
- regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
+ REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
if (SIZE_ONLY)
RExC_whilem_seen++, RExC_extralen += 3;
RExC_naughty += 4 + RExC_naughty; /* compound interest */
goto do_curly;
}
nest_check:
- if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
+ if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
vWARN3(RExC_parse,
"%.*s matches null string many times",
- RExC_parse - origparse,
+ (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
origparse);
}
if (*RExC_parse == '?') {
nextchar(pRExC_state);
reginsert(pRExC_state, MINMOD, ret);
- regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
+ REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
if (ISMULT2(RExC_parse)) {
RExC_parse++;
* faster to run. Backslashed characters are exceptions, each becoming a
* separate node; the code is simpler that way and it's not worth fixing.
*
- * [Yes, it is worth fixing, some scripts can run twice the speed.] */
+ * [Yes, it is worth fixing, some scripts can run twice the speed.]
+ * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
+ */
STATIC regnode *
-S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
+S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
- register regnode *ret = 0;
+ dVAR;
+ register regnode *ret = NULL;
I32 flags;
char *parse_start = RExC_parse;
-
+ GET_RE_DEBUG_FLAGS_DECL;
+ DEBUG_PARSE("atom");
*flagp = WORST; /* Tentatively. */
tryagain:
break;
case '[':
{
- char *oregcomp_parse = ++RExC_parse;
- ret = regclass(pRExC_state);
+ char * const oregcomp_parse = ++RExC_parse;
+ ret = regclass(pRExC_state,depth+1);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
}
case '(':
nextchar(pRExC_state);
- ret = reg(pRExC_state, 1, &flags);
+ ret = reg(pRExC_state, 1, &flags,depth+1);
if (ret == NULL) {
if (flags & TRYAGAIN) {
if (RExC_parse == RExC_end) {
case 'p':
case 'P':
{
- char* oldregxend = RExC_end;
+ char* const oldregxend = RExC_end;
char* parse_start = RExC_parse - 2;
if (RExC_parse[1] == '{') {
/* a lovely hack--pretend we saw [\pX] instead */
RExC_end = strchr(RExC_parse, '}');
if (!RExC_end) {
- U8 c = (U8)*RExC_parse;
+ const U8 c = (U8)*RExC_parse;
RExC_parse += 2;
RExC_end = oldregxend;
vFAIL2("Missing right brace on \\%c{}", c);
}
RExC_parse--;
- ret = regclass(pRExC_state);
+ ret = regclass(pRExC_state,depth+1);
RExC_end = oldregxend;
RExC_parse--;
if (num > 9 && num >= RExC_npar)
goto defchar;
else {
- char * parse_start = RExC_parse - 1; /* MJD */
+ char * const parse_start = RExC_parse - 1; /* MJD */
while (isDIGIT(*RExC_parse))
RExC_parse++;
FAIL("Trailing \\");
/* FALL THROUGH */
default:
- /* Do not generate `unrecognized' warnings here, we fall
+ /* Do not generate "unrecognized" warnings here, we fall
back into the quick-grab loop below */
parse_start--;
goto defchar;
case '#':
if (RExC_flags & PMf_EXTENDED) {
- while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
+ while (RExC_parse < RExC_end && *RExC_parse != '\n')
+ RExC_parse++;
if (RExC_parse < RExC_end)
goto tryagain;
}
register STRLEN len;
register UV ender;
register char *p;
- char *oldp, *s;
+ char *s;
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
len < 127 && p < RExC_end;
len++)
{
- oldp = p;
+ char * const oldp = p;
if (RExC_flags & PMf_EXTENDED)
p = regwhite(p, RExC_end);
FAIL("Trailing \\");
/* FALL THROUGH */
default:
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
+ if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
goto normal_default;
}
if (UTF8_IS_START(*p) && UTF) {
STRLEN numlen;
ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
- &numlen, 0);
+ &numlen, UTF8_ALLOW_DEFAULT);
p += numlen;
}
else
if (len)
p = oldp;
else if (UTF) {
- STRLEN unilen;
-
if (FOLD) {
/* Emit all the Unicode characters. */
STRLEN numlen;
foldlen -= numlen) {
ender = utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
- reguni(pRExC_state, ender, s, &unilen);
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
s += unilen;
len += unilen;
/* In EBCDIC the numlen
}
}
else {
- reguni(pRExC_state, ender, s, &unilen);
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
if (unilen > 0) {
s += unilen;
len += unilen;
break;
}
if (UTF) {
- STRLEN unilen;
-
if (FOLD) {
/* Emit all the Unicode characters. */
STRLEN numlen;
foldlen -= numlen) {
ender = utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
- reguni(pRExC_state, ender, s, &unilen);
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
len += unilen;
s += unilen;
/* In EBCDIC the numlen
}
}
else {
- reguni(pRExC_state, ender, s, &unilen);
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
if (unilen > 0) {
s += unilen;
len += unilen;
*flagp |= HASWIDTH;
if (len == 1 && UNI_IS_INVARIANT(ender))
*flagp |= SIMPLE;
- if (!SIZE_ONLY)
- STR_LEN(ret) = len;
+
if (SIZE_ONLY)
RExC_size += STR_SZ(len);
- else
+ else {
+ STR_LEN(ret) = len;
RExC_emit += STR_SZ(len);
}
+ }
break;
}
/* If the encoding pragma is in effect recode the text of
* any EXACT-kind nodes. */
- if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
- STRLEN oldlen = STR_LEN(ret);
- SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
+ if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
+ const STRLEN oldlen = STR_LEN(ret);
+ SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
if (RExC_utf8)
SvUTF8_on(sv);
}
STATIC char *
-S_regwhite(pTHX_ char *p, const char *e)
+S_regwhite(char *p, const char *e)
{
while (p < e) {
if (isSPACE(*p))
STATIC I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
{
- char *posixcc = 0;
+ dVAR;
I32 namedclass = OOB_NAMEDCLASS;
if (value == '[' && RExC_parse + 1 < RExC_end &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
POSIXCC(UCHARAT(RExC_parse))) {
const char c = UCHARAT(RExC_parse);
- char* s = RExC_parse++;
+ char* const s = RExC_parse++;
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
RExC_parse++;
/* Grandfather lone [:, [=, [. */
RExC_parse = s;
else {
- const char* t = RExC_parse++; /* skip over the c */
-
+ const char* const t = RExC_parse++; /* skip over the c */
assert(*t == c);
if (UCHARAT(RExC_parse) == ']') {
+ const char *posixcc = s + 1;
RExC_parse++; /* skip over the ending ] */
- posixcc = s + 1;
+
if (*s == ':') {
const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
const I32 skip = t - posixcc;
/* Initially switch on the length of the name. */
switch (skip) {
case 4:
- if (memEQ(posixcc, "word", 4)) {
- /* this is not POSIX, this is the Perl \w */;
- namedclass
- = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
- }
+ if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
+ namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
break;
case 5:
/* Names all of length 5. */
/* Offset 4 gives the best switch position. */
switch (posixcc[4]) {
case 'a':
- if (memEQ(posixcc, "alph", 4)) {
- /* a */
- namedclass
- = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
- }
+ if (memEQ(posixcc, "alph", 4)) /* alpha */
+ namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
break;
case 'e':
- if (memEQ(posixcc, "spac", 4)) {
- /* e */
- namedclass
- = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
- }
+ if (memEQ(posixcc, "spac", 4)) /* space */
+ namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
break;
case 'h':
- if (memEQ(posixcc, "grap", 4)) {
- /* h */
- namedclass
- = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
- }
+ if (memEQ(posixcc, "grap", 4)) /* graph */
+ namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
break;
case 'i':
- if (memEQ(posixcc, "asci", 4)) {
- /* i */
- namedclass
- = complement ? ANYOF_NASCII : ANYOF_ASCII;
- }
+ if (memEQ(posixcc, "asci", 4)) /* ascii */
+ namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
break;
case 'k':
- if (memEQ(posixcc, "blan", 4)) {
- /* k */
- namedclass
- = complement ? ANYOF_NBLANK : ANYOF_BLANK;
- }
+ if (memEQ(posixcc, "blan", 4)) /* blank */
+ namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
break;
case 'l':
- if (memEQ(posixcc, "cntr", 4)) {
- /* l */
- namedclass
- = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
- }
+ if (memEQ(posixcc, "cntr", 4)) /* cntrl */
+ namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
break;
case 'm':
- if (memEQ(posixcc, "alnu", 4)) {
- /* m */
- namedclass
- = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
- }
+ if (memEQ(posixcc, "alnu", 4)) /* alnum */
+ namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
break;
case 'r':
- if (memEQ(posixcc, "lowe", 4)) {
- /* r */
- namedclass
- = complement ? ANYOF_NLOWER : ANYOF_LOWER;
- }
- if (memEQ(posixcc, "uppe", 4)) {
- /* r */
- namedclass
- = complement ? ANYOF_NUPPER : ANYOF_UPPER;
- }
+ if (memEQ(posixcc, "lowe", 4)) /* lower */
+ namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
+ else if (memEQ(posixcc, "uppe", 4)) /* upper */
+ namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
break;
case 't':
- if (memEQ(posixcc, "digi", 4)) {
- /* t */
- namedclass
- = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
- }
- if (memEQ(posixcc, "prin", 4)) {
- /* t */
- namedclass
- = complement ? ANYOF_NPRINT : ANYOF_PRINT;
- }
- if (memEQ(posixcc, "punc", 4)) {
- /* t */
- namedclass
- = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
- }
+ if (memEQ(posixcc, "digi", 4)) /* digit */
+ namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
+ else if (memEQ(posixcc, "prin", 4)) /* print */
+ namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
+ else if (memEQ(posixcc, "punc", 4)) /* punct */
+ namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
break;
}
break;
case 6:
- if (memEQ(posixcc, "xdigit", 6)) {
- namedclass
- = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
- }
+ if (memEQ(posixcc, "xdigit", 6))
+ namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
break;
}
if (namedclass == OOB_NAMEDCLASS)
- {
Simple_vFAIL3("POSIX class [:%.*s:] unknown",
t - s - 1, s + 1);
- }
assert (posixcc[skip] == ':');
assert (posixcc[skip+1] == ']');
} else if (!SIZE_ONLY) {
STATIC void
S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
- if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
+ dVAR;
+ if (POSIXCC(UCHARAT(RExC_parse))) {
const char *s = RExC_parse;
const char c = *s++;
- while(*s && isALNUM(*s))
+ while (isALNUM(*s))
s++;
if (*s && c == *s && s[1] == ']') {
if (ckWARN(WARN_REGEXP))
/* adjust RExC_parse so the error shows after
the class closes */
while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
- ;
+ NOOP;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
}
}
}
+
+/*
+ parse a class specification and produce either an ANYOF node that
+ matches the pattern. If the pattern matches a single char only and
+ that char is < 256 then we produce an EXACT node instead.
+*/
STATIC regnode *
-S_regclass(pTHX_ RExC_state_t *pRExC_state)
+S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
{
+ dVAR;
register UV value;
register UV nextvalue;
register IV prevvalue = OOB_UNICODE;
register regnode *ret;
STRLEN numlen;
IV namedclass;
- char *rangebegin = 0;
+ char *rangebegin = NULL;
bool need_class = 0;
- SV *listsv = Nullsv;
- register char *e;
+ SV *listsv = NULL;
UV n;
bool optimize_invert = TRUE;
- AV* unicode_alternate = 0;
+ AV* unicode_alternate = NULL;
#ifdef EBCDIC
UV literal_endpoint = 0;
#endif
+ UV stored = 0; /* number of chars stored in the class */
+ regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
+ case we need to change the emitted regop to an EXACT. */
+ GET_RE_DEBUG_FLAGS_DECL;
+ DEBUG_PARSE("clas");
+
+ /* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state, ANYOF, 0);
if (!SIZE_ONLY)
ANYOF_FLAGS(ret) |= ANYOF_INVERT;
}
- if (SIZE_ONLY)
+ if (SIZE_ONLY) {
RExC_size += ANYOF_SKIP;
+ listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
+ }
else {
RExC_emit += ANYOF_SKIP;
if (FOLD)
if (LOC)
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ANYOF_BITMAP_ZERO(ret);
- listsv = newSVpvn("# comment\n", 10);
+ listsv = newSVpvs("# comment\n");
}
nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
- &numlen, 0);
+ &numlen, UTF8_ALLOW_DEFAULT);
RExC_parse += numlen;
}
else
value = UCHARAT(RExC_parse++);
+
nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
if (value == '[' && POSIXCC(nextvalue))
namedclass = regpposixcc(pRExC_state, value);
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
- &numlen, 0);
+ &numlen, UTF8_ALLOW_DEFAULT);
RExC_parse += numlen;
}
else
case 'D': namedclass = ANYOF_NDIGIT; break;
case 'p':
case 'P':
+ {
+ char *e;
if (RExC_parse >= RExC_end)
vFAIL2("Empty \\%c{}", (U8)value);
if (*RExC_parse == '{') {
n--;
}
}
- if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv,
- "+utf8::%.*s\n", (int)n, RExC_parse);
- else
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", (int)n, RExC_parse);
+ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
+ (value=='p' ? '+' : '!'), (int)n, RExC_parse);
}
RExC_parse = e + 1;
ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
namedclass = ANYOF_MAX; /* no official name, but it's named */
+ }
break;
case 'n': value = '\n'; break;
case 'r': value = '\r'; break;
if (*RExC_parse == '{') {
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX;
- e = strchr(RExC_parse++, '}');
+ char * const e = strchr(RExC_parse++, '}');
if (!e)
vFAIL("Missing right brace on \\x{}");
break;
}
default:
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
+ if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
vWARN2(RExC_parse,
"Unrecognized escape \\%c in character class passed through",
(int)value);
/* a bad range like a-\d, a-[:digit:] ? */
if (range) {
if (!SIZE_ONLY) {
- if (ckWARN(WARN_REGEXP))
+ if (ckWARN(WARN_REGEXP)) {
+ const int w =
+ RExC_parse >= rangebegin ?
+ RExC_parse - rangebegin : 0;
vWARN4(RExC_parse,
"False [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
+ w, w, rangebegin);
+ }
if (prevvalue < 256) {
ANYOF_BITMAP_SET(ret, prevvalue);
ANYOF_BITMAP_SET(ret, '-');
if (range) {
if (prevvalue > (IV)value) /* b-a */ {
- Simple_vFAIL4("Invalid [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
+ const int w = RExC_parse - rangebegin;
+ Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
range = 0; /* not a valid range */
}
}
/* a bad range like \w-, [:word:]- ? */
if (namedclass > OOB_NAMEDCLASS) {
- if (ckWARN(WARN_REGEXP))
+ if (ckWARN(WARN_REGEXP)) {
+ const int w =
+ RExC_parse >= rangebegin ?
+ RExC_parse - rangebegin : 0;
vWARN4(RExC_parse,
"False [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
+ w, w, rangebegin);
+ }
if (!SIZE_ONLY)
ANYOF_BITMAP_SET(ret, '-');
} else
}
/* now is the next time */
+ stored += (value - prevvalue + 1);
if (!SIZE_ONLY) {
- IV i;
-
if (prevvalue < 256) {
const IV ceilvalue = value < 256 ? value : 255;
-
+ IV i;
#ifdef EBCDIC
/* In EBCDIC [\x89-\x91] should include
* the \x8e but [i-j] should not. */
RExC_emit += ANYOF_CLASS_ADD_SKIP;
}
+
+ if (SIZE_ONLY)
+ return ret;
+ /****** !SIZE_ONLY AFTER HERE *********/
+
+ if( stored == 1 && value < 256
+ && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
+ ) {
+ /* optimize single char class to an EXACT node
+ but *only* when its not a UTF/high char */
+ RExC_emit = orig_emit;
+ ret = reg_node(pRExC_state,
+ (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
+ *STRING(ret)= (char)value;
+ STR_LEN(ret)= 1;
+ RExC_emit += STR_SZ(1);
+ return ret;
+ }
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
- if (!SIZE_ONLY &&
- /* If the only flag is folding (plus possibly inversion). */
+ if ( /* If the only flag is folding (plus possibly inversion). */
((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
) {
for (value = 0; value < 256; ++value) {
}
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && optimize_invert &&
+ if (optimize_invert &&
/* If the only flag is inversion. */
(ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
}
-
- if (!SIZE_ONLY) {
- AV *av = newAV();
+ {
+ AV * const av = newAV();
SV *rv;
-
/* The 0th element stores the character class description
* in its textual form: used later (regexec.c:Perl_regclass_swash())
* to initialize the appropriate swash (which gets stored in
RExC_rx->data->data[n] = (void*)rv;
ARG_SET(ret, n);
}
-
return ret;
}
STATIC char*
S_nextchar(pTHX_ RExC_state_t *pRExC_state)
{
- char* retval = RExC_parse++;
+ char* const retval = RExC_parse++;
for (;;) {
if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
+ dVAR;
register regnode *ptr;
regnode * const ret = RExC_emit;
RExC_size += 1;
return(ret);
}
-
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE(ptr, op);
STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
+ dVAR;
register regnode *ptr;
regnode * const ret = RExC_emit;
/*
- reguni - emit (if appropriate) a Unicode character
*/
-STATIC void
-S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
+STATIC STRLEN
+S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
{
- *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
+ dVAR;
+ return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
}
/*
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
{
+ dVAR;
register regnode *src;
register regnode *dst;
register regnode *place;
/*
- regtail - set the next-pointer at the end of a node chain of p to val.
+- SEE ALSO: regtail_study
*/
+/* TODO: All three parms should be const */
STATIC void
-S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
+S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
+ dVAR;
register regnode *scan;
+ GET_RE_DEBUG_FLAGS_DECL;
if (SIZE_ONLY)
return;
scan = p;
for (;;) {
regnode * const temp = regnext(scan);
- if (temp == NULL)
- break;
- scan = temp;
+ DEBUG_PARSE_r({
+ SV * const mysv=sv_newmortal();
+ DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
+ regprop(RExC_rx, mysv, scan);
+ PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
+ SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
+ });
+ if (temp == NULL)
+ break;
+ scan = temp;
}
if (reg_off_by_arg[OP(scan)]) {
- ARG_SET(scan, val - scan);
+ ARG_SET(scan, val - scan);
}
else {
- NEXT_OFF(scan) = val - scan;
+ NEXT_OFF(scan) = val - scan;
}
}
/*
-- regoptail - regtail on operand of first argument; nop if operandless
+- regtail_study - set the next-pointer at the end of a node chain of p to val.
+- Look for optimizable sequences at the same time.
+- currently only looks for EXACT chains.
*/
-STATIC void
-S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
+/* TODO: All four parms should be const */
+STATIC U8
+S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
- /* "Operandless" and "op != BRANCH" are synonymous in practice. */
- if (p == NULL || SIZE_ONLY)
- return;
- if (PL_regkind[(U8)OP(p)] == BRANCH) {
- regtail(pRExC_state, NEXTOPER(p), val);
+ dVAR;
+ register regnode *scan;
+ U8 exact= PSEUDO;
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ if (SIZE_ONLY)
+ return exact;
+
+ /* Find last node. */
+
+ scan = p;
+ for (;;) {
+ regnode * const temp = regnext(scan);
+ if ( exact ) {
+ switch (OP(scan)) {
+ case EXACT:
+ case EXACTF:
+ case EXACTFL:
+ if( exact == PSEUDO )
+ exact= OP(scan);
+ else if ( exact != OP(scan) )
+ exact= 0;
+ case NOTHING:
+ break;
+ default:
+ exact= 0;
+ }
+ }
+ DEBUG_PARSE_r({
+ SV * const mysv=sv_newmortal();
+ DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
+ regprop(RExC_rx, mysv, scan);
+ PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
+ SvPV_nolen_const(mysv),
+ reg_name[exact],
+ REG_NODE_NUM(scan));
+ });
+ if (temp == NULL)
+ break;
+ scan = temp;
}
- else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
- regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
+
+ if (reg_off_by_arg[OP(scan)]) {
+ ARG_SET(scan, val - scan);
}
- else
- return;
+ else {
+ NEXT_OFF(scan) = val - scan;
+ }
+
+ return exact;
}
/*
- regcurly - a little FSA that accepts {\d+,?\d*}
*/
STATIC I32
-S_regcurly(pTHX_ register const char *s)
+S_regcurly(register const char *s)
{
if (*s++ != '{')
return FALSE;
}
-#ifdef DEBUGGING
-
-STATIC regnode *
-S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
-{
- register U8 op = EXACT; /* Arbitrary non-END op. */
- register regnode *next;
-
- while (op != END && (!last || node < last)) {
- /* While that wasn't END last time... */
-
- NODE_ALIGN(node);
- op = OP(node);
- if (op == CLOSE)
- l--;
- next = regnext(node);
- /* Where, what. */
- if (OP(node) == OPTIMIZED)
- goto after_print;
- regprop(sv, node);
- PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
- (int)(2*l + 1), "", SvPVX(sv));
- if (next == NULL) /* Next ptr. */
- PerlIO_printf(Perl_debug_log, "(0)");
- else
- PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
- (void)PerlIO_putc(Perl_debug_log, '\n');
- after_print:
- if (PL_regkind[(U8)op] == BRANCHJ) {
- register regnode *nnode = (OP(next) == LONGJMP
- ? regnext(next)
- : next);
- if (last && nnode > last)
- nnode = last;
- node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
- }
- else if (PL_regkind[(U8)op] == BRANCH) {
- node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
- }
- else if ( PL_regkind[(U8)op] == TRIE ) {
- const I32 n = ARG(node);
- const reg_trie_data *trie = (reg_trie_data*)PL_regdata->data[n];
- const I32 arry_len = av_len(trie->words)+1;
- I32 word_idx;
- PerlIO_printf(Perl_debug_log,
- "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
- (int)(2*(l+3)),
- "",
- trie->wordcount,
- (int)trie->charcount,
- trie->uniquecharcount,
- (IV)trie->laststate-1,
- node->flags ? " EVAL mode" : "");
-
- for (word_idx=0; word_idx < arry_len; word_idx++) {
- SV **elem_ptr=av_fetch(trie->words,word_idx,0);
- if (elem_ptr) {
- PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
- (int)(2*(l+4)), "",
- PL_colors[0],
- SvPV_nolen(*elem_ptr),
- PL_colors[1]
- );
- /*
- if (next == NULL)
- PerlIO_printf(Perl_debug_log, "(0)\n");
- else
- PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
- */
- }
-
- }
-
- node = NEXTOPER(node);
- node += regarglen[(U8)op];
-
- }
- else if ( op == CURLY) { /* `next' might be very big: optimizer */
- node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
- NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
- }
- else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
- node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
- next, sv, l + 1);
- }
- else if ( op == PLUS || op == STAR) {
- node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
- }
- else if (op == ANYOF) {
- /* arglen 1 + class block */
- node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
- ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
- node = NEXTOPER(node);
- }
- else if (PL_regkind[(U8)op] == EXACT) {
- /* Literal string, where present. */
- node += NODE_SZ_STR(node) - 1;
- node = NEXTOPER(node);
- }
- else {
- node = NEXTOPER(node);
- node += regarglen[(U8)op];
- }
- if (op == CURLYX || op == OPEN)
- l++;
- else if (op == WHILEM)
- l--;
- }
- return node;
-}
-
-#endif /* DEBUGGING */
-
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
void
-Perl_regdump(pTHX_ regexp *r)
+Perl_regdump(pTHX_ const regexp *r)
{
#ifdef DEBUGGING
- SV *sv = sv_newmortal();
+ dVAR;
+ SV * const sv = sv_newmortal();
- (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
+ (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
/* Header fields of interest. */
if (r->anchored_substr)
PerlIO_printf(Perl_debug_log,
- "anchored `%s%.*s%s'%s at %"IVdf" ",
+ "anchored \"%s%.*s%s\"%s at %"IVdf" ",
PL_colors[0],
(int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
- SvPVX(r->anchored_substr),
+ SvPVX_const(r->anchored_substr),
PL_colors[1],
SvTAIL(r->anchored_substr) ? "$" : "",
(IV)r->anchored_offset);
else if (r->anchored_utf8)
PerlIO_printf(Perl_debug_log,
- "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
+ "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
PL_colors[0],
(int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
- SvPVX(r->anchored_utf8),
+ SvPVX_const(r->anchored_utf8),
PL_colors[1],
SvTAIL(r->anchored_utf8) ? "$" : "",
(IV)r->anchored_offset);
if (r->float_substr)
PerlIO_printf(Perl_debug_log,
- "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+ "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
PL_colors[0],
(int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
- SvPVX(r->float_substr),
+ SvPVX_const(r->float_substr),
PL_colors[1],
SvTAIL(r->float_substr) ? "$" : "",
(IV)r->float_min_offset, (UV)r->float_max_offset);
else if (r->float_utf8)
PerlIO_printf(Perl_debug_log,
- "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+ "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
PL_colors[0],
(int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
- SvPVX(r->float_utf8),
+ SvPVX_const(r->float_utf8),
PL_colors[1],
SvTAIL(r->float_utf8) ? "$" : "",
(IV)r->float_min_offset, (UV)r->float_max_offset);
PerlIO_printf(Perl_debug_log, ") ");
if (r->regstclass) {
- regprop(sv, r->regstclass);
- PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
+ regprop(r, sv, r->regstclass);
+ PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
}
if (r->reganch & ROPT_ANCH) {
PerlIO_printf(Perl_debug_log, "anchored");
PerlIO_printf(Perl_debug_log, "\n");
});
}
+#else
+ PERL_UNUSED_CONTEXT;
+ PERL_UNUSED_ARG(r);
#endif /* DEBUGGING */
}
-#ifdef DEBUGGING
-
-STATIC void
-S_put_byte(pTHX_ SV *sv, int c)
-{
- if (isCNTRL(c) || c == 255 || !isPRINT(c))
- Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
- else if (c == '-' || c == ']' || c == '\\' || c == '^')
- Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
- else
- Perl_sv_catpvf(aTHX_ sv, "%c", c);
-}
-
-#endif /* DEBUGGING */
-
-
/*
- regprop - printable representation of opcode
*/
void
-Perl_regprop(pTHX_ SV *sv, const regnode *o)
+Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
{
#ifdef DEBUGGING
+ dVAR;
register int k;
sv_setpvn(sv, "", 0);
Perl_croak(aTHX_ "Corrupted regexp opcode");
sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
- k = PL_regkind[(U8)OP(o)];
+ k = PL_regkind[OP(o)];
if (k == EXACT) {
- SV *dsv = sv_2mortal(newSVpvn("", 0));
+ SV * const dsv = sv_2mortal(newSVpvs(""));
/* Using is_utf8_string() is a crude hack but it may
* be the best for now since we have no flag "this EXACTish
* node was UTF-8" --jhi */
const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
- const char *s = do_utf8 ?
+ const char * const s = do_utf8 ?
pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
UNI_DISPLAY_REGEX) :
STRING(o);
PL_colors[0],
len, s,
PL_colors[1]);
- } else if (k == TRIE) {/*
- this isn't always safe, as Pl_regdata may not be for this regex yet
- (depending on where its called from) so its being moved to dumpuntil
- I32 n = ARG(o);
- reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
- Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
- trie->wordcount,
- trie->charcount,
- trie->uniquecharcount,
- trie->laststate);
- */
+ } else if (k == TRIE) {
+ Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
+ /* print the details of the trie in dumpuntil instead, as
+ * prog->data isn't available here */
} else if (k == CURLY) {
if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
int i, rangestart = -1;
- U8 flags = ANYOF_FLAGS(o);
- const char * const anyofs[] = { /* Should be synchronized with
- * ANYOF_ #xdefines in regcomp.h */
+ const U8 flags = ANYOF_FLAGS(o);
+
+ /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
+ static const char * const anyofs[] = {
"\\w",
"\\W",
"\\s",
};
if (flags & ANYOF_LOCALE)
- sv_catpv(sv, "{loc}");
+ sv_catpvs(sv, "{loc}");
if (flags & ANYOF_FOLD)
- sv_catpv(sv, "{i}");
+ sv_catpvs(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
- sv_catpv(sv, "^");
+ sv_catpvs(sv, "^");
for (i = 0; i <= 256; i++) {
if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
if (rangestart == -1)
put_byte(sv, rangestart);
else {
put_byte(sv, rangestart);
- sv_catpv(sv, "-");
+ sv_catpvs(sv, "-");
put_byte(sv, i - 1);
}
rangestart = -1;
}
if (o->flags & ANYOF_CLASS)
- for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
+ for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
if (ANYOF_CLASS_TEST(o,i))
sv_catpv(sv, anyofs[i]);
if (flags & ANYOF_UNICODE)
- sv_catpv(sv, "{unicode}");
+ sv_catpvs(sv, "{unicode}");
else if (flags & ANYOF_UNICODE_ALL)
- sv_catpv(sv, "{unicode_all}");
+ sv_catpvs(sv, "{unicode_all}");
{
SV *lv;
- SV *sw = regclass_swash(o, FALSE, &lv, 0);
+ SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
if (lv) {
if (sw) {
if (rangestart == -1)
rangestart = i;
} else if (rangestart != -1) {
- U8 *p;
-
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
- U8 *e;
- for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
+ const U8 * const e = uvchr_to_utf8(s,rangestart);
+ U8 *p;
+ for(p = s; p < e; p++)
put_byte(sv, *p);
}
else {
- U8 *e;
- for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
+ const U8 *e = uvchr_to_utf8(s,rangestart);
+ U8 *p;
+ for (p = s; p < e; p++)
put_byte(sv, *p);
- sv_catpv(sv, "-");
- for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
+ sv_catpvs(sv, "-");
+ e = uvchr_to_utf8(s, i-1);
+ for (p = s; p < e; p++)
put_byte(sv, *p);
}
rangestart = -1;
}
}
- sv_catpv(sv, "..."); /* et cetera */
+ sv_catpvs(sv, "..."); /* et cetera */
}
{
char *s = savesvpv(lv);
- char *origs = s;
+ char * const origs = s;
- while(*s && *s != '\n') s++;
+ while (*s && *s != '\n')
+ s++;
if (*s == '\n') {
- const char *t = ++s;
+ const char * const t = ++s;
while (*s) {
if (*s == '\n')
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
+#else
+ PERL_UNUSED_CONTEXT;
+ PERL_UNUSED_ARG(sv);
+ PERL_UNUSED_ARG(o);
#endif /* DEBUGGING */
}
SV *
Perl_re_intuit_string(pTHX_ regexp *prog)
{ /* Assume that RE_INTUIT is set */
+ dVAR;
GET_RE_DEBUG_FLAGS_DECL;
+ PERL_UNUSED_CONTEXT;
+
DEBUG_COMPILE_r(
- { STRLEN n_a;
- const char *s = SvPV(prog->check_substr
- ? prog->check_substr : prog->check_utf8, n_a);
+ {
+ const char * const s = SvPV_nolen_const(prog->check_substr
+ ? prog->check_substr : prog->check_utf8);
if (!PL_colorset) reginitcolors();
PerlIO_printf(Perl_debug_log,
- "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
+ "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
PL_colors[4],
prog->check_substr ? "" : "utf8 ",
PL_colors[5],PL_colors[0],
{
dVAR;
#ifdef DEBUGGING
- SV *dsv = PERL_DEBUG_PAD_ZERO(0);
- SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
+ SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
#endif
-
+ GET_RE_DEBUG_FLAGS_DECL;
if (!r || (--r->refcnt > 0))
return;
DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
- const char *s = (r->reganch & ROPT_UTF8)
+ const char * const s = (r->reganch & ROPT_UTF8)
? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
: pv_display(dsv, r->precomp, r->prelen, 0, 60);
const int len = SvCUR(dsv);
len > 60 ? "..." : "");
});
- if (r->precomp)
- Safefree(r->precomp);
- if (r->offsets) /* 20010421 MJD */
- Safefree(r->offsets);
+ /* gcov results gave these as non-null 100% of the time, so there's no
+ optimisation in checking them before calling Safefree */
+ Safefree(r->precomp);
+ Safefree(r->offsets); /* 20010421 MJD */
RX_MATCH_COPY_FREE(r);
-#ifdef PERL_COPY_ON_WRITE
+#ifdef PERL_OLD_COPY_ON_WRITE
if (r->saved_copy)
SvREFCNT_dec(r->saved_copy);
#endif
Perl_croak(aTHX_ "panic: pregfree comppad");
PAD_SAVE_LOCAL(old_comppad,
/* Watch out for global destruction's random ordering. */
- (SvTYPE(new_comppad) == SVt_PVAV) ?
- new_comppad : Null(PAD *)
+ (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
);
OP_REFCNT_LOCK;
refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
break;
case 't':
{
- reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
+ reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
U32 refcount;
OP_REFCNT_LOCK;
- refcount = trie->refcount--;
+ refcount = --trie->refcount;
OP_REFCNT_UNLOCK;
if ( !refcount ) {
- if (trie->charmap)
- Safefree(trie->charmap);
+ Safefree(trie->charmap);
if (trie->widecharmap)
SvREFCNT_dec((SV*)trie->widecharmap);
- if (trie->states)
- Safefree(trie->states);
- if (trie->trans)
- Safefree(trie->trans);
+ Safefree(trie->states);
+ Safefree(trie->trans);
+ if (trie->bitmap)
+ Safefree(trie->bitmap);
#ifdef DEBUGGING
if (trie->words)
SvREFCNT_dec((SV*)trie->words);
Safefree(r);
}
+#ifndef PERL_IN_XSUB_RE
/*
- regnext - dig the "next" pointer out of a node
*/
regnode *
Perl_regnext(pTHX_ register regnode *p)
{
+ dVAR;
register I32 offset;
if (p == &PL_regdummy)
return(p+offset);
}
+#endif
STATIC void
S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
#endif
msv = vmess(buf, &args);
va_end(args);
- message = SvPV(msv,l1);
+ message = SvPV_const(msv,l1);
if (l1 > 512)
l1 = 512;
Copy(message, buf, l1 , char);
/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
+#ifndef PERL_IN_XSUB_RE
void
Perl_save_re_context(pTHX)
{
- SAVEI32(PL_reg_flags); /* from regexec.c */
- SAVEPPTR(PL_bostr);
- SAVEPPTR(PL_reginput); /* String-input pointer. */
- SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
- SAVEPPTR(PL_regeol); /* End of input, for $ check. */
- SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
- SAVEVPTR(PL_regendp); /* Ditto for endp. */
- SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
- SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
- SAVEPPTR(PL_regtill); /* How far we are required to go. */
- SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
+ dVAR;
+
+ struct re_save_state *state;
+
+ SAVEVPTR(PL_curcop);
+ SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
+
+ state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
+ PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
+ SSPUSHINT(SAVEt_RE_STATE);
+
+ Copy(&PL_reg_state, state, 1, struct re_save_state);
+
PL_reg_start_tmp = 0;
- SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
PL_reg_start_tmpl = 0;
- SAVEVPTR(PL_regdata);
- SAVEI32(PL_reg_eval_set); /* from regexec.c */
- SAVEI32(PL_regnarrate); /* from regexec.c */
- SAVEVPTR(PL_regprogram); /* from regexec.c */
- SAVEINT(PL_regindent); /* from regexec.c */
- SAVEVPTR(PL_regcc); /* from regexec.c */
- SAVEVPTR(PL_curcop);
- SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
- SAVEVPTR(PL_reg_re); /* from regexec.c */
- SAVEPPTR(PL_reg_ganch); /* from regexec.c */
- SAVESPTR(PL_reg_sv); /* from regexec.c */
- SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
- SAVEVPTR(PL_reg_magic); /* from regexec.c */
- SAVEI32(PL_reg_oldpos); /* from regexec.c */
- SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
- SAVEVPTR(PL_reg_curpm); /* from regexec.c */
- SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
- PL_reg_oldsaved = Nullch;
- SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
+ PL_reg_oldsaved = NULL;
PL_reg_oldsavedlen = 0;
-#ifdef PERL_COPY_ON_WRITE
- SAVESPTR(PL_nrs);
- PL_nrs = Nullsv;
-#endif
- SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
PL_reg_maxiter = 0;
- SAVEI32(PL_reg_leftiter); /* wait until caching pos */
PL_reg_leftiter = 0;
- SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
- PL_reg_poscache = Nullch;
- SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
+ PL_reg_poscache = NULL;
PL_reg_poscache_size = 0;
- SAVEPPTR(PL_regprecomp); /* uncompiled string. */
- SAVEI32(PL_regnpar); /* () count. */
- SAVEI32(PL_regsize); /* from regexec.c */
-
- {
- /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
- REGEXP *rx;
+#ifdef PERL_OLD_COPY_ON_WRITE
+ PL_nrs = NULL;
+#endif
- if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
+ /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
+ if (PL_curpm) {
+ const REGEXP * const rx = PM_GETRE(PL_curpm);
+ if (rx) {
U32 i;
for (i = 1; i <= rx->nparens; i++) {
- GV *mgv;
char digits[TYPE_CHARS(long)];
- sprintf(digits, "%lu", (long)i);
- if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
- save_scalar(mgv);
+ const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
+ GV *const *const gvp
+ = (GV**)hv_fetch(PL_defstash, digits, len, 0);
+
+ if (gvp) {
+ GV * const gv = *gvp;
+ if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
+ save_scalar(gv);
+ }
}
}
}
-
-#ifdef DEBUGGING
- SAVEPPTR(PL_reg_starttry); /* from regexec.c */
-#endif
}
+#endif
static void
clear_re(pTHX_ void *r)
{
+ dVAR;
ReREFCNT_dec((regexp *)r);
}
+#ifdef DEBUGGING
+
+STATIC void
+S_put_byte(pTHX_ SV *sv, int c)
+{
+ if (isCNTRL(c) || c == 255 || !isPRINT(c))
+ Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
+ else if (c == '-' || c == ']' || c == '\\' || c == '^')
+ Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
+ else
+ Perl_sv_catpvf(aTHX_ sv, "%c", c);
+}
+
+
+#define CLEAR_OPTSTART \
+ if (optstart) STMT_START { \
+ PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart); \
+ optstart=NULL; \
+ } STMT_END
+
+#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
+
+STATIC const regnode *
+S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
+ const regnode *last, SV* sv, I32 l)
+{
+ dVAR;
+ register U8 op = EXACT; /* Arbitrary non-END op. */
+ register const regnode *next;
+ const regnode *optstart= NULL;
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ while (op != END && (!last || node < last)) {
+ /* While that wasn't END last time... */
+
+ NODE_ALIGN(node);
+ op = OP(node);
+ if (op == CLOSE)
+ l--;
+ next = regnext((regnode *)node);
+ /* Where, what. */
+ if ( OP(node) == OPTIMIZED) {
+ if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_OPTIMISE))
+ optstart= node;
+ else
+ goto after_print;
+ } else
+ CLEAR_OPTSTART;
+ regprop(r, sv, node);
+ PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
+ (int)(2*l + 1), "", SvPVX_const(sv));
+
+ if (OP(node) != OPTIMIZED) {
+ if (next == NULL) /* Next ptr. */
+ PerlIO_printf(Perl_debug_log, "(0)");
+ else
+ PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
+ (void)PerlIO_putc(Perl_debug_log, '\n');
+ }
+
+ after_print:
+ if (PL_regkind[(U8)op] == BRANCHJ) {
+ register const regnode *nnode = (OP(next) == LONGJMP
+ ? regnext((regnode *)next)
+ : next);
+ if (last && nnode > last)
+ nnode = last;
+ DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
+ }
+ else if (PL_regkind[(U8)op] == BRANCH) {
+ DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
+ }
+ else if ( PL_regkind[(U8)op] == TRIE ) {
+ const I32 n = ARG(node);
+ const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
+ const I32 arry_len = av_len(trie->words)+1;
+ I32 word_idx;
+ PerlIO_printf(Perl_debug_log,
+ "%*s[Start:%d Words:%d Chars:%d Unique:%d States:%"IVdf" Minlen:%d Maxlen:%d",
+ (int)(2*(l+3)),
+ "",
+ trie->startstate,
+ TRIE_WORDCOUNT(trie),
+ (int)TRIE_CHARCOUNT(trie),
+ trie->uniquecharcount,
+ (IV)TRIE_LASTSTATE(trie)-1,
+ trie->minlen, trie->maxlen
+ );
+ if (trie->bitmap) {
+ int i;
+ int rangestart= -1;
+ sv_setpvn(sv, "", 0);
+ for (i = 0; i <= 256; i++) {
+ if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++)
+ put_byte(sv, rangestart);
+ else {
+ put_byte(sv, rangestart);
+ sv_catpvs(sv, "-");
+ put_byte(sv, i - 1);
+ }
+ rangestart = -1;
+ }
+ }
+ PerlIO_printf(Perl_debug_log, " Start-Class:%s]\n", SvPVX_const(sv));
+ } else
+ PerlIO_printf(Perl_debug_log, " No Start-Class]\n");
+
+ for (word_idx=0; word_idx < arry_len; word_idx++) {
+ SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
+ if (elem_ptr) {
+ PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
+ (int)(2*(l+4)), "",
+ PL_colors[0],
+ SvPV_nolen_const(*elem_ptr),
+ PL_colors[1]
+ );
+ /*
+ if (next == NULL)
+ PerlIO_printf(Perl_debug_log, "(0)\n");
+ else
+ PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
+ */
+ }
+
+ }
+
+ node = NEXTOPER(node);
+ node += regarglen[(U8)op];
+
+ }
+ else if ( op == CURLY) { /* "next" might be very big: optimizer */
+ DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
+ NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
+ }
+ else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
+ DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
+ next, sv, l + 1);
+ }
+ else if ( op == PLUS || op == STAR) {
+ DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
+ }
+ else if (op == ANYOF) {
+ /* arglen 1 + class block */
+ node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
+ ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
+ node = NEXTOPER(node);
+ }
+ else if (PL_regkind[(U8)op] == EXACT) {
+ /* Literal string, where present. */
+ node += NODE_SZ_STR(node) - 1;
+ node = NEXTOPER(node);
+ }
+ else {
+ node = NEXTOPER(node);
+ node += regarglen[(U8)op];
+ }
+ if (op == CURLYX || op == OPEN)
+ l++;
+ else if (op == WHILEM)
+ l--;
+ }
+ CLEAR_OPTSTART;
+ return node;
+}
+
+#endif /* DEBUGGING */
+
/*
* Local variables:
* c-indentation-style: bsd