X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.h;h=e3d671d70eef584444f98e6e4b3f3f4bb693d6e6;hb=de8c53012b7e614137ab875e0d58a92474b317ce;hp=535897f97156debba651baf7e0cc3c5955f8beab;hpb=0df25f3d8d51b9b8c7ab6750af674952bc4bb6c2;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.h b/regcomp.h index 535897f..e3d671d 100644 --- a/regcomp.h +++ b/regcomp.h @@ -13,7 +13,14 @@ typedef OP OP_4tree; /* Will be redefined later. */ #define PERL_ENABLE_TRIE_OPTIMISATION 1 #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 +#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 1 #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0 +/* Unless the next line is uncommented it is illegal to combine lazy + matching with possessive matching. Frankly it doesn't make much sense + to allow it as X*?+ matches nothing, X+?+ matches a single char only, + and X{min,max}?+ matches min times only. + */ +/* #define REG_ALLOW_MINMOD_SUSPEND */ /* * The "internal use only" fields in regexp.h are present to pass info from @@ -85,6 +92,8 @@ struct regnode_string { char string[1]; }; +/* Argument bearing node - workhorse, + arg1 is often for the data field */ struct regnode_1 { U8 flags; U8 type; @@ -92,6 +101,16 @@ struct regnode_1 { U32 arg1; }; +/* Similar to a regnode_1 but with an extra signed argument */ +struct regnode_2L { + U8 flags; + U8 type; + U16 next_off; + U32 arg1; + I32 arg2; +}; + +/* 'Two field' -- Two 16 bit unsigned args */ struct regnode_2 { U8 flags; U8 type; @@ -100,9 +119,11 @@ struct regnode_2 { U16 arg2; }; + #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ #define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ +/* also used by trie */ struct regnode_charclass { U8 flags; U8 type; @@ -152,9 +173,12 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ARG(p) ARG_VALUE(ARG_LOC(p)) #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) +#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p)) /* the I32 arg2 field of a struct regnode_2L, passed through ARG_VALUE */ + #define ARG_SET(p,
val) ARG__SET(ARG_LOC(p), (val)) #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) +#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val)) /* hands the arg2 field of a struct regnode_2L and val to ARG__SET */ #undef NEXT_OFF #undef NODE_ALIGN @@ -187,6 +211,8 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) +#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2) /* NOTE(review): p is not parenthesized, same as ARG_LOC/ARG1_LOC/ARG2_LOC, so the cast binds before any operator inside the argument */ + #define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ #define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) @@ -288,7 +314,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ANYOF_BITMAP_ZERO(ret) Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char) #define ANYOF_BITMAP(p) (((struct regnode_charclass*)(p))->bitmap) -#define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[((c) >> 3) & 31]) +#define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[(((U8)(c)) >> 3) & 31]) /* c is now cast to U8 before the shift that selects the byte */ #define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c)) #define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c)) #define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) @@ -305,6 +331,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ANYOF_CLASS_SKIP ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode)) #define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) + /* * Utility definitions.
*/ @@ -322,6 +349,9 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define REG_SEEN_EVAL 0x00000008 #define REG_SEEN_CANY 0x00000010 #define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */ +#define REG_SEEN_RECURSE 0x00000020 /* new flags continue the single-bit sequence after REG_SEEN_CANY (0x10) */ +#define REG_TOP_LEVEL_BRANCHES 0x00000040 +#define REG_SEEN_VERBARG 0x00000080 START_EXTERN_C @@ -358,13 +388,27 @@ EXTCONST U8 PL_simple[] = { }; #endif +#ifndef PLUGGABLE_RE_EXTENSION /* PL_core_reg_engine is declared/defined only when PLUGGABLE_RE_EXTENSION is not defined */ +#ifndef DOINIT /* without DOINIT: declaration only, no initializer */ +EXTCONST regexp_engine PL_core_reg_engine; +#else /* DOINIT */ +EXTCONST regexp_engine PL_core_reg_engine = { /* NOTE(review): entry order presumably has to match the fields of regexp_engine, which is declared elsewhere - confirm */ + Perl_pregcomp, + Perl_regexec_flags, + Perl_re_intuit_start, + Perl_re_intuit_string, + Perl_pregfree, + Perl_reg_stringify, +#if defined(USE_ITHREADS) /* the final entry is compiled in only when USE_ITHREADS is defined */ + Perl_regdupe +#endif +}; +#endif /* DOINIT */ +#endif /* PLUGGABLE_RE_EXTENSION */ + + END_EXTERN_C -typedef struct re_scream_pos_data_s -{ - char **scream_olds; /* match pos */ - I32 *scream_pos; /* Internal iterator of scream. */ -} re_scream_pos_data; /* .what is a character array with one character for each member of .data * The character describes the function of the corresponding .data item: @@ -377,6 +421,7 @@ typedef struct re_scream_pos_data_s * in the character class * t - trie struct * T - aho-trie struct + * S - sv for named capture lookup * 20010712 mjd@plover.com * (Remember to update re_dup() and pregfree() if you add any items.)
*/ @@ -391,6 +436,7 @@ struct reg_substr_datum { I32 max_offset; SV *substr; /* non-utf8 variant */ SV *utf8_substr; /* utf8 variant */ + I32 end_shift; }; struct reg_substr_data { @@ -400,14 +446,19 @@ struct reg_substr_data { #define anchored_substr substrs->data[0].substr #define anchored_utf8 substrs->data[0].utf8_substr #define anchored_offset substrs->data[0].min_offset +#define anchored_end_shift substrs->data[0].end_shift /* end_shift of substrs->data[0], alongside the other anchored_* accessors */ + #define float_substr substrs->data[1].substr #define float_utf8 substrs->data[1].utf8_substr #define float_min_offset substrs->data[1].min_offset #define float_max_offset substrs->data[1].max_offset +#define float_end_shift substrs->data[1].end_shift /* end_shift of substrs->data[1], alongside the other float_* accessors */ + #define check_substr substrs->data[2].substr #define check_utf8 substrs->data[2].utf8_substr #define check_offset_min substrs->data[2].min_offset #define check_offset_max substrs->data[2].max_offset +#define check_end_shift substrs->data[2].end_shift /* end_shift of substrs->data[2], alongside the other check_* accessors */ @@ -454,24 +505,29 @@ typedef struct _reg_trie_trans reg_trie_trans; /* anything in here that needs to be freed later should be dealt with in pregfree */ struct _reg_trie_data { - U16 uniquecharcount; - U32 lasttrans; - U16 *charmap; - HV *widecharmap; - reg_trie_state *states; - reg_trie_trans *trans; - char *bitmap; - U32 refcount; - U32 startstate; - STRLEN minlen; - STRLEN maxlen; - U32 *wordlen; - U32 laststate; /* Build only */ + U16 uniquecharcount; /* unique chars in trie (width of trans table) */ + U32 lasttrans; /* last valid transition element */ + U16 *charmap; /* byte to charid lookup array */ + HV *widecharmap; /* code points > 255 to charid */ + reg_trie_state *states; /* state data */ + reg_trie_trans *trans; /* array of transition elements */ + char *bitmap; /* stclass bitmap */ + U32 refcount; /* number of times this trie is referenced */ + U32 startstate; /* initial state - used for common prefix optimisation */ + STRLEN minlen; /* minimum length of words in trie - build/opt only?
*/ + STRLEN maxlen; /* maximum length of words in trie - build/opt only? */ + U32 *wordlen; /* array of lengths of words */ + U16 *jump; /* optional 1 indexed array of offsets before tail + for the node following a given word. */ + U16 *nextword; /* optional 1 indexed array to support linked list + of duplicate wordnums */ + U32 statecount; /* Build only - number of states in the states array + (including the unused zero state) */ + U32 wordcount; /* Build only */ #ifdef DEBUGGING - U16 wordcount; /* Build only */ - STRLEN charcount; /* Build only */ - AV *words; - AV *revcharmap; + STRLEN charcount; /* Build only */ + AV *words; /* Array of words contained in trie, for dumping */ + AV *revcharmap; /* Map of each charid back to its character representation */ #endif }; typedef struct _reg_trie_data reg_trie_data; @@ -489,11 +545,17 @@ typedef struct _reg_ac_data reg_ac_data; three different sets... */ #define TRIE_BITMAP(p) (((reg_trie_data *)(p))->bitmap) -#define TRIE_BITMAP_BYTE(p, c) (TRIE_BITMAP(p)[(((U8)c) >> 3) & 31]) +#define TRIE_BITMAP_BYTE(p, c) (TRIE_BITMAP(p)[(((U8)(c)) >> 3) & 31]) #define TRIE_BITMAP_SET(p, c) (TRIE_BITMAP_BYTE(p, c) |= ANYOF_BIT((U8)c)) #define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c)) #define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) +#define IS_ANYOF_TRIE(op) ((op)==TRIEC || (op)==AHOCORASICKC) +#define IS_TRIE_AC(op) ((op)>=AHOCORASICK) + + +#define BITMAP_BYTE(p, c) (((U8*)(p))[(((U8)(c)) >> 3) & 31]) /* byte of the bitmap at p that holds bit c; p and c are parenthesized so the casts apply to the whole argument */ +#define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & ANYOF_BIT((U8)(c))) /* masks that byte with ANYOF_BIT of the same 8-bit value; c is expanded twice */ /* these defines assume uniquecharcount is the correct variable, and state may be evaluated twice */ #define TRIE_NODENUM(state) (((state)-1)/(trie->uniquecharcount)+1) @@ -501,15 +563,10 @@ typedef struct _reg_ac_data reg_ac_data; #define TRIE_NODEIDX(state) ((state) ?
(((state)-1)*(trie->uniquecharcount)+1) : (state)) #ifdef DEBUGGING -#define TRIE_WORDCOUNT(trie) ((trie)->wordcount) #define TRIE_CHARCOUNT(trie) ((trie)->charcount) -#define TRIE_LASTSTATE(trie) ((trie)->laststate) #define TRIE_REVCHARMAP(trie) ((trie)->revcharmap) #else -#define TRIE_WORDCOUNT(trie) (trie_wordcount) #define TRIE_CHARCOUNT(trie) (trie_charcount) -/*#define TRIE_LASTSTATE(trie) (trie_laststate)*/ -#define TRIE_LASTSTATE(trie) ((trie)->laststate) #define TRIE_REVCHARMAP(trie) (trie_revcharmap) #endif @@ -555,7 +612,6 @@ re.pm, especially to the documentation. #define RE_DEBUG_COMPILE_OPTIMISE 0x000002 #define RE_DEBUG_COMPILE_TRIE 0x000004 #define RE_DEBUG_COMPILE_DUMP 0x000008 -#define RE_DEBUG_COMPILE_OFFSETS 0x000010 /* Execute */ #define RE_DEBUG_EXECUTE_MASK 0x00FF00 @@ -567,6 +623,11 @@ re.pm, especially to the documentation. #define RE_DEBUG_EXTRA_MASK 0xFF0000 #define RE_DEBUG_EXTRA_TRIE 0x010000 #define RE_DEBUG_EXTRA_OFFSETS 0x020000 +#define RE_DEBUG_EXTRA_OFFDEBUG 0x040000 +#define RE_DEBUG_EXTRA_STATE 0x080000 +#define RE_DEBUG_EXTRA_OPTIMISE 0x100000 +/* combined: the RE_DEBUG_EXTRA_STATE bit (0x080000) plus 0x200000 */ +#define RE_DEBUG_EXTRA_STACK 0x280000 #define RE_DEBUG_FLAG(x) (re_debug_flags & x) /* Compile */ @@ -580,8 +641,6 @@ re.pm, especially to the documentation. if (re_debug_flags & RE_DEBUG_COMPILE_PARSE) x ) #define DEBUG_DUMP_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_COMPILE_DUMP) x ) -#define DEBUG_OFFSETS_r(x) DEBUG_r( \ - if (re_debug_flags & RE_DEBUG_COMPILE_OFFSETS) x ) #define DEBUG_TRIE_COMPILE_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_COMPILE_TRIE) x ) @@ -598,8 +657,17 @@ re.pm, especially to the documentation.
/* Extra */ #define DEBUG_EXTRA_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_EXTRA_MASK) x ) +#define DEBUG_OFFSETS_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_EXTRA_OFFSETS) x ) +#define DEBUG_STATE_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_EXTRA_STATE) x ) +#define DEBUG_STACK_r(x) DEBUG_r( /* runs x when any bit of RE_DEBUG_EXTRA_STACK is set */ \ + if (re_debug_flags & RE_DEBUG_EXTRA_STACK) x ) +#define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( /* runs x only when both OPTIMISE bits are set */ \ + if ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \ + (re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)) ) x ) #define MJD_OFFSET_DEBUG(x) DEBUG_r( \ - if (re_debug_flags & RE_DEBUG_EXTRA_OFFSETS) \ + if (re_debug_flags & RE_DEBUG_EXTRA_OFFDEBUG) \ Perl_warn_nocontext x ) #define DEBUG_TRIE_COMPILE_MORE_r(x) DEBUG_TRIE_COMPILE_r( \ if (re_debug_flags & RE_DEBUG_EXTRA_TRIE) x ) @@ -623,14 +691,41 @@ re.pm, especially to the documentation. }) #ifdef DEBUGGING + #define GET_RE_DEBUG_FLAGS_DECL IV re_debug_flags = 0; GET_RE_DEBUG_FLAGS; -#define RE_PV_DISPLAY_DECL(rpv,rlen,isuni,dsv,pv,l,m) \ - const char * const rpv = (isuni) ? \ - pv_uni_display(dsv, (U8*)(pv), l, m, UNI_DISPLAY_REGEX) : \ - pv_escape(dsv, pv, l, m, 0); \ + +#define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) /* declares rpv and rlen; c1/c2 index PL_colors */ \ + const char * const rpv = \ + pv_pretty((dsv), (pv), (l), (m), \ + PL_colors[(c1)],PL_colors[(c2)], \ + ((isuni) ? PERL_PV_ESCAPE_UNI : 0) ); \ const int rlen = SvCUR(dsv) -#else + +#define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) /* declares rpv from the string and length of sv */ \ + const char * const rpv = \ + pv_pretty((dsv), (SvPV_nolen_const(sv)), (SvCUR(sv)), (m), \ + PL_colors[0], PL_colors[1], /* fixed indices as in RE_PV_QUOTED_DECL: this macro has no c1/c2 parameters */ \ + ((isuni) ? PERL_PV_ESCAPE_UNI : 0) ) + +#define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m) /* declares rpv with the quote and elipses flags added */ \ + const char * const rpv = \ + pv_pretty((dsv), (pv), (l), (m), \ + PL_colors[0], PL_colors[1], \ + ( PERL_PV_PRETTY_QUOTE | PERL_PV_PRETTY_ELIPSES | \ + ((isuni) ? PERL_PV_ESCAPE_UNI : 0)) \ + ) + +#define RE_SV_DUMPLEN(ItEm) (SvCUR(ItEm) - (SvTAIL(ItEm)!=0)) /* SvCUR minus one when SvTAIL is nonzero */ +#define RE_SV_TAIL(ItEm) (SvTAIL(ItEm) ?
"$" : "") + +#else /* if not DEBUGGING */ + #define GET_RE_DEBUG_FLAGS_DECL -#define RE_PV_DISPLAY_DECL -#endif +#define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) +#define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) +#define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m) +#define RE_SV_DUMPLEN(ItEm) +#define RE_SV_TAIL(ItEm) + +#endif /* DEBUG RELATED DEFINES */