X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.h;h=7df47d36f3995172cf8eaba1a119f7b4a4c486d2;hb=db4997f00d6b1ad267e4fec6a272e72e29719dd1;hp=b6f3617ccf19bee676ff465a407fb831f3744fec;hpb=786e8c118e1218e4c348fecf469934e080881633;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.h b/regcomp.h index b6f3617..7df47d3 100644 --- a/regcomp.h +++ b/regcomp.h @@ -11,10 +11,25 @@ typedef OP OP_4tree; /* Will be redefined later. */ +/* Convert branch sequences to more efficient trie ops? */ #define PERL_ENABLE_TRIE_OPTIMISATION 1 + +/* Be really aggressive about optimising patterns with trie sequences? */ #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 + +/* Should the optimiser take positive assertions into account? */ +#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 1 + +/* Not for production use: */ #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0 +/* Unless the next line is uncommented it is illegal to combine lazy + matching with possessive matching. Frankly it doesn't make much sense + to allow it as X*?+ matches nothing, X+?+ matches a single char only, + and X{min,max}?+ matches min times only. + */ +/* #define REG_ALLOW_MINMOD_SUSPEND */ + /* * The "internal use only" fields in regexp.h are present to pass info from * compile to execute that permits the execute phase to run lots faster on @@ -78,6 +93,51 @@ typedef OP OP_4tree; /* Will be redefined later. */ * stored negative.] */ +/* This is the stuff that used to live in regexp.h that was truly + private to the engine itself. It now lives here. 
*/ + +/* swap buffer for paren structs */ +typedef struct regexp_paren_ofs { + I32 *startp; + I32 *endp; +} regexp_paren_ofs; + + typedef struct regexp_internal { +#ifdef DEBUGGING + int name_list_idx; /* Optional data index of an array of paren names */ +#endif + + U32 *offsets; /* offset annotations 20001228 MJD + data about mapping the program to the + string*/ + regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */ + regnode *regstclass; /* Optional startclass as identified or constructed + by the optimiser */ + struct reg_data *data; /* Additional miscellaneous data used by the program. + Used to make it easier to clone and free arbitrary + data that the regops need. Often the ARG field of + a regop is an index into this structure */ + regnode program[1]; /* Unwarranted chumminess with compiler. */ +} regexp_internal; + +#define RXi_SET(x,y) (x)->pprivate = (void*)(y) +#define RXi_GET(x) ((regexp_internal *)((x)->pprivate)) +#define RXi_GET_DECL(r,ri) regexp_internal *ri = RXi_GET(r) +/* + * Flags stored in regexp->intflags + * These are used only internally to the regexp engine + * + * See regexp.h for flags used externally to the regexp engine + */ +#define PREGf_SKIP 0x00000001 +#define PREGf_IMPLICIT 0x00000002 /* Converted .* to ^.* */ +#define PREGf_NAUGHTY 0x00000004 /* how exponential is this pattern? 
*/ +#define PREGf_VERBARG_SEEN 0x00000008 +#define PREGf_CUTGROUP_SEEN 0x00000010 + + +/* this is where the old regcomp.h started */ + struct regnode_string { U8 str_len; U8 type; @@ -85,6 +145,8 @@ struct regnode_string { char string[1]; }; +/* Argument bearing node - workhorse, + arg1 is often for the data field */ struct regnode_1 { U8 flags; U8 type; @@ -92,6 +154,16 @@ struct regnode_1 { U32 arg1; }; +/* Similar to a regnode_1 but with an extra signed argument */ +struct regnode_2L { + U8 flags; + U8 type; + U16 next_off; + U32 arg1; + I32 arg2; +}; + +/* 'Two field' -- Two 16 bit unsigned args */ struct regnode_2 { U8 flags; U8 type; @@ -100,6 +172,7 @@ struct regnode_2 { U16 arg2; }; + #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ #define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ @@ -153,10 +226,12 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ARG(p) ARG_VALUE(ARG_LOC(p)) #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) +#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p)) #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) +#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val)) #undef NEXT_OFF #undef NODE_ALIGN @@ -189,7 +264,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) - +#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2) #define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ #define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) @@ -327,6 +402,10 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define REG_SEEN_EVAL 0x00000008 #define REG_SEEN_CANY 0x00000010 #define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */ +#define REG_SEEN_RECURSE 0x00000020 +#define 
REG_TOP_LEVEL_BRANCHES 0x00000040 +#define REG_SEEN_VERBARG 0x00000080 +#define REG_SEEN_CUTGROUP 0x00000100 START_EXTERN_C @@ -342,7 +421,9 @@ EXTCONST U8 PL_varies[]; #else EXTCONST U8 PL_varies[] = { BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, - WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, 0 + WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, + NREF, NREFF, NREFFL, + 0 }; #endif @@ -363,13 +444,26 @@ EXTCONST U8 PL_simple[] = { }; #endif +#ifndef PLUGGABLE_RE_EXTENSION +#ifndef DOINIT +EXTCONST regexp_engine PL_core_reg_engine; +#else /* DOINIT */ +EXTCONST regexp_engine PL_core_reg_engine = { + Perl_re_compile, + Perl_regexec_flags, + Perl_re_intuit_start, + Perl_re_intuit_string, + Perl_regfree_internal, +#if defined(USE_ITHREADS) + Perl_regdupe_internal +#endif +}; +#endif /* DOINIT */ +#endif /* PLUGGABLE_RE_EXTENSION */ + + END_EXTERN_C -typedef struct re_scream_pos_data_s -{ - char **scream_olds; /* match pos */ - I32 *scream_pos; /* Internal iterator of scream. */ -} re_scream_pos_data; /* .what is a character array with one character for each member of .data * The character describes the function of the corresponding .data item: @@ -381,7 +475,10 @@ typedef struct re_scream_pos_data_s * strings resulting from casefolding the single-character entries * in the character class * t - trie struct + * u - trie struct's widecharmap (a HV, so can't share, must dup) + * also used for revcharmap and words under DEBUGGING * T - aho-trie struct + * S - sv for named capture lookup * 20010712 mjd@plover.com * (Remember to update re_dup() and pregfree() if you add any items.) 
*/ @@ -391,28 +488,24 @@ struct reg_data { void* data[1]; }; -struct reg_substr_datum { - I32 min_offset; - I32 max_offset; - SV *substr; /* non-utf8 variant */ - SV *utf8_substr; /* utf8 variant */ -}; - -struct reg_substr_data { - struct reg_substr_datum data[3]; /* Actual array */ -}; - +/* Code in S_to_utf8_substr() and S_to_byte_substr() in regexec.c accesses + anchored* and float* via array indexes 0 and 1. */ #define anchored_substr substrs->data[0].substr #define anchored_utf8 substrs->data[0].utf8_substr #define anchored_offset substrs->data[0].min_offset +#define anchored_end_shift substrs->data[0].end_shift + #define float_substr substrs->data[1].substr #define float_utf8 substrs->data[1].utf8_substr #define float_min_offset substrs->data[1].min_offset #define float_max_offset substrs->data[1].max_offset +#define float_end_shift substrs->data[1].end_shift + #define check_substr substrs->data[2].substr #define check_utf8 substrs->data[2].utf8_substr #define check_offset_min substrs->data[2].min_offset #define check_offset_max substrs->data[2].max_offset +#define check_end_shift substrs->data[2].end_shift @@ -457,16 +550,17 @@ typedef struct _reg_trie_trans reg_trie_trans; /* anything in here that needs to be freed later - should be dealt with in pregfree */ + should be dealt with in pregfree. + refcount is first in both this and _reg_ac_data to allow a space + optimisation in Perl_regdupe. 
*/ struct _reg_trie_data { + U32 refcount; /* number of times this trie is referenced */ U16 uniquecharcount; /* unique chars in trie (width of trans table) */ U32 lasttrans; /* last valid transition element */ U16 *charmap; /* byte to charid lookup array */ - HV *widecharmap; /* code points > 255 to charid */ reg_trie_state *states; /* state data */ reg_trie_trans *trans; /* array of transition elements */ char *bitmap; /* stclass bitmap */ - U32 refcount; /* number of times this trie is referenced */ U32 startstate; /* initial state - used for common prefix optimisation */ STRLEN minlen; /* minimum length of words in trie - build/opt only? */ STRLEN maxlen; /* maximum length of words in trie - build/opt only? */ @@ -475,21 +569,34 @@ struct _reg_trie_data { for the node following a given word. */ U16 *nextword; /* optional 1 indexed array to support linked list of duplicate wordnums */ - U32 laststate; /* Build only */ + U32 statecount; /* Build only - number of states in the states array + (including the unused zero state) */ U32 wordcount; /* Build only */ #ifdef DEBUGGING STRLEN charcount; /* Build only */ - AV *words; /* Array of words contained in trie, for dumping */ - AV *revcharmap; /* Map of each charid back to its character representation */ #endif }; +/* There is one pointer (three under DEBUGGING) that logically belongs in this + structure, but is held outside as it needs duplication on thread cloning, + whereas the rest of the structure can be read only: + HV *widecharmap; code points > 255 to charid +#ifdef DEBUGGING + AV *words; Array of words contained in trie, for dumping + AV *revcharmap; Map of each charid back to its character representation +#endif +*/ + +#define TRIE_WORDS_OFFSET 2 + typedef struct _reg_trie_data reg_trie_data; +/* refcount is first in both this and _reg_trie_data to allow a space + optimisation in Perl_regdupe. 
*/ struct _reg_ac_data { + U32 refcount; U32 *fail; reg_trie_state *states; - reg_trie_data *trie; - U32 refcount; + U32 trie; }; typedef struct _reg_ac_data reg_ac_data; @@ -503,6 +610,10 @@ typedef struct _reg_ac_data reg_ac_data; #define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c)) #define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) +#define IS_ANYOF_TRIE(op) ((op)==TRIEC || (op)==AHOCORASICKC) +#define IS_TRIE_AC(op) ((op)>=AHOCORASICK) + + #define BITMAP_BYTE(p, c) (((U8*)p)[(((U8)(c)) >> 3) & 31]) #define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) @@ -513,10 +624,8 @@ typedef struct _reg_ac_data reg_ac_data; #ifdef DEBUGGING #define TRIE_CHARCOUNT(trie) ((trie)->charcount) -#define TRIE_REVCHARMAP(trie) ((trie)->revcharmap) #else #define TRIE_CHARCOUNT(trie) (trie_charcount) -#define TRIE_REVCHARMAP(trie) (trie_revcharmap) #endif #define RE_TRIE_MAXBUF_INIT 65536 @@ -561,7 +670,6 @@ re.pm, especially to the documentation. #define RE_DEBUG_COMPILE_OPTIMISE 0x000002 #define RE_DEBUG_COMPILE_TRIE 0x000004 #define RE_DEBUG_COMPILE_DUMP 0x000008 -#define RE_DEBUG_COMPILE_OFFSETS 0x000010 /* Execute */ #define RE_DEBUG_EXECUTE_MASK 0x00FF00 @@ -573,7 +681,11 @@ re.pm, especially to the documentation. #define RE_DEBUG_EXTRA_MASK 0xFF0000 #define RE_DEBUG_EXTRA_TRIE 0x010000 #define RE_DEBUG_EXTRA_OFFSETS 0x020000 -#define RE_DEBUG_EXTRA_STATE 0x040000 +#define RE_DEBUG_EXTRA_OFFDEBUG 0x040000 +#define RE_DEBUG_EXTRA_STATE 0x080000 +#define RE_DEBUG_EXTRA_OPTIMISE 0x100000 +/* combined */ +#define RE_DEBUG_EXTRA_STACK 0x280000 #define RE_DEBUG_FLAG(x) (re_debug_flags & x) /* Compile */ @@ -587,8 +699,6 @@ re.pm, especially to the documentation. 
if (re_debug_flags & RE_DEBUG_COMPILE_PARSE) x ) #define DEBUG_DUMP_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_COMPILE_DUMP) x ) -#define DEBUG_OFFSETS_r(x) DEBUG_r( \ - if (re_debug_flags & RE_DEBUG_COMPILE_OFFSETS) x ) #define DEBUG_TRIE_COMPILE_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_COMPILE_TRIE) x ) @@ -605,10 +715,17 @@ re.pm, especially to the documentation. /* Extra */ #define DEBUG_EXTRA_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_EXTRA_MASK) x ) +#define DEBUG_OFFSETS_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_EXTRA_OFFSETS) x ) #define DEBUG_STATE_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_EXTRA_STATE) x ) +#define DEBUG_STACK_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_EXTRA_STACK) x ) +#define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \ + if ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \ + (re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)) ) x ) #define MJD_OFFSET_DEBUG(x) DEBUG_r( \ - if (re_debug_flags & RE_DEBUG_EXTRA_OFFSETS) \ + if (re_debug_flags & RE_DEBUG_EXTRA_OFFDEBUG) \ Perl_warn_nocontext x ) #define DEBUG_TRIE_COMPILE_MORE_r(x) DEBUG_TRIE_COMPILE_r( \ if (re_debug_flags & RE_DEBUG_EXTRA_TRIE) x ) @@ -670,3 +787,4 @@ re.pm, especially to the documentation. #endif /* DEBUG RELATED DEFINES */ +