X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=regcomp.h;h=1ef9d2daeeeb5f0e321b539895ef5e4737c3120c;hb=62b8d8ab959abb036445775bc06610eda92ce142;hp=d3c75f05856d259d31338a2e3fee99243c450eb2;hpb=07bc277f32c1d7aff237dd3f55d558b5d4b93314;p=p5sagit%2Fp5-mst-13.2.git diff --git a/regcomp.h b/regcomp.h index d3c75f0..1ef9d2d 100644 --- a/regcomp.h +++ b/regcomp.h @@ -18,6 +18,25 @@ typedef OP OP_4tree; /* Will be redefined later. */ /* Be really agressive about optimising patterns with trie sequences? */ #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 +/* Use old style unicode mappings for perl and posix character classes + * + * NOTE: Enabling this essentially breaks character class matching against unicode + * strings, so that POSIX char classes match when they shouldn't, and \d matches + * way more than 10 characters, and sometimes a charclass and its complement either + * both match or neither match. + * NOTE: Disabling this will cause various backwards compatibility issues to rear + * their head, and tests to fail. However it will make the charclass behaviour + * consistent regardless of internal string type, and make character class inversions + * consistent. The tests that fail in the regex engine are basically broken tests. + * + * Personally I think 5.12 should disable this for sure. It's a bit more debatable for + * 5.10, so for now I'm leaving it enabled. + * XXX: It is now enabled for 5.11/5.12 + * + * -demerphq + */ +#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1 + /* Should the optimiser take positive assertions into account? 
*/ #define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0 @@ -178,7 +197,7 @@ struct regnode_2 { #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ -#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 40 (8*5) named classes */ +#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ /* also used by trie */ struct regnode_charclass { @@ -317,9 +336,9 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ANYOF_NALNUM 1 #define ANYOF_SPACE 2 /* \s */ #define ANYOF_NSPACE 3 -#define ANYOF_DIGIT 4 +#define ANYOF_DIGIT 4 /* \d */ #define ANYOF_NDIGIT 5 -#define ANYOF_ALNUMC 6 /* isalnum(3), utf8::IsAlnum, ALNUMC */ +#define ANYOF_ALNUMC 6 /* [[:alnum:]] isalnum(3), utf8::IsAlnum, ALNUMC */ #define ANYOF_NALNUMC 7 #define ANYOF_ALPHA 8 #define ANYOF_NALPHA 9 @@ -428,37 +447,6 @@ START_EXTERN_C #include "regnodes.h" #endif -/* The following have no fixed length. U8 so we can do strchr() on it. */ -#ifndef DOINIT -EXTCONST U8 PL_varies[]; -#else -EXTCONST U8 PL_varies[] = { - BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, - WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, - NREF, NREFF, NREFFL, - 0 -}; -#endif - -/* The following always have a length of 1. U8 we can do strchr() on it. */ -/* (Note that length 1 means "one character" under UTF8, not "one octet".) 
*/ -#ifndef DOINIT -EXTCONST U8 PL_simple[]; -#else -EXTCONST U8 PL_simple[] = { - REG_ANY, SANY, CANY, - ANYOF, - ALNUM, ALNUML, - NALNUM, NALNUML, - SPACE, SPACEL, - NSPACE, NSPACEL, - DIGIT, NDIGIT, - VERTWS, NVERTWS, - HORIZWS, NHORIZWS, - 0 -}; -#endif - #ifndef PLUGGABLE_RE_EXTENSION #ifndef DOINIT EXTCONST regexp_engine PL_core_reg_engine; @@ -488,6 +476,7 @@ END_EXTERN_C /* .what is a character array with one character for each member of .data * The character describes the function of the corresponding .data item: + * a - AV for paren_name_list under DEBUGGING * f - start-class data for regstclass optimization * n - Root of op tree for (?{EVAL}) item * o - Start op for (?{EVAL}) item @@ -528,10 +517,10 @@ struct reg_data { #define check_offset_max substrs->data[2].max_offset #define check_end_shift substrs->data[2].end_shift -#define RX_ANCHORED_SUBSTR(rx) ((rx)->anchored_substr) -#define RX_ANCHORED_UTF8(rx) ((rx)->anchored_utf8) -#define RX_FLOAT_SUBSTR(rx) ((rx)->float_substr) -#define RX_FLOAT_UTF8(rx) ((rx)->float_utf8) +#define RX_ANCHORED_SUBSTR(rx) (((struct regexp *)SvANY(rx))->anchored_substr) +#define RX_ANCHORED_UTF8(rx) (((struct regexp *)SvANY(rx))->anchored_utf8) +#define RX_FLOAT_SUBSTR(rx) (((struct regexp *)SvANY(rx))->float_substr) +#define RX_FLOAT_UTF8(rx) (((struct regexp *)SvANY(rx))->float_utf8) /* trie related stuff */ @@ -567,6 +556,15 @@ struct _reg_trie_state { } trans; }; +/* info per word; indexed by wordnum */ +typedef struct { + U16 prev; /* previous word in acceptance chain; eg in + * zzz|abc|ab/ after matching the chars abc, the + * accepted word is #2, and the previous accepted + * word is #3 */ + U32 len; /* how many chars long is this word? 
*/ + U32 accept; /* accept state for this word */ +} reg_trie_wordinfo; typedef struct _reg_trie_state reg_trie_state; @@ -584,15 +582,14 @@ struct _reg_trie_data { reg_trie_state *states; /* state data */ reg_trie_trans *trans; /* array of transition elements */ char *bitmap; /* stclass bitmap */ - U32 *wordlen; /* array of lengths of words */ U16 *jump; /* optional 1 indexed array of offsets before tail for the node following a given word. */ - U16 *nextword; /* optional 1 indexed array to support linked list - of duplicate wordnums */ + reg_trie_wordinfo *wordinfo; /* array of info per word */ U16 uniquecharcount; /* unique chars in trie (width of trans table) */ U32 startstate; /* initial state - used for common prefix optimisation */ STRLEN minlen; /* minimum length of words in trie - build/opt only? */ STRLEN maxlen; /* maximum length of words in trie - build/opt only? */ + U32 prefixlen; /* #chars in common prefix */ U32 statecount; /* Build only - number of states in the states array (including the unused zero state) */ U32 wordcount; /* Build only */ @@ -710,6 +707,7 @@ re.pm, especially to the documentation. #define RE_DEBUG_EXTRA_STATE 0x080000 #define RE_DEBUG_EXTRA_OPTIMISE 0x100000 #define RE_DEBUG_EXTRA_BUFFERS 0x400000 +#define RE_DEBUG_EXTRA_GPOS 0x800000 /* combined */ #define RE_DEBUG_EXTRA_STACK 0x280000 @@ -765,6 +763,8 @@ re.pm, especially to the documentation. #define DEBUG_TRIE_r(x) DEBUG_r( \ if (re_debug_flags & (RE_DEBUG_COMPILE_TRIE \ | RE_DEBUG_EXECUTE_TRIE )) x ) +#define DEBUG_GPOS_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_EXTRA_GPOS) x ) /* initialization */ /* get_sv() can return NULL during global destruction. */