/* regcomp.h
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+ * 2000, 2001, 2002, 2003, 2005 by Larry Wall and others
+ *
+ * You may distribute under the terms of either the GNU General Public
+ * License or the Artistic License, as specified in the README file.
+ *
*/
typedef OP OP_4tree; /* Will be redefined later. */
* a literal string; for others, it is a node leading into a sub-FSM. In
* particular, the operand of a BRANCH node is the first node of the branch.
* (NB this is *not* a tree structure: the tail of the branch connects
- * to the thing following the set of BRANCHes.) The opcodes are:
+ * to the thing following the set of BRANCHes.) The opcodes are defined
+ * in regnodes.h which is generated from regcomp.sym by regcomp.pl.
*/
/*
U8 type;
U16 next_off;
U32 arg1;
- char bitmap[ANYOF_BITMAP_SIZE];
+ char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */
};
-struct regnode_charclass_class {
- U8 flags;
+struct regnode_charclass_class { /* has [[:blah:]] classes */
+ U8 flags; /* should have ANYOF_CLASS here */
U8 type;
U16 next_off;
U32 arg1;
- char bitmap[ANYOF_BITMAP_SIZE];
- char classflags[ANYOF_CLASSBITMAP_SIZE];
+ char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */
+ char classflags[ANYOF_CLASSBITMAP_SIZE]; /* and run-time */
};
/* XXX fix this description.
#define ARG_VALUE(arg) (arg)
#define ARG__SET(arg,val) ((arg) = (val))
+#undef ARG
+#undef ARG1
+#undef ARG2
+
#define ARG(p) ARG_VALUE(ARG_LOC(p))
#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
-#ifndef lint
-# define NEXT_OFF(p) ((p)->next_off)
-# define NODE_ALIGN(node)
-# define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
-#else /* lint */
-# define NEXT_OFF(p) 0
-# define NODE_ALIGN(node)
-# define NODE_ALIGN_FILL(node)
-#endif /* lint */
+#undef NEXT_OFF
+#undef NODE_ALIGN
+
+#define NEXT_OFF(p) ((p)->next_off)
+#define NODE_ALIGN(node)
+#define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
#define SIZE_ALIGN NODE_ALIGN
+#undef OP
+#undef OPERAND
+#undef MASK
+#undef STRING
+
#define OP(p) ((p)->type)
#define OPERAND(p) (((struct regnode_string *)p)->string)
#define MASK(p) ((char*)OPERAND(p))
#define STR_SZ(l) ((l + sizeof(regnode) - 1) / sizeof(regnode))
#define NODE_SZ_STR(p) (STR_SZ(STR_LEN(p))+1)
+#undef NODE_ALIGN
+#undef ARG_LOC
+#undef NEXTOPER
+#undef PREVOPER
+
#define NODE_ALIGN(node)
#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
/* Flags for node->flags of ANYOF */
-#define ANYOF_CLASS 0x08
+#define ANYOF_CLASS 0x08 /* has [[:blah:]] classes */
#define ANYOF_INVERT 0x04
#define ANYOF_FOLD 0x02
#define ANYOF_LOCALE 0x01
#define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c))
#define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c))
+#define ANYOF_BITMAP_SETALL(p) \
+ memset (ANYOF_BITMAP(p), 255, ANYOF_BITMAP_SIZE)
+#define ANYOF_BITMAP_CLEARALL(p) \
+ Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE)
+/* Check that all 256 bits are all set. Used in S_cl_is_anything() */
+#define ANYOF_BITMAP_TESTALLSET(p) \
+ memEQ (ANYOF_BITMAP(p), "\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377", ANYOF_BITMAP_SIZE)
+
#define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode))
#define ANYOF_CLASS_SKIP ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode))
#define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP)
/*
* Utility definitions.
*/
-#ifndef lint
#ifndef CHARMASK
-#define UCHARAT(p) ((int)*(U8*)(p))
+# define UCHARAT(p) ((int)*(const U8*)(p))
#else
-#define UCHARAT(p) ((int)*(p)&CHARMASK)
+# define UCHARAT(p) ((int)*(p)&CHARMASK)
#endif
-#else /* lint */
-#define UCHARAT(p) PL_regdummy
-#endif /* lint */
#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
#define REG_SEEN_LOOKBEHIND 2
#define REG_SEEN_GPOS 4
#define REG_SEEN_EVAL 8
-#define REG_SEEN_SANY 16
+#define REG_SEEN_CANY 16
+#define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */
START_EXTERN_C
EXTCONST U8 PL_varies[];
#else
EXTCONST U8 PL_varies[] = {
- BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
+ BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, 0
};
#endif
EXTCONST U8 PL_simple[];
#else
EXTCONST U8 PL_simple[] = {
- REG_ANY, SANY,
+ REG_ANY, SANY, CANY,
ANYOF,
ALNUM, ALNUML,
NALNUM, NALNUML,
I32 *scream_pos; /* Internal iterator of scream. */
} re_scream_pos_data;
+/* .what is a character array with one character for each member of .data
+ * The character describes the function of the corresponding .data item:
+ * f - start-class data for regstclass optimization
+ * n - Root of op tree for (?{EVAL}) item
+ * o - Start op for (?{EVAL}) item
+ * p - Pad for (?{EVAL} item
+ * s - swash for unicode-style character class, and the multicharacter
+ * strings resulting from casefolding the single-character entries
+ * in the character class
+ * t - trie struct
+ * 20010712 mjd@plover.com
+ * (Remember to update re_dup() and pregfree() if you add any items.)
+ */
struct reg_data {
U32 count;
U8 *what;
struct reg_substr_datum {
I32 min_offset;
I32 max_offset;
- SV *substr;
+ SV *substr; /* non-utf8 variant */
+ SV *utf8_substr; /* utf8 variant */
};
struct reg_substr_data {
};
#define anchored_substr substrs->data[0].substr
+#define anchored_utf8 substrs->data[0].utf8_substr
#define anchored_offset substrs->data[0].min_offset
#define float_substr substrs->data[1].substr
+#define float_utf8 substrs->data[1].utf8_substr
#define float_min_offset substrs->data[1].min_offset
#define float_max_offset substrs->data[1].max_offset
#define check_substr substrs->data[2].substr
+#define check_utf8 substrs->data[2].utf8_substr
#define check_offset_min substrs->data[2].min_offset
#define check_offset_max substrs->data[2].max_offset
+
+
+
+/* trie related stuff */
+/* an accepting state/position*/
+struct _reg_trie_accepted {
+ U8 *endpos;
+ U16 wordnum;
+};
+/* a transition record for the state machine. the
+ check field determines which state "owns" the
+ transition. the char the transition is for is
+ determined by offset from the owning states base
+ field. the next field determines which state
+ is to be transitioned to if any.
+*/
+struct _reg_trie_trans {
+ U32 next;
+ U32 check;
+};
+
+/* a transition list element for the list based representation */
+struct _reg_trie_trans_list_elem {
+ U16 forid;
+ U32 newstate;
+};
+typedef struct _reg_trie_trans_list_elem reg_trie_trans_le;
+
+/* a state for compressed nodes. base is an offset
+ into an array of reg_trie_trans array. If wordnum is
+ nonzero the state is accepting. if base is zero then
+ the state has no children (and will be accepting)
+*/
+struct _reg_trie_state {
+ U16 wordnum;
+ union {
+ U32 base;
+ reg_trie_trans_le* list;
+ } trans;
+};
+
+
+
+typedef struct _reg_trie_accepted reg_trie_accepted;
+typedef struct _reg_trie_state reg_trie_state;
+typedef struct _reg_trie_trans reg_trie_trans;
+
+
+/* anything in here that needs to be freed later
+should be dealt with in pregfree */
+struct _reg_trie_data {
+ U16 uniquecharcount;
+ U16 wordcount;
+ STRLEN charcount;
+ U32 laststate;
+ U32 lasttrans;
+ U16 *charmap;
+ HV *widecharmap;
+ reg_trie_state *states;
+ reg_trie_trans *trans;
+ U32 refcount;
+#ifdef DEBUGGING
+ AV *words;
+ AV *revcharmap;
+#endif
+};
+
+typedef struct _reg_trie_data reg_trie_data;
+
+/* these defines assume uniquecharcount is the correct variable, and state may be evaluated twice */
+#define TRIE_NODENUM(state) (((state)-1)/(trie->uniquecharcount)+1)
+#define SAFE_TRIE_NODENUM(state) ((state) ? (((state)-1)/(trie->uniquecharcount)+1) : (state))
+#define TRIE_NODEIDX(state) ((state) ? (((state)-1)*(trie->uniquecharcount)+1) : (state))
+
+#define DO_TRIE 1
+#define TRIE_DEBUG 1
+
+#define RE_TRIE_MAXBUF_INIT 65536
+#define RE_TRIE_MAXBUF_NAME "\022E_TRIE_MAXBUF"
+#define RE_DEBUG_FLAGS "\022E_DEBUG_FLAGS"
+
+/* If you change these be sure to update ext/re/re.pm as well */
+#define RE_DEBUG_COMPILE 1
+#define RE_DEBUG_EXECUTE 2
+#define RE_DEBUG_TRIE_COMPILE 4
+#define RE_DEBUG_TRIE_EXECUTE 8
+#define RE_DEBUG_TRIE_MORE 16
+#define RE_DEBUG_OPTIMISE 32
+#define RE_DEBUG_OFFSETS 64
+
+#define DEBUG_OPTIMISE_r(x) DEBUG_r( if (SvIV(re_debug_flags) & RE_DEBUG_OPTIMISE) x )
+#define DEBUG_EXECUTE_r(x) DEBUG_r( if (SvIV(re_debug_flags) & RE_DEBUG_EXECUTE) x )
+#define DEBUG_COMPILE_r(x) DEBUG_r( if (SvIV(re_debug_flags) & RE_DEBUG_COMPILE) x )
+#define DEBUG_OFFSETS_r(x) DEBUG_r( if (SvIV(re_debug_flags) & RE_DEBUG_OFFSETS) x )
+#define DEBUG_TRIE_r(x) DEBUG_r( \
+ if (SvIV(re_debug_flags) & RE_DEBUG_TRIE_COMPILE \
+ || SvIV(re_debug_flags) & RE_DEBUG_TRIE_EXECUTE ) \
+ x \
+)
+#define DEBUG_TRIE_EXECUTE_r(x) \
+ DEBUG_r( if (SvIV(re_debug_flags) & RE_DEBUG_TRIE_EXECUTE) x )
+
+#define DEBUG_TRIE_COMPILE_r(x) \
+ DEBUG_r( if (SvIV(re_debug_flags) & RE_DEBUG_TRIE_COMPILE) x )
+
+#define DEBUG_TRIE_EXECUTE_MORE_r(x) \
+ DEBUG_TRIE_EXECUTE_r( if (SvIV(re_debug_flags) & RE_DEBUG_TRIE_MORE) x )
+
+#define DEBUG_TRIE_COMPILE_MORE_r(x) \
+ DEBUG_TRIE_COMPILE_r( if (SvIV(re_debug_flags) & RE_DEBUG_TRIE_MORE) x )
+
+#define GET_RE_DEBUG_FLAGS DEBUG_r( \
+ re_debug_flags=get_sv(RE_DEBUG_FLAGS, 1); \
+ if (!SvIOK(re_debug_flags)) { \
+ sv_setiv(re_debug_flags, RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE | RE_DEBUG_OFFSETS); \
+ } \
+ )
+
+
+#ifdef DEBUGGING
+#define GET_RE_DEBUG_FLAGS_DECL SV *re_debug_flags = NULL; GET_RE_DEBUG_FLAGS;
+#else
+#define GET_RE_DEBUG_FLAGS_DECL
+#endif
+
+