/* regexp.h
*
* Copyright (C) 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2003,
- * by Larry Wall and others
+ * 2005, 2006 by Larry Wall and others
*
* You may distribute under the terms of either the GNU General Public
* License or the Artistic License, as specified in the README file.
* not the System V one.
*/
#ifndef PLUGGABLE_RE_EXTENSION
-/* we don't want to include this stuff if we are inside Nicholas'
- * pluggable regex engine code */
+/* we don't want to include this stuff if we are inside of
+ an external regex engine based on the core one - like re 'debug'*/
struct regnode {
U8 flags;
struct reg_data;
struct regexp_engine;
-
-typedef struct regexp_paren_ofs {
- I32 *startp;
- I32 *endp;
-} regexp_paren_ofs;
+struct regexp;
+
+struct reg_substr_datum { /* one substring record: offset bounds plus both encodings of the string */
+ I32 min_offset; /* NOTE(review): presumably the lowest offset at which the substring may occur - confirm */
+ I32 max_offset; /* NOTE(review): presumably the highest such offset - confirm */
+ SV *substr; /* non-utf8 variant */
+ SV *utf8_substr; /* utf8 variant */
+ I32 end_shift; /* NOTE(review): meaning not evident from this header - TODO document */
+};
+struct reg_substr_data { /* fixed-size group of substring records */
+ struct reg_substr_datum data[3]; /* Actual array: three reg_substr_datum entries */
+};
#ifdef PERL_OLD_COPY_ON_WRITE
#define SV_SAVED_COPY SV *saved_copy; /* If non-NULL, SV which is COW from original */
#else
#define SV_SAVED_COPY
#endif
+
+/* swap buffer for paren structs */
+typedef struct regexp_paren_ofs { /* pair of offset arrays, named like regexp.startp / regexp.endp */
+ I32 *startp; /* swap counterpart of regexp.startp */
+ I32 *endp; /* swap counterpart of regexp.endp */
+} regexp_paren_ofs;
+
/* this is ordered such that the most commonly used
fields are at the start of the struct */
typedef struct regexp {
/* what engine created this regexp? */
const struct regexp_engine* engine;
+ struct regexp* mother_re; /* what re is this a lightweight copy of? */
/* Information about the match that the perl core uses to manage things */
U32 extflags; /* Flags used both externally and internally */
/* Data about the last/current match. These are modified during matching*/
U32 lastparen; /* last open paren matched */
U32 lastcloseparen; /* last close paren matched */
+ regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */
I32 *startp; /* Array of offsets from start of string (@-) */
I32 *endp; /* Array of offsets from start of string (@+) */
+
char *subbeg; /* saved or original string
so \digit works forever. */
I32 sublen; /* Length of string pointed by subbeg */
/* Information about the match that isn't often used */
char *precomp; /* pre-compilation regular expression */
I32 prelen; /* length of precomp */
+ char *wrapped; /* wrapped version of the pattern */
+ I32 wraplen; /* length of wrapped */
I32 seen_evals; /* number of eval groups in the pattern - for security checks */
HV *paren_names; /* Optional hash of paren names */
I32 refcnt; /* Refcount of this regexp */
} regexp;
-
-typedef struct regexp_internal {
- regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */
- U32 *offsets; /* offset annotations 20001228 MJD
- data about mapping the program to the
- string*/
- regnode *regstclass; /* Optional startclass as identified or constructed
- by the optimiser */
- struct reg_data *data; /* Additional miscellaneous data used by the program.
- Used to make it easier to clone and free arbitrary
- data that the regops need. Often the ARG field of
- a regop is an index into this structure */
- regnode program[1]; /* Unwarranted chumminess with compiler. */
-} regexp_internal;
-
-#define RXi_SET(x,y) (x)->pprivate = (void*)(y)
-#define RXi_GET(x) ((regexp_internal *)((x)->pprivate))
-#define RXi_GET_DECL(r,ri) regexp_internal *ri = RXi_GET(r)
-
+/* used for high speed searches */
typedef struct re_scream_pos_data_s
{
char **scream_olds; /* match pos */
I32 *scream_pos; /* Internal iterator of scream. */
} re_scream_pos_data;
+/* regexp_engine structure. This is the dispatch table for regexes.
+ * Any regex engine implementation must be able to build one of these.
+ */
typedef struct regexp_engine {
regexp* (*comp) (pTHX_ char* exp, char* xend, PMOP* pm);
I32 (*exec) (pTHX_ regexp* prog, char* stringarg, char* strend,
struct re_scream_pos_data_s *data);
SV* (*checkstr) (pTHX_ regexp *prog);
void (*free) (pTHX_ struct regexp* r);
+ SV* (*numbered_buff_get) (pTHX_ const REGEXP * const rx, I32 paren, SV* usesv); /* NOTE(review): presumably fetches a capture buffer by paren number - confirm */
+ SV* (*named_buff_get)(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags); /* NOTE(review): presumably fetches a capture buffer by name - confirm */
#ifdef USE_ITHREADS
void* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param);
#endif
} regexp_engine;
-/*
- * Flags stored in regexp->intflags
- * These are used only internally to the regexp engine
- */
-#define PREGf_SKIP 0x00000001
-#define PREGf_IMPLICIT 0x00000002 /* Converted .* to ^.* */
-#define PREGf_NAUGHTY 0x00000004 /* how exponential is this pattern? */
-#define PREGf_VERBARG_SEEN 0x00000008
-#define PREGf_CUTGROUP_SEEN 0x00000010
-
-
/* Flags stored in regexp->extflags
* These are used by code external to the regexp engine
+ *
+ * Note that flags starting with RXf_PMf_ have exact equivalents
+ * stored in op_pmflags, which are defined in op.h; they are defined
+ * numerically here only for clarity.
*/
/* Anchor and GPOS related stuff */
#define RXf_ANCH_GPOS 0x00000008
#define RXf_GPOS_SEEN 0x00000010
#define RXf_GPOS_FLOAT 0x00000020
-/* five bits here */
+/* two bits here */
#define RXf_ANCH (RXf_ANCH_BOL|RXf_ANCH_MBOL|RXf_ANCH_GPOS|RXf_ANCH_SBOL)
#define RXf_GPOS_CHECK (RXf_GPOS_SEEN|RXf_ANCH_GPOS)
-#define RXf_ANCH_SINGLE (RXf_ANCH_SBOL|RXf_ANCH_GPOS)
-/*
- * 0xF800 of extflags is used by PMf_COMPILETIME
- * These are the regex equivelent of the PMf_xyz stuff defined
- * in op.h
+#define RXf_ANCH_SINGLE (RXf_ANCH_SBOL|RXf_ANCH_GPOS)
+
+/* Flags indicating special patterns */
+#define RXf_START_ONLY 0x00000200 /* Pattern is /^/ */
+#define RXf_WHITE 0x00000400 /* Pattern is /\s+/ */
+
+/* 0x1F800 of extflags is used by (RXf_)PMf_COMPILETIME */
+#define RXf_PMf_LOCALE 0x00000800 /* use locale */
+#define RXf_PMf_MULTILINE 0x00001000 /* /m */
+#define RXf_PMf_SINGLELINE 0x00002000 /* /s */
+#define RXf_PMf_FOLD 0x00004000 /* /i */
+#define RXf_PMf_EXTENDED 0x00008000 /* /x */
+#define RXf_PMf_KEEPCOPY 0x00010000 /* /p (KEEPCOPY_PAT_MOD); originally /k */
+/* these flags are transferred from the PMOP->op_pmflags member during compilation */
+#define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED) /* the /m /s /i /x bits */
+#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY) /* all six RXf_PMf_ bits: 0x1F800 */
+
+#define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl) /* case labels that OR the matching RXf_PMf_ bit into *(pmfl) */ \
+ case IGNORE_PAT_MOD: *(pmfl) |= RXf_PMf_FOLD; break; \
+ case MULTILINE_PAT_MOD: *(pmfl) |= RXf_PMf_MULTILINE; break; \
+ case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \
+ case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; break /* expansion ends without ';' so the use site must add one */
+
+/* chars and strings used as regex pattern modifiers
+ * Singular is a 'c'har, plural is a "string"
+ *
+ * NOTE, KEEPCOPY was originally 'k', but was changed to 'p' for preserve
+ * for compatibility reasons with Regexp::Common which hijacked (?k:...)
+ * for its own uses. So 'k' is out as well.
*/
-#define RXf_PMf_LOCALE 0x00000800
-#define RXf_PMf_MULTILINE 0x00001000
-#define RXf_PMf_SINGLELINE 0x00002000
-#define RXf_PMf_FOLD 0x00004000
-#define RXf_PMf_EXTENDED 0x00008000
-#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED)
+#define EXEC_PAT_MOD 'e'
+#define KEEPCOPY_PAT_MOD 'p'
+#define ONCE_PAT_MOD 'o'
+#define GLOBAL_PAT_MOD 'g'
+#define CONTINUE_PAT_MOD 'c'
+#define MULTILINE_PAT_MOD 'm'
+#define SINGLE_PAT_MOD 's'
+#define IGNORE_PAT_MOD 'i'
+#define XTENDED_PAT_MOD 'x'
+
+#define ONCE_PAT_MODS "o"
+#define KEEPCOPY_PAT_MODS "p"
+#define EXEC_PAT_MODS "e"
+#define LOOP_PAT_MODS "gc"
+
+#define STD_PAT_MODS "msix"
+
+#define INT_PAT_MODS STD_PAT_MODS KEEPCOPY_PAT_MODS /* concatenates to "msixp" */
+
+#define EXT_PAT_MODS ONCE_PAT_MODS KEEPCOPY_PAT_MODS /* concatenates to "op" */
+#define QR_PAT_MODS STD_PAT_MODS EXT_PAT_MODS /* concatenates to "msixop" */
+#define M_PAT_MODS QR_PAT_MODS LOOP_PAT_MODS /* concatenates to "msixopgc" */
+#define S_PAT_MODS M_PAT_MODS EXEC_PAT_MODS /* concatenates to "msixopgce" */
+
/* What we have seen */
-/* one bit here */
#define RXf_LOOKBEHIND_SEEN 0x00020000
#define RXf_EVAL_SEEN 0x00040000
#define RXf_CANY_SEEN 0x00080000
#define RXf_TAINTED_SEEN 0x20000000
/* two bits here */
-
#define RX_HAS_CUTGROUP(prog) ((prog)->intflags & PREGf_CUTGROUP_SEEN)
#define RX_MATCH_TAINTED(prog) ((prog)->extflags & RXf_TAINTED_SEEN)
#define RX_MATCH_TAINTED_on(prog) ((prog)->extflags |= RXf_TAINTED_SEEN)
/* structures for holding and saving the state maintained by regmatch() */
-#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 50
+#ifndef MAX_RECURSE_EVAL_NOCHANGE_DEPTH /* keep any value defined before this point */
+#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 1000 /* default when none was supplied */
+#endif
typedef I32 CHECKPOINT;
struct regmatch_state *prev_yes_state;
} yes;
- struct {
+ /* branchlike members */
+ /* this is a fake union member that matches the first elements
+ * of each member that needs to behave like a branch */
+ struct {
/* this first element must match u.yes */
struct regmatch_state *prev_yes_state;
- reg_trie_accepted *accept_buff;
+ U32 lastparen;
+ CHECKPOINT cp;
+
+ } branchlike;
+
+ struct {
+ /* the first elements must match u.branchlike */
+ struct regmatch_state *prev_yes_state;
+ U32 lastparen;
+ CHECKPOINT cp;
+
+ regnode *next_branch; /* next branch node */
+ } branch;
+
+ struct {
+ /* the first elements must match u.branchlike */
+ struct regmatch_state *prev_yes_state;
+ U32 lastparen;
+ CHECKPOINT cp;
+
+ reg_trie_accepted *accept_buff; /* accepting states we have seen */
U32 accepted; /* how many accepting states we have seen */
U16 *jump; /* positive offsets from me */
regnode *B; /* node following the trie */
regnode *me; /* Which node am I - needed for jump tries*/
} trie;
+ /* special types - these members are used to store state for special
+ regops like eval, if/then, lookaround and the markpoint state */
struct {
/* this first element must match u.yes */
struct regmatch_state *prev_yes_state;
struct {
/* this first element must match u.yes */
struct regmatch_state *prev_yes_state;
+ I32 wanted;
+ I32 logical; /* saved copy of 'logical' var */
+ regnode *me; /* the IFMATCH/SUSPEND/UNLESSM node */
+ } ifmatch; /* and SUSPEND/UNLESSM */
+
+ struct {
+ /* this first element must match u.yes */
+ struct regmatch_state *prev_yes_state;
+ struct regmatch_state *prev_mark;
+ SV* mark_name;
+ char *mark_loc;
+ } mark;
+
+ struct {
+ int val;
+ } keeper;
+
+ /* quantifiers - these members are used for storing state
+ for the regops used to implement quantifiers */
+ struct {
+ /* this first element must match u.yes */
+ struct regmatch_state *prev_yes_state;
struct regmatch_state *prev_curlyx; /* previous cur_curlyx */
CHECKPOINT cp; /* remember current savestack index */
bool minmod;
struct {
/* this first element must match u.yes */
struct regmatch_state *prev_yes_state;
- U32 lastparen;
- regnode *next_branch; /* next branch node */
- CHECKPOINT cp;
- } branch;
-
- struct {
- /* this first element must match u.yes */
- struct regmatch_state *prev_yes_state;
I32 c1, c2; /* case fold search */
CHECKPOINT cp;
I32 alen; /* length of first-matched A string */
regnode *A, *B; /* the nodes corresponding to /A*B/ */
} curly; /* and CURLYN/PLUS/STAR */
- struct {
- /* this first element must match u.yes */
- struct regmatch_state *prev_yes_state;
- I32 wanted;
- I32 logical; /* saved copy of 'logical' var */
- regnode *me; /* the IFMATCH/SUSPEND/UNLESSM node */
- } ifmatch; /* and SUSPEND/UNLESSM */
-
- struct {
- /* this first element must match u.yes */
- struct regmatch_state *prev_yes_state;
- struct regmatch_state *prev_mark;
- SV* mark_name;
- char *mark_loc;
- } mark;
} u;
} regmatch_state;
#define SAVESTACK_ALLOC_FOR_RE_SAVE_STATE \
(1 + ((sizeof(struct re_save_state) - 1) / sizeof(*PL_savestack)))
+
/*
* Local variables:
* c-indentation-style: bsd