/* regcomp.h
*
* Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
- * 2000, 2001, 2002, 2003, 2005, 2006 by Larry Wall and others
+ * 2000, 2001, 2002, 2003, 2005, 2006, 2007, by Larry Wall and others
*
* You may distribute under the terms of either the GNU General Public
* License or the Artistic License, as specified in the README file.
*
*/
+#include "regcharclass.h"
typedef OP OP_4tree; /* Will be redefined later. */
/* Be really aggressive about optimising patterns with trie sequences? */
#define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1
+/* Use old style unicode mappings for perl and posix character classes
+ *
+ * NOTE: Enabling this essentially breaks character class matching against unicode
+ * strings, so that POSIX char classes match when they shouldn't, and \d matches
+ * way more than 10 characters, and sometimes a charclass and its complement either
+ * both match or neither match.
+ * NOTE: Disabling this will cause various backwards compatibility issues to rear
+ * their head, and tests to fail. However it will make the charclass behaviour
+ * consistent regardless of internal string type, and make character class inversions
+ * consistent. The tests that fail in the regex engine are basically broken tests.
+ *
+ * Personally I think 5.12 should disable this for sure. It's a bit more debatable for
+ * 5.10, so for now I'm leaving it enabled.
+ * XXX: The legacy mappings are now disabled (the define below is 0) for 5.11/5.12
+ *
+ * -demerphq
+ */
+#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 0
+
/* Should the optimiser take positive assertions into account? */
#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0
/* This is the stuff that used to live in regexp.h that was truly
private to the engine itself. It now lives here. */
-/* swap buffer for paren structs */
-typedef struct regexp_paren_ofs {
- I32 *startp;
- I32 *endp;
-} regexp_paren_ofs;
+
typedef struct regexp_internal {
int name_list_idx; /* Optional data index of an array of paren names */
U32 proglen;
} u;
- regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */
regnode *regstclass; /* Optional startclass as identified or constructed
by the optimiser */
struct reg_data *data; /* Additional miscellaneous data used by the program.
#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */
-#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
+#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
/* also used by trie */
struct regnode_charclass {
#define ANYOF_NALNUM 1
#define ANYOF_SPACE 2 /* \s */
#define ANYOF_NSPACE 3
-#define ANYOF_DIGIT 4
+#define ANYOF_DIGIT 4 /* \d */
#define ANYOF_NDIGIT 5
-#define ANYOF_ALNUMC 6 /* isalnum(3), utf8::IsAlnum, ALNUMC */
+#define ANYOF_ALNUMC 6 /* [[:alnum:]] isalnum(3), utf8::IsAlnum, ALNUMC */
#define ANYOF_NALNUMC 7
#define ANYOF_ALPHA 8
#define ANYOF_NALPHA 9
#define ANYOF_MAX 32
+/* pseudo classes, not stored in the class bitmap, but used as flags
+ during compilation of char classes */
+
+#define ANYOF_VERTWS (ANYOF_MAX+1)
+#define ANYOF_NVERTWS (ANYOF_MAX+2)
+#define ANYOF_HORIZWS (ANYOF_MAX+3)
+#define ANYOF_NHORIZWS (ANYOF_MAX+4)
+
/* Backward source code compatibility. */
#define ANYOF_ALNUML ANYOF_ALNUM
#define REG_TOP_LEVEL_BRANCHES 0x00000040
#define REG_SEEN_VERBARG 0x00000080
#define REG_SEEN_CUTGROUP 0x00000100
+#define REG_SEEN_RUN_ON_COMMENT 0x00000200
START_EXTERN_C
SPACE, SPACEL,
NSPACE, NSPACEL,
DIGIT, NDIGIT,
+ VERTWS, NVERTWS,
+ HORIZWS, NHORIZWS,
0
};
#endif
EXTCONST regexp_engine PL_core_reg_engine;
#else /* DOINIT */
EXTCONST regexp_engine PL_core_reg_engine = {
- Perl_re_compile,
- Perl_regexec_flags,
+ Perl_re_compile,
+ Perl_regexec_flags,
Perl_re_intuit_start,
Perl_re_intuit_string,
- Perl_regfree_internal,
+ Perl_regfree_internal,
+ Perl_reg_numbered_buff_fetch,
+ Perl_reg_numbered_buff_store,
+ Perl_reg_numbered_buff_length,
+ Perl_reg_named_buff,
+ Perl_reg_named_buff_iter,
+ Perl_reg_qr_package,
#if defined(USE_ITHREADS)
Perl_regdupe_internal
#endif
* n - Root of op tree for (?{EVAL}) item
* o - Start op for (?{EVAL}) item
* p - Pad for (?{EVAL}) item
- * s - swash for unicode-style character class, and the multicharacter
+ * s - swash for Unicode-style character class, and the multicharacter
* strings resulting from casefolding the single-character entries
* in the character class
* t - trie struct
#define check_offset_max substrs->data[2].max_offset
#define check_end_shift substrs->data[2].end_shift
-
+#define RX_ANCHORED_SUBSTR(rx) (((struct regexp *)SvANY(rx))->anchored_substr)
+#define RX_ANCHORED_UTF8(rx) (((struct regexp *)SvANY(rx))->anchored_utf8)
+#define RX_FLOAT_SUBSTR(rx) (((struct regexp *)SvANY(rx))->float_substr)
+#define RX_FLOAT_UTF8(rx) (((struct regexp *)SvANY(rx))->float_utf8)
/* trie related stuff */
optimisation in Perl_regdupe. */
struct _reg_trie_data {
U32 refcount; /* number of times this trie is referenced */
- U16 uniquecharcount; /* unique chars in trie (width of trans table) */
U32 lasttrans; /* last valid transition element */
U16 *charmap; /* byte to charid lookup array */
reg_trie_state *states; /* state data */
reg_trie_trans *trans; /* array of transition elements */
char *bitmap; /* stclass bitmap */
- U32 startstate; /* initial state - used for common prefix optimisation */
- STRLEN minlen; /* minimum length of words in trie - build/opt only? */
- STRLEN maxlen; /* maximum length of words in trie - build/opt only? */
U32 *wordlen; /* array of lengths of words */
U16 *jump; /* optional 1 indexed array of offsets before tail
for the node following a given word. */
U16 *nextword; /* optional 1 indexed array to support linked list
of duplicate wordnums */
+ U16 uniquecharcount; /* unique chars in trie (width of trans table) */
+ U32 startstate; /* initial state - used for common prefix optimisation */
+ STRLEN minlen; /* minimum length of words in trie - build/opt only? */
+ STRLEN maxlen; /* maximum length of words in trie - build/opt only? */
U32 statecount; /* Build only - number of states in the states array
(including the unused zero state) */
U32 wordcount; /* Build only */
optimisation in Perl_regdupe. */
struct _reg_ac_data {
U32 refcount;
+ U32 trie;
U32 *fail;
reg_trie_state *states;
- U32 trie;
};
typedef struct _reg_ac_data reg_ac_data;
#define RE_DEBUG_COMPILE_OPTIMISE 0x000002
#define RE_DEBUG_COMPILE_TRIE 0x000004
#define RE_DEBUG_COMPILE_DUMP 0x000008
+#define RE_DEBUG_COMPILE_FLAGS 0x000010
/* Execute */
#define RE_DEBUG_EXECUTE_MASK 0x00FF00
#define RE_DEBUG_EXTRA_OFFDEBUG 0x040000
#define RE_DEBUG_EXTRA_STATE 0x080000
#define RE_DEBUG_EXTRA_OPTIMISE 0x100000
+#define RE_DEBUG_EXTRA_BUFFERS 0x400000
+#define RE_DEBUG_EXTRA_GPOS 0x800000
/* combined */
#define RE_DEBUG_EXTRA_STACK 0x280000
if (re_debug_flags & RE_DEBUG_COMPILE_DUMP) x )
#define DEBUG_TRIE_COMPILE_r(x) DEBUG_r( \
if (re_debug_flags & RE_DEBUG_COMPILE_TRIE) x )
-
+#define DEBUG_FLAGS_r(x) DEBUG_r( \
+ if (re_debug_flags & RE_DEBUG_COMPILE_FLAGS) x )
/* Execute */
#define DEBUG_EXECUTE_r(x) DEBUG_r( \
if (re_debug_flags & RE_DEBUG_EXECUTE_MASK) x )
if (re_debug_flags & RE_DEBUG_EXTRA_STATE) x )
#define DEBUG_STACK_r(x) DEBUG_r( \
if (re_debug_flags & RE_DEBUG_EXTRA_STACK) x )
+#define DEBUG_BUFFERS_r(x) DEBUG_r( \
+ if (re_debug_flags & RE_DEBUG_EXTRA_BUFFERS) x )
+
#define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \
if ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \
(re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)) ) x )
#define DEBUG_TRIE_r(x) DEBUG_r( \
if (re_debug_flags & (RE_DEBUG_COMPILE_TRIE \
| RE_DEBUG_EXECUTE_TRIE )) x )
+#define DEBUG_GPOS_r(x) DEBUG_r( \
+ if (re_debug_flags & RE_DEBUG_EXTRA_GPOS) x )
/* initialization */
/* get_sv() can return NULL during global destruction. */
const char * const rpv = \
pv_pretty((dsv), (pv), (l), (m), \
PL_colors[0], PL_colors[1], \
- ( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_PRETTY_ELIPSES | \
+ ( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_PRETTY_ELLIPSES | \
((isuni) ? PERL_PV_ESCAPE_UNI : 0)) \
)
#endif /* DEBUG RELATED DEFINES */
-
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: t
+ * End:
+ *
+ * ex: set ts=8 sts=4 sw=4 noet:
+ */