5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static const scan_data_t zero_scan_data =
199 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
259 #define FAIL(msg) STMT_START { \
260 const char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
275 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
277 #define Simple_vFAIL(m) STMT_START { \
278 const IV offset = RExC_parse - RExC_precomp; \
279 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
280 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
284 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
286 #define vFAIL(m) STMT_START { \
288 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
293 * Like Simple_vFAIL(), but accepts two arguments.
295 #define Simple_vFAIL2(m,a1) STMT_START { \
296 const IV offset = RExC_parse - RExC_precomp; \
297 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
298 (int)offset, RExC_precomp, RExC_precomp + offset); \
302 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
304 #define vFAIL2(m,a1) STMT_START { \
306 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
307 Simple_vFAIL2(m, a1); \
312 * Like Simple_vFAIL(), but accepts three arguments.
314 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
315 const IV offset = RExC_parse - RExC_precomp; \
316 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
317 (int)offset, RExC_precomp, RExC_precomp + offset); \
321 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
323 #define vFAIL3(m,a1,a2) STMT_START { \
325 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
326 Simple_vFAIL3(m, a1, a2); \
330 * Like Simple_vFAIL(), but accepts four arguments.
332 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
333 const IV offset = RExC_parse - RExC_precomp; \
334 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
335 (int)offset, RExC_precomp, RExC_precomp + offset); \
338 #define vWARN(loc,m) STMT_START { \
339 const IV offset = loc - RExC_precomp; \
340 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
341 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
344 #define vWARNdep(loc,m) STMT_START { \
345 const IV offset = loc - RExC_precomp; \
346 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
347 "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
352 #define vWARN2(loc, m, a1) STMT_START { \
353 const IV offset = loc - RExC_precomp; \
354 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
355 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
358 #define vWARN3(loc, m, a1, a2) STMT_START { \
359 const IV offset = loc - RExC_precomp; \
360 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
361 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
364 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
365 const IV offset = loc - RExC_precomp; \
366 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
367 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
370 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
371 const IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
373 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 /* Allow for side effects in s */
378 #define REGC(c,s) STMT_START { \
379 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
382 /* Macros for recording node offsets. 20001227 mjd@plover.com
383 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
384 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
385 * Element 0 holds the number n.
388 #define MJD_OFFSET_DEBUG(x)
389 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
392 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
394 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
395 __LINE__, (node), (byte))); \
397 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
399 RExC_offsets[2*(node)-1] = (byte); \
404 #define Set_Node_Offset(node,byte) \
405 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
406 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
408 #define Set_Node_Length_To_R(node,len) STMT_START { \
410 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
411 __LINE__, (int)(node), (int)(len))); \
413 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
415 RExC_offsets[2*(node)] = (len); \
420 #define Set_Node_Length(node,len) \
421 Set_Node_Length_To_R((node)-RExC_emit_start, len)
422 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
423 #define Set_Node_Cur_Length(node) \
424 Set_Node_Length(node, RExC_parse - parse_start)
426 /* Get offsets and lengths */
427 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
428 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
430 static void clear_re(pTHX_ void *r);
432 /* Mark that we cannot extend a found fixed substring at this point.
433 Updata the longest found anchored substring and the longest found
434 floating substrings if needed. */
437 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
439 const STRLEN l = CHR_SVLEN(data->last_found);
440 const STRLEN old_l = CHR_SVLEN(*data->longest);
442 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
443 SvSetMagicSV(*data->longest, data->last_found);
444 if (*data->longest == data->longest_fixed) {
445 data->offset_fixed = l ? data->last_start_min : data->pos_min;
446 if (data->flags & SF_BEFORE_EOL)
448 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
450 data->flags &= ~SF_FIX_BEFORE_EOL;
453 data->offset_float_min = l ? data->last_start_min : data->pos_min;
454 data->offset_float_max = (l
455 ? data->last_start_max
456 : data->pos_min + data->pos_delta);
457 if ((U32)data->offset_float_max > (U32)I32_MAX)
458 data->offset_float_max = I32_MAX;
459 if (data->flags & SF_BEFORE_EOL)
461 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
463 data->flags &= ~SF_FL_BEFORE_EOL;
466 SvCUR_set(data->last_found, 0);
468 SV * const sv = data->last_found;
469 if (SvUTF8(sv) && SvMAGICAL(sv)) {
470 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
476 data->flags &= ~SF_BEFORE_EOL;
479 /* Can match anything (initialization) */
481 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
483 ANYOF_CLASS_ZERO(cl);
484 ANYOF_BITMAP_SETALL(cl);
485 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
487 cl->flags |= ANYOF_LOCALE;
490 /* Can match anything (initialization) */
492 S_cl_is_anything(const struct regnode_charclass_class *cl)
496 for (value = 0; value <= ANYOF_MAX; value += 2)
497 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
499 if (!(cl->flags & ANYOF_UNICODE_ALL))
501 if (!ANYOF_BITMAP_TESTALLSET(cl))
506 /* Can match anything (initialization) */
508 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
510 Zero(cl, 1, struct regnode_charclass_class);
512 cl_anything(pRExC_state, cl);
516 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
518 Zero(cl, 1, struct regnode_charclass_class);
520 cl_anything(pRExC_state, cl);
522 cl->flags |= ANYOF_LOCALE;
525 /* 'And' a given class with another one. Can create false positives */
526 /* We assume that cl is not inverted */
528 S_cl_and(struct regnode_charclass_class *cl,
529 const struct regnode_charclass_class *and_with)
531 if (!(and_with->flags & ANYOF_CLASS)
532 && !(cl->flags & ANYOF_CLASS)
533 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
534 && !(and_with->flags & ANYOF_FOLD)
535 && !(cl->flags & ANYOF_FOLD)) {
538 if (and_with->flags & ANYOF_INVERT)
539 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
540 cl->bitmap[i] &= ~and_with->bitmap[i];
542 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
543 cl->bitmap[i] &= and_with->bitmap[i];
544 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
545 if (!(and_with->flags & ANYOF_EOS))
546 cl->flags &= ~ANYOF_EOS;
548 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
549 !(and_with->flags & ANYOF_INVERT)) {
550 cl->flags &= ~ANYOF_UNICODE_ALL;
551 cl->flags |= ANYOF_UNICODE;
552 ARG_SET(cl, ARG(and_with));
554 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
555 !(and_with->flags & ANYOF_INVERT))
556 cl->flags &= ~ANYOF_UNICODE_ALL;
557 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
558 !(and_with->flags & ANYOF_INVERT))
559 cl->flags &= ~ANYOF_UNICODE;
562 /* 'OR' a given class with another one. Can create false positives */
563 /* We assume that cl is not inverted */
565 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
567 if (or_with->flags & ANYOF_INVERT) {
569 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
570 * <= (B1 | !B2) | (CL1 | !CL2)
571 * which is wasteful if CL2 is small, but we ignore CL2:
572 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
573 * XXXX Can we handle case-fold? Unclear:
574 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
575 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
577 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
578 && !(or_with->flags & ANYOF_FOLD)
579 && !(cl->flags & ANYOF_FOLD) ) {
582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583 cl->bitmap[i] |= ~or_with->bitmap[i];
584 } /* XXXX: logic is complicated otherwise */
586 cl_anything(pRExC_state, cl);
589 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
590 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
591 && (!(or_with->flags & ANYOF_FOLD)
592 || (cl->flags & ANYOF_FOLD)) ) {
595 /* OR char bitmap and class bitmap separately */
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= or_with->bitmap[i];
598 if (or_with->flags & ANYOF_CLASS) {
599 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
600 cl->classflags[i] |= or_with->classflags[i];
601 cl->flags |= ANYOF_CLASS;
604 else { /* XXXX: logic is complicated, leave it along for a moment. */
605 cl_anything(pRExC_state, cl);
608 if (or_with->flags & ANYOF_EOS)
609 cl->flags |= ANYOF_EOS;
611 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
612 ARG(cl) != ARG(or_with)) {
613 cl->flags |= ANYOF_UNICODE_ALL;
614 cl->flags &= ~ANYOF_UNICODE;
616 if (or_with->flags & ANYOF_UNICODE_ALL) {
617 cl->flags |= ANYOF_UNICODE_ALL;
618 cl->flags &= ~ANYOF_UNICODE;
624 make_trie(startbranch,first,last,tail,flags,depth)
625 startbranch: the first branch in the whole branch sequence
626 first : start branch of sequence of branch-exact nodes.
627 May be the same as startbranch
628 last : Thing following the last branch.
629 May be the same as tail.
630 tail : item following the branch sequence
631 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
634 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
636 A trie is an N'ary tree where the branches are determined by digital
637 decomposition of the key. IE, at the root node you look up the 1st character and
638 follow that branch repeat until you find the end of the branches. Nodes can be
639 marked as "accepting" meaning they represent a complete word. Eg:
643 would convert into the following structure. Numbers represent states, letters
644 following numbers represent valid transitions on the letter from that state, if
645 the number is in square brackets it represents an accepting state, otherwise it
646 will be in parenthesis.
648 +-h->+-e->[3]-+-r->(8)-+-s->[9]
652 (1) +-i->(6)-+-s->[7]
654 +-s->(3)-+-h->(4)-+-e->[5]
656 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
658 This shows that when matching against the string 'hers' we will begin at state 1
659 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
660 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
661 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
662 single traverse. We store a mapping from accepting to state to which word was
663 matched, and then when we have multiple possibilities we try to complete the
664 rest of the regex in the order in which they occured in the alternation.
666 The only prior NFA like behaviour that would be changed by the TRIE support is
667 the silent ignoring of duplicate alternations which are of the form:
669 / (DUPE|DUPE) X? (?{ ... }) Y /x
671 Thus EVAL blocks follwing a trie may be called a different number of times with
672 and without the optimisation. With the optimisations dupes will be silently
673 ignored. This inconsistant behaviour of EVAL type nodes is well established as
674 the following demonstrates:
676 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
678 which prints out 'word' three times, but
680 'words'=~/(word|word|word)(?{ print $1 })S/
682 which doesnt print it out at all. This is due to other optimisations kicking in.
684 Example of what happens on a structural level:
686 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
688 1: CURLYM[1] {1,32767}(18)
699 This would be optimizable with startbranch=5, first=5, last=16, tail=16
700 and should turn into:
702 1: CURLYM[1] {1,32767}(18)
704 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
712 Cases where tail != last would be like /(?foo|bar)baz/:
722 which would be optimizable with startbranch=1, first=1, last=7, tail=8
723 and would end up looking like:
726 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
733 d = uvuni_to_utf8_flags(d, uv, 0);
735 is the recommended Unicode-aware way of saying
740 #define TRIE_STORE_REVCHAR \
742 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
743 av_push( TRIE_REVCHARMAP(trie), tmp ); \
746 #define TRIE_READ_CHAR STMT_START { \
749 if ( foldlen > 0 ) { \
750 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
755 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
756 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
757 foldlen -= UNISKIP( uvc ); \
758 scan = foldbuf + UNISKIP( uvc ); \
761 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
770 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
771 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
772 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
773 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
775 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
776 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
777 TRIE_LIST_LEN( state ) *= 2; \
778 Renew( trie->states[ state ].trans.list, \
779 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
781 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
782 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
783 TRIE_LIST_CUR( state )++; \
786 #define TRIE_LIST_NEW(state) STMT_START { \
787 Newxz( trie->states[ state ].trans.list, \
788 4, reg_trie_trans_le ); \
789 TRIE_LIST_CUR( state ) = 1; \
790 TRIE_LIST_LEN( state ) = 4; \
793 #define TRIE_HANDLE_WORD(state) STMT_START { \
794 if ( !trie->states[ state ].wordnum ) { \
795 /* we havent inserted this word into the structure yet. */\
796 trie->states[ state ].wordnum = ++curword; \
798 /* store the word for dumping */ \
800 if (OP(noper) != NOTHING ) \
801 tmp=newSVpvn( STRING( noper ), STR_LEN( noper ) );\
803 tmp=newSVpvn( "", 0 ); \
804 if ( UTF ) SvUTF8_on( tmp ); \
805 av_push( trie->words, tmp ); \
808 NOOP; /* It's a dupe. So ignore it. */ \
815 dump_trie_interim_list(trie,next_alloc)
816 dump_trie_interim_table(trie,next_alloc)
818 These routines dump out a trie in a somewhat readable format.
819 The _interim_ variants are used for debugging the interim
820 tables that are used to generate the final compressed
821 representation which is what dump_trie expects.
823 Part of the reason for their existance is to provide a form
824 of documentation as to how the different representations function.
830 Dumps the final compressed table form of the trie to Perl_debug_log.
831 Used for debugging make_trie().
835 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
838 GET_RE_DEBUG_FLAGS_DECL;
840 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
841 (int)depth * 2 + 2,"",
842 "Match","Base","Ofs" );
844 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
845 SV **tmp = av_fetch( trie->revcharmap, state, 0);
847 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
850 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
851 (int)depth * 2 + 2,"");
853 for( state = 0 ; state < trie->uniquecharcount ; state++ )
854 PerlIO_printf( Perl_debug_log, "-----");
855 PerlIO_printf( Perl_debug_log, "\n");
857 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
858 const U32 base = trie->states[ state ].trans.base;
860 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
862 if ( trie->states[ state ].wordnum ) {
863 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
865 PerlIO_printf( Perl_debug_log, "%6s", "" );
868 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
873 while( ( base + ofs < trie->uniquecharcount ) ||
874 ( base + ofs - trie->uniquecharcount < trie->lasttrans
875 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
878 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
880 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
881 if ( ( base + ofs >= trie->uniquecharcount ) &&
882 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
883 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
885 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
886 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
888 PerlIO_printf( Perl_debug_log, "%4s "," ." );
892 PerlIO_printf( Perl_debug_log, "]");
895 PerlIO_printf( Perl_debug_log, "\n" );
899 dump_trie_interim_list(trie,next_alloc)
900 Dumps a fully constructed but uncompressed trie in list form.
901 List tries normally only are used for construction when the number of
902 possible chars (trie->uniquecharcount) is very high.
903 Used for debugging make_trie().
906 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
913 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
915 for( state=1 ; state < next_alloc ; state ++ ) {
918 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
928 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
929 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
930 SvPV_nolen_const( *tmp ),
931 TRIE_LIST_ITEM(state,charid).forid,
932 (UV)TRIE_LIST_ITEM(state,charid).newstate
940 dump_trie_interim_table(trie,next_alloc)
941 Dumps a fully constructed but uncompressed trie in table form.
942 This is the normal DFA style state transition table, with a few
943 twists to facilitate compression later.
944 Used for debugging make_trie().
947 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
951 GET_RE_DEBUG_FLAGS_DECL;
954 print out the table precompression so that we can do a visual check
955 that they are identical.
958 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
961 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
963 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
967 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
969 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
970 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
973 PerlIO_printf( Perl_debug_log, "\n" );
975 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
977 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
978 (int)depth * 2 + 2,"",
979 (UV)TRIE_NODENUM( state ) );
981 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
982 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
983 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
985 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
986 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
988 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
989 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1001 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1004 /* first pass, loop through and scan words */
1005 reg_trie_data *trie;
1007 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1012 /* we just use folder as a flag in utf8 */
1013 const U8 * const folder = ( flags == EXACTF
1015 : ( flags == EXACTFL
1021 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1022 SV *re_trie_maxbuff;
1024 /* these are only used during construction but are useful during
1025 * debugging so we store them in the struct when debugging.
1026 * Wordcount is actually superfluous in debugging as we have
1027 * (AV*)trie->words to use for it, but that's not available when
1028 * not debugging... We could make the macro use the AV during
1029 * debugging though...
1031 U16 trie_wordcount=0;
1032 STRLEN trie_charcount=0;
1033 U32 trie_laststate=0;
1034 AV *trie_revcharmap;
1036 GET_RE_DEBUG_FLAGS_DECL;
1038 Newxz( trie, 1, reg_trie_data );
1040 trie->startstate = 1;
1041 RExC_rx->data->data[ data_slot ] = (void*)trie;
1042 Newxz( trie->charmap, 256, U16 );
1043 if (!(UTF && folder))
1044 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1046 trie->words = newAV();
1048 TRIE_REVCHARMAP(trie) = newAV();
1050 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1051 if (!SvIOK(re_trie_maxbuff)) {
1052 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1055 PerlIO_printf( Perl_debug_log,
1056 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1057 (int)depth * 2 + 2, "",
1058 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1059 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1061 /* -- First loop and Setup --
1063 We first traverse the branches and scan each word to determine if it
1064 contains widechars, and how many unique chars there are, this is
1065 important as we have to build a table with at least as many columns as we
1068 We use an array of integers to represent the character codes 0..255
1069 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1070 native representation of the character value as the key and IV's for the
1073 *TODO* If we keep track of how many times each character is used we can
1074 remap the columns so that the table compression later on is more
1075 efficient in terms of memory by ensuring most common value is in the
1076 middle and the least common are on the outside. IMO this would be better
1077 than a most to least common mapping as theres a decent chance the most
1078 common letter will share a node with the least common, meaning the node
1079 will not be compressable. With a middle is most common approach the worst
1080 case is when we have the least common nodes twice.
1084 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1085 regnode * const noper = NEXTOPER( cur );
1086 const U8 *uc = (U8*)STRING( noper );
1087 const U8 * const e = uc + STR_LEN( noper );
1089 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1090 const U8 *scan = (U8*)NULL;
1093 TRIE_WORDCOUNT(trie)++;
1094 if (OP(noper) == NOTHING) {
1099 TRIE_BITMAP_SET(trie,*uc);
1100 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1102 for ( ; uc < e ; uc += len ) {
1103 TRIE_CHARCOUNT(trie)++;
1107 if ( !trie->charmap[ uvc ] ) {
1108 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1110 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1115 if ( !trie->widecharmap )
1116 trie->widecharmap = newHV();
1118 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1121 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1123 if ( !SvTRUE( *svpp ) ) {
1124 sv_setiv( *svpp, ++trie->uniquecharcount );
1129 if( cur == first ) {
1132 } else if (chars < trie->minlen) {
1134 } else if (chars > trie->maxlen) {
1138 } /* end first pass */
1139 DEBUG_TRIE_COMPILE_r(
1140 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1141 (int)depth * 2 + 2,"",
1142 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1143 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, trie->minlen, trie->maxlen )
1148 We now know what we are dealing with in terms of unique chars and
1149 string sizes so we can calculate how much memory a naive
1150 representation using a flat table will take. If it's over a reasonable
1151 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1152 conservative but potentially much slower representation using an array
1155 At the end we convert both representations into the same compressed
1156 form that will be used in regexec.c for matching with. The latter
1157 is a form that cannot be used to construct with but has memory
1158 properties similar to the list form and access properties similar
1159 to the table form making it both suitable for fast searches and
1160 small enough that its feasable to store for the duration of a program.
1162 See the comment in the code where the compressed table is produced
1163 inplace from the flat tabe representation for an explanation of how
1164 the compression works.
1169 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1171 Second Pass -- Array Of Lists Representation
1173 Each state will be represented by a list of charid:state records
1174 (reg_trie_trans_le) the first such element holds the CUR and LEN
1175 points of the allocated array. (See defines above).
1177 We build the initial structure using the lists, and then convert
1178 it into the compressed table form which allows faster lookups
1179 (but cant be modified once converted).
1182 STRLEN transcount = 1;
1184 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1188 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1190 regnode * const noper = NEXTOPER( cur );
1191 U8 *uc = (U8*)STRING( noper );
1192 const U8 * const e = uc + STR_LEN( noper );
1193 U32 state = 1; /* required init */
1194 U16 charid = 0; /* sanity init */
1195 U8 *scan = (U8*)NULL; /* sanity init */
1196 STRLEN foldlen = 0; /* required init */
1197 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1199 if (OP(noper) != NOTHING) {
1200 for ( ; uc < e ; uc += len ) {
1205 charid = trie->charmap[ uvc ];
1207 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1211 charid=(U16)SvIV( *svpp );
1220 if ( !trie->states[ state ].trans.list ) {
1221 TRIE_LIST_NEW( state );
1223 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1224 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1225 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1230 newstate = next_alloc++;
1231 TRIE_LIST_PUSH( state, charid, newstate );
1236 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1238 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1241 TRIE_HANDLE_WORD(state);
1243 } /* end second pass */
1245 TRIE_LASTSTATE(trie) = next_alloc;
1246 Renew( trie->states, next_alloc, reg_trie_state );
1248 /* and now dump it out before we compress it */
1249 DEBUG_TRIE_COMPILE_MORE_r(
1250 dump_trie_interim_list(trie,next_alloc,depth+1)
1253 Newxz( trie->trans, transcount ,reg_trie_trans );
1260 for( state=1 ; state < next_alloc ; state ++ ) {
1264 DEBUG_TRIE_COMPILE_MORE_r(
1265 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1269 if (trie->states[state].trans.list) {
1270 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1274 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1275 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1276 if ( forid < minid ) {
1278 } else if ( forid > maxid ) {
1282 if ( transcount < tp + maxid - minid + 1) {
1284 Renew( trie->trans, transcount, reg_trie_trans );
1285 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1287 base = trie->uniquecharcount + tp - minid;
1288 if ( maxid == minid ) {
1290 for ( ; zp < tp ; zp++ ) {
1291 if ( ! trie->trans[ zp ].next ) {
1292 base = trie->uniquecharcount + zp - minid;
1293 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1294 trie->trans[ zp ].check = state;
1300 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1301 trie->trans[ tp ].check = state;
1306 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1307 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1308 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1309 trie->trans[ tid ].check = state;
1311 tp += ( maxid - minid + 1 );
1313 Safefree(trie->states[ state ].trans.list);
1316 DEBUG_TRIE_COMPILE_MORE_r(
1317 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1320 trie->states[ state ].trans.base=base;
1322 trie->lasttrans = tp + 1;
1326 Second Pass -- Flat Table Representation.
1328 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1329 We know that we will need Charcount+1 trans at most to store the data
1330 (one row per char at worst case) So we preallocate both structures
1331 assuming worst case.
1333 We then construct the trie using only the .next slots of the entry
1336 We use the .check field of the first entry of the node temporarily to
1337 make compression both faster and easier by keeping track of how many non
1338 zero fields are in the node.
1340 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1343 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1344 number representing the first entry of the node, and state as a
1345 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1346 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1347 are 2 entrys per node. eg:
1355 The table is internally in the right hand, idx form. However as we also
1356 have to deal with the states array which is indexed by nodenum we have to
1357 use TRIE_NODENUM() to convert.
1362 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1364 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1365 next_alloc = trie->uniquecharcount + 1;
1368 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1370 regnode * const noper = NEXTOPER( cur );
1371 const U8 *uc = (U8*)STRING( noper );
1372 const U8 * const e = uc + STR_LEN( noper );
1374 U32 state = 1; /* required init */
1376 U16 charid = 0; /* sanity init */
1377 U32 accept_state = 0; /* sanity init */
1378 U8 *scan = (U8*)NULL; /* sanity init */
1380 STRLEN foldlen = 0; /* required init */
1381 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1383 if ( OP(noper) != NOTHING ) {
1384 for ( ; uc < e ; uc += len ) {
1389 charid = trie->charmap[ uvc ];
1391 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1392 charid = svpp ? (U16)SvIV(*svpp) : 0;
1396 if ( !trie->trans[ state + charid ].next ) {
1397 trie->trans[ state + charid ].next = next_alloc;
1398 trie->trans[ state ].check++;
1399 next_alloc += trie->uniquecharcount;
1401 state = trie->trans[ state + charid ].next;
1403 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1405 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1408 accept_state = TRIE_NODENUM( state );
1409 TRIE_HANDLE_WORD(accept_state);
1411 } /* end second pass */
1413 /* and now dump it out before we compress it */
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 dump_trie_interim_table(trie,next_alloc,depth+1)
1420 * Inplace compress the table.*
1422 For sparse data sets the table constructed by the trie algorithm will
1423 be mostly 0/FAIL transitions or to put it another way mostly empty.
1424 (Note that leaf nodes will not contain any transitions.)
1426 This algorithm compresses the tables by eliminating most such
1427 transitions, at the cost of a modest bit of extra work during lookup:
1429 - Each states[] entry contains a .base field which indicates the
1430 index in the state[] array wheres its transition data is stored.
1432 - If .base is 0 there are no valid transitions from that node.
1434 - If .base is nonzero then charid is added to it to find an entry in
1437 -If trans[states[state].base+charid].check!=state then the
1438 transition is taken to be a 0/Fail transition. Thus if there are fail
1439 transitions at the front of the node then the .base offset will point
1440 somewhere inside the previous nodes data (or maybe even into a node
1441 even earlier), but the .check field determines if the transition is
1444 The following process inplace converts the table to the compressed
1445 table: We first do not compress the root node 1,and mark its all its
1446 .check pointers as 1 and set its .base pointer as 1 as well. This
1447 allows to do a DFA construction from the compressed table later, and
1448 ensures that any .base pointers we calculate later are greater than
1451 - We set 'pos' to indicate the first entry of the second node.
1453 - We then iterate over the columns of the node, finding the first and
1454 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1455 and set the .check pointers accordingly, and advance pos
1456 appropriately and repreat for the next node. Note that when we copy
1457 the next pointers we have to convert them from the original
1458 NODEIDX form to NODENUM form as the former is not valid post
1461 - If a node has no transitions used we mark its base as 0 and do not
1462 advance the pos pointer.
1464 - If a node only has one transition we use a second pointer into the
1465 structure to fill in allocated fail transitions from other states.
1466 This pointer is independent of the main pointer and scans forward
1467 looking for null transitions that are allocated to a state. When it
1468 finds one it writes the single transition into the "hole". If the
1469 pointer doesnt find one the single transition is appeneded as normal.
1471 - Once compressed we can Renew/realloc the structures to release the
1474 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1475 specifically Fig 3.47 and the associated pseudocode.
1479 const U32 laststate = TRIE_NODENUM( next_alloc );
1482 TRIE_LASTSTATE(trie) = laststate;
1484 for ( state = 1 ; state < laststate ; state++ ) {
1486 const U32 stateidx = TRIE_NODEIDX( state );
1487 const U32 o_used = trie->trans[ stateidx ].check;
1488 U32 used = trie->trans[ stateidx ].check;
1489 trie->trans[ stateidx ].check = 0;
1491 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1492 if ( flag || trie->trans[ stateidx + charid ].next ) {
1493 if ( trie->trans[ stateidx + charid ].next ) {
1495 for ( ; zp < pos ; zp++ ) {
1496 if ( ! trie->trans[ zp ].next ) {
1500 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1501 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1502 trie->trans[ zp ].check = state;
1503 if ( ++zp > pos ) pos = zp;
1510 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1512 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1513 trie->trans[ pos ].check = state;
1518 trie->lasttrans = pos + 1;
1519 Renew( trie->states, laststate + 1, reg_trie_state);
1520 DEBUG_TRIE_COMPILE_MORE_r(
1521 PerlIO_printf( Perl_debug_log,
1522 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1523 (int)depth * 2 + 2,"",
1524 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1527 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1530 } /* end table compress */
1532 /* resize the trans array to remove unused space */
1533 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1535 /* and now dump out the compressed format */
1536 DEBUG_TRIE_COMPILE_r(
1537 dump_trie(trie,depth+1)
1540 { /* Modify the program and insert the new TRIE node*/
1542 U8 nodetype =(U8)(flags & 0xFF);
1545 This means we convert either the first branch or the first Exact,
1546 depending on whether the thing following (in 'last') is a branch
1547 or not and whther first is the startbranch (ie is it a sub part of
1548 the alternation or is it the whole thing.)
1549 Assuming its a sub part we conver the EXACT otherwise we convert
1550 the whole branch sequence, including the first.
1552 /* Find the node we are going to overwrite */
1553 if ( first == startbranch && OP( last ) != BRANCH ) {
1556 convert = NEXTOPER( first );
1557 NEXT_OFF( first ) = (U16)(last - first);
1560 /* But first we check to see if there is a common prefix we can
1561 split out as an EXACT and put in front of the TRIE node. */
1562 trie->startstate= 1;
1563 if ( trie->bitmap && !trie->widecharmap ) {
1566 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1567 (int)depth * 2 + 2, "",
1568 TRIE_LASTSTATE(trie))
1570 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1574 const U32 base = trie->states[ state ].trans.base;
1576 if ( trie->states[state].wordnum )
1579 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1580 if ( ( base + ofs >= trie->uniquecharcount ) &&
1581 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1582 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1584 if ( ++count > 1 ) {
1585 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1586 const char *ch = SvPV_nolen_const( *tmp );
1587 if ( state == 1 ) break;
1589 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1591 PerlIO_printf(Perl_debug_log,
1592 "%*sNew Start State=%"UVuf" Class: [",
1593 (int)depth * 2 + 2, "",
1596 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1597 const char *ch = SvPV_nolen_const( *tmp );
1599 TRIE_BITMAP_SET(trie,*ch);
1601 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1603 PerlIO_printf(Perl_debug_log, ch)
1607 TRIE_BITMAP_SET(trie,*ch);
1609 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1610 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1616 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1617 const char *ch = SvPV_nolen_const( *tmp );
1619 PerlIO_printf( Perl_debug_log,
1620 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1621 (int)depth * 2 + 2, "",
1625 OP( convert ) = nodetype;
1626 str=STRING(convert);
1634 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1639 regnode *n = convert+NODE_SZ_STR(convert);
1640 NEXT_OFF(convert)= NODE_SZ_STR(convert);
1641 trie->startstate = state;
1642 trie->minlen-= (state-1);
1643 trie->maxlen-= (state-1);
1647 NEXT_OFF(convert) = (U16)(tail - convert);
1651 if ( trie->maxlen ) {
1652 OP( convert ) = TRIE;
1653 NEXT_OFF( convert ) = (U16)(tail - convert);
1654 ARG_SET( convert, data_slot );
1656 /* store the type in the flags */
1657 convert->flags = nodetype;
1658 /* XXX We really should free up the resource in trie now, as we wont use them */
1660 /* needed for dumping*/
1662 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1663 /* We now need to mark all of the space originally used by the
1664 branches as optimized away. This keeps the dumpuntil from
1665 throwing a wobbly as it doesnt use regnext() to traverse the
1668 while( optimize < last ) {
1669 OP( optimize ) = OPTIMIZED;
1673 } /* end node insert */
1678 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1679 * These need to be revisited when a newer toolchain becomes available.
1681 #if defined(__sparc64__) && defined(__GNUC__)
1682 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1683 # undef SPARC64_GCC_WORKAROUND
1684 # define SPARC64_GCC_WORKAROUND 1
1688 /* REx optimizer. Converts nodes into quickier variants "in place".
1689 Finds fixed substrings. */
1691 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1692 to the position after last scanned or to NULL. */
1695 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1696 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1697 /* scanp: Start here (read-write). */
1698 /* deltap: Write maxlen-minlen here. */
1699 /* last: Stop before this one. */
1702 I32 min = 0, pars = 0, code;
1703 regnode *scan = *scanp, *next;
1705 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1706 int is_inf_internal = 0; /* The studied chunk is infinite */
1707 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1708 scan_data_t data_fake;
1709 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1710 SV *re_trie_maxbuff = NULL;
1712 GET_RE_DEBUG_FLAGS_DECL;
1714 while (scan && OP(scan) != END && scan < last) {
1718 /* Peephole optimizer: */
1720 SV * const mysv=sv_newmortal();
1721 regprop(RExC_rx, mysv, scan);
1722 PerlIO_printf(Perl_debug_log, "%*s%4s~ %s (%d)\n",
1724 scan==*scanp ? "Peep" : "",
1725 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
1727 if (PL_regkind[OP(scan)] == EXACT) {
1728 /* Merge several consecutive EXACTish nodes into one. */
1729 regnode *n = regnext(scan);
1732 regnode *stop = scan;
1734 next = scan + NODE_SZ_STR(scan);
1735 /* Skip NOTHING, merge EXACT*. */
1737 ( PL_regkind[OP(n)] == NOTHING ||
1738 (stringok && (OP(n) == OP(scan))))
1740 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1741 if (OP(n) == TAIL || n > next)
1743 if (PL_regkind[OP(n)] == NOTHING) {
1745 SV * const mysv=sv_newmortal();
1746 regprop(RExC_rx, mysv, n);
1747 PerlIO_printf(Perl_debug_log, "%*sskip: %s (%d)\n",
1748 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1750 NEXT_OFF(scan) += NEXT_OFF(n);
1751 next = n + NODE_STEP_REGNODE;
1758 else if (stringok) {
1759 const int oldl = STR_LEN(scan);
1760 regnode * const nnext = regnext(n);
1762 SV * const mysv=sv_newmortal();
1763 regprop(RExC_rx, mysv, n);
1764 PerlIO_printf(Perl_debug_log, "%*s mrg: %s (%d)\n",
1765 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1768 if (oldl + STR_LEN(n) > U8_MAX)
1770 NEXT_OFF(scan) += NEXT_OFF(n);
1771 STR_LEN(scan) += STR_LEN(n);
1772 next = n + NODE_SZ_STR(n);
1773 /* Now we can overwrite *n : */
1774 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1782 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1784 Two problematic code points in Unicode casefolding of EXACT nodes:
1786 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1787 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1793 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1794 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1796 This means that in case-insensitive matching (or "loose matching",
1797 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1798 length of the above casefolded versions) can match a target string
1799 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1800 This would rather mess up the minimum length computation.
1802 What we'll do is to look for the tail four bytes, and then peek
1803 at the preceding two bytes to see whether we need to decrease
1804 the minimum length by four (six minus two).
1806 Thanks to the design of UTF-8, there cannot be false matches:
1807 A sequence of valid UTF-8 bytes cannot be a subsequence of
1808 another valid sequence of UTF-8 bytes.
1811 char * const s0 = STRING(scan), *s, *t;
1812 char * const s1 = s0 + STR_LEN(scan) - 1;
1813 char * const s2 = s1 - 4;
1814 const char t0[] = "\xcc\x88\xcc\x81";
1815 const char * const t1 = t0 + 3;
1818 s < s2 && (t = ninstr(s, s1, t0, t1));
1820 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1821 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1828 n = scan + NODE_SZ_STR(scan);
1830 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1841 /* Follow the next-chain of the current node and optimize
1842 away all the NOTHINGs from it. */
1843 if (OP(scan) != CURLYX) {
1844 const int max = (reg_off_by_arg[OP(scan)]
1846 /* I32 may be smaller than U16 on CRAYs! */
1847 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1848 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1852 /* Skip NOTHING and LONGJMP. */
1853 while ((n = regnext(n))
1854 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1855 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1856 && off + noff < max)
1858 if (reg_off_by_arg[OP(scan)])
1861 NEXT_OFF(scan) = off;
1864 DEBUG_OPTIMISE_r({if (merged){
1865 SV * const mysv=sv_newmortal();
1866 regprop(RExC_rx, mysv, scan);
1867 PerlIO_printf(Perl_debug_log, "%*s res: %s (%d)\n",
1868 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
1871 /* The principal pseudo-switch. Cannot be a switch, since we
1872 look into several different things. */
1873 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1874 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1875 next = regnext(scan);
1877 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1879 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1880 I32 max1 = 0, min1 = I32_MAX, num = 0;
1881 struct regnode_charclass_class accum;
1882 regnode * const startbranch=scan;
1884 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1885 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1886 if (flags & SCF_DO_STCLASS)
1887 cl_init_zero(pRExC_state, &accum);
1889 while (OP(scan) == code) {
1890 I32 deltanext, minnext, f = 0, fake;
1891 struct regnode_charclass_class this_class;
1894 data_fake.flags = 0;
1896 data_fake.whilem_c = data->whilem_c;
1897 data_fake.last_closep = data->last_closep;
1900 data_fake.last_closep = &fake;
1901 next = regnext(scan);
1902 scan = NEXTOPER(scan);
1904 scan = NEXTOPER(scan);
1905 if (flags & SCF_DO_STCLASS) {
1906 cl_init(pRExC_state, &this_class);
1907 data_fake.start_class = &this_class;
1908 f = SCF_DO_STCLASS_AND;
1910 if (flags & SCF_WHILEM_VISITED_POS)
1911 f |= SCF_WHILEM_VISITED_POS;
1913 /* we suppose the run is continuous, last=next...*/
1914 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1915 next, &data_fake, f,depth+1);
1918 if (max1 < minnext + deltanext)
1919 max1 = minnext + deltanext;
1920 if (deltanext == I32_MAX)
1921 is_inf = is_inf_internal = 1;
1923 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1926 if (data_fake.flags & SF_HAS_EVAL)
1927 data->flags |= SF_HAS_EVAL;
1928 data->whilem_c = data_fake.whilem_c;
1930 if (flags & SCF_DO_STCLASS)
1931 cl_or(pRExC_state, &accum, &this_class);
1932 if (code == SUSPEND)
1935 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1937 if (flags & SCF_DO_SUBSTR) {
1938 data->pos_min += min1;
1939 data->pos_delta += max1 - min1;
1940 if (max1 != min1 || is_inf)
1941 data->longest = &(data->longest_float);
1944 delta += max1 - min1;
1945 if (flags & SCF_DO_STCLASS_OR) {
1946 cl_or(pRExC_state, data->start_class, &accum);
1948 cl_and(data->start_class, &and_with);
1949 flags &= ~SCF_DO_STCLASS;
1952 else if (flags & SCF_DO_STCLASS_AND) {
1954 cl_and(data->start_class, &accum);
1955 flags &= ~SCF_DO_STCLASS;
1958 /* Switch to OR mode: cache the old value of
1959 * data->start_class */
1960 StructCopy(data->start_class, &and_with,
1961 struct regnode_charclass_class);
1962 flags &= ~SCF_DO_STCLASS_AND;
1963 StructCopy(&accum, data->start_class,
1964 struct regnode_charclass_class);
1965 flags |= SCF_DO_STCLASS_OR;
1966 data->start_class->flags |= ANYOF_EOS;
1972 Assuming this was/is a branch we are dealing with: 'scan' now
1973 points at the item that follows the branch sequence, whatever
1974 it is. We now start at the beginning of the sequence and look
1980 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1982 If we can find such a subseqence we need to turn the first
1983 element into a trie and then add the subsequent branch exact
1984 strings to the trie.
1988 1. patterns where the whole set of branch can be converted to a trie,
1990 2. patterns where only a subset of the alternations can be
1991 converted to a trie.
1993 In case 1 we can replace the whole set with a single regop
1994 for the trie. In case 2 we need to keep the start and end
1997 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1998 becomes BRANCH TRIE; BRANCH X;
2000 Hypthetically when we know the regex isnt anchored we can
2001 turn a case 1 into a DFA and let it rip... Every time it finds a match
2002 it would just call its tail, no WHILEM/CURLY needed.
2007 if (!re_trie_maxbuff) {
2008 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2009 if (!SvIOK(re_trie_maxbuff))
2010 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2012 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2014 regnode *first = (regnode *)NULL;
2015 regnode *last = (regnode *)NULL;
2016 regnode *tail = scan;
2021 SV * const mysv = sv_newmortal(); /* for dumping */
2023 /* var tail is used because there may be a TAIL
2024 regop in the way. Ie, the exacts will point to the
2025 thing following the TAIL, but the last branch will
2026 point at the TAIL. So we advance tail. If we
2027 have nested (?:) we may have to move through several
2031 while ( OP( tail ) == TAIL ) {
2032 /* this is the TAIL generated by (?:) */
2033 tail = regnext( tail );
2038 regprop(RExC_rx, mysv, tail );
2039 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2040 (int)depth * 2 + 2, "",
2041 "Looking for TRIE'able sequences. Tail node is: ",
2042 SvPV_nolen_const( mysv )
2048 step through the branches, cur represents each
2049 branch, noper is the first thing to be matched
2050 as part of that branch and noper_next is the
2051 regnext() of that node. if noper is an EXACT
2052 and noper_next is the same as scan (our current
2053 position in the regex) then the EXACT branch is
2054 a possible optimization target. Once we have
2055 two or more consequetive such branches we can
2056 create a trie of the EXACT's contents and stich
2057 it in place. If the sequence represents all of
2058 the branches we eliminate the whole thing and
2059 replace it with a single TRIE. If it is a
2060 subsequence then we need to stitch it in. This
2061 means the first branch has to remain, and needs
2062 to be repointed at the item on the branch chain
2063 following the last branch optimized. This could
2064 be either a BRANCH, in which case the
2065 subsequence is internal, or it could be the
2066 item following the branch sequence in which
2067 case the subsequence is at the end.
2071 /* dont use tail as the end marker for this traverse */
2072 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2073 regnode * const noper = NEXTOPER( cur );
2074 regnode * const noper_next = regnext( noper );
2077 regprop(RExC_rx, mysv, cur);
2078 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2079 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2081 regprop(RExC_rx, mysv, noper);
2082 PerlIO_printf( Perl_debug_log, " -> %s",
2083 SvPV_nolen_const(mysv));
2086 regprop(RExC_rx, mysv, noper_next );
2087 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2088 SvPV_nolen_const(mysv));
2090 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2091 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2093 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2094 : PL_regkind[ OP( noper ) ] == EXACT )
2095 || OP(noper) == NOTHING )
2096 && noper_next == tail && count<U16_MAX)
2099 if ( !first || optype == NOTHING ) {
2100 if (!first) first = cur;
2101 optype = OP( noper );
2107 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2109 if ( PL_regkind[ OP( noper ) ] == EXACT
2110 && noper_next == tail )
2114 optype = OP( noper );
2124 regprop(RExC_rx, mysv, cur);
2125 PerlIO_printf( Perl_debug_log,
2126 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2127 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2131 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2132 #ifdef TRIE_STUDY_OPT
2133 if ( OP(first)!=TRIE && startbranch == first ) {
2142 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2143 scan = NEXTOPER(NEXTOPER(scan));
2144 } else /* single branch is optimized. */
2145 scan = NEXTOPER(scan);
2148 else if (OP(scan) == EXACT) {
2149 I32 l = STR_LEN(scan);
2152 const U8 * const s = (U8*)STRING(scan);
2153 l = utf8_length(s, s + l);
2154 uc = utf8_to_uvchr(s, NULL);
2156 uc = *((U8*)STRING(scan));
2159 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2160 /* The code below prefers earlier match for fixed
2161 offset, later match for variable offset. */
2162 if (data->last_end == -1) { /* Update the start info. */
2163 data->last_start_min = data->pos_min;
2164 data->last_start_max = is_inf
2165 ? I32_MAX : data->pos_min + data->pos_delta;
2167 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2169 SvUTF8_on(data->last_found);
2171 SV * const sv = data->last_found;
2172 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2173 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2174 if (mg && mg->mg_len >= 0)
2175 mg->mg_len += utf8_length((U8*)STRING(scan),
2176 (U8*)STRING(scan)+STR_LEN(scan));
2178 data->last_end = data->pos_min + l;
2179 data->pos_min += l; /* As in the first entry. */
2180 data->flags &= ~SF_BEFORE_EOL;
2182 if (flags & SCF_DO_STCLASS_AND) {
2183 /* Check whether it is compatible with what we know already! */
2187 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2188 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2189 && (!(data->start_class->flags & ANYOF_FOLD)
2190 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2193 ANYOF_CLASS_ZERO(data->start_class);
2194 ANYOF_BITMAP_ZERO(data->start_class);
2196 ANYOF_BITMAP_SET(data->start_class, uc);
2197 data->start_class->flags &= ~ANYOF_EOS;
2199 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2201 else if (flags & SCF_DO_STCLASS_OR) {
2202 /* false positive possible if the class is case-folded */
2204 ANYOF_BITMAP_SET(data->start_class, uc);
2206 data->start_class->flags |= ANYOF_UNICODE_ALL;
2207 data->start_class->flags &= ~ANYOF_EOS;
2208 cl_and(data->start_class, &and_with);
2210 flags &= ~SCF_DO_STCLASS;
2212 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2213 I32 l = STR_LEN(scan);
2214 UV uc = *((U8*)STRING(scan));
2216 /* Search for fixed substrings supports EXACT only. */
2217 if (flags & SCF_DO_SUBSTR) {
2219 scan_commit(pRExC_state, data);
2222 const U8 * const s = (U8 *)STRING(scan);
2223 l = utf8_length(s, s + l);
2224 uc = utf8_to_uvchr(s, NULL);
2227 if (flags & SCF_DO_SUBSTR)
2229 if (flags & SCF_DO_STCLASS_AND) {
2230 /* Check whether it is compatible with what we know already! */
2234 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2235 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2236 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2238 ANYOF_CLASS_ZERO(data->start_class);
2239 ANYOF_BITMAP_ZERO(data->start_class);
2241 ANYOF_BITMAP_SET(data->start_class, uc);
2242 data->start_class->flags &= ~ANYOF_EOS;
2243 data->start_class->flags |= ANYOF_FOLD;
2244 if (OP(scan) == EXACTFL)
2245 data->start_class->flags |= ANYOF_LOCALE;
2248 else if (flags & SCF_DO_STCLASS_OR) {
2249 if (data->start_class->flags & ANYOF_FOLD) {
2250 /* false positive possible if the class is case-folded.
2251 Assume that the locale settings are the same... */
2253 ANYOF_BITMAP_SET(data->start_class, uc);
2254 data->start_class->flags &= ~ANYOF_EOS;
2256 cl_and(data->start_class, &and_with);
2258 flags &= ~SCF_DO_STCLASS;
2260 #ifdef TRIE_STUDY_OPT
2261 else if (OP(scan) == TRIE) {
2262 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2263 min += trie->minlen;
2264 flags &= ~SCF_DO_STCLASS; /* xxx */
2265 if (flags & SCF_DO_SUBSTR) {
2266 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2267 data->pos_min += trie->minlen;
2268 data->pos_delta+= (trie->maxlen-trie->minlen);
2272 else if (strchr((const char*)PL_varies,OP(scan))) {
2273 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2274 I32 f = flags, pos_before = 0;
2275 regnode * const oscan = scan;
2276 struct regnode_charclass_class this_class;
2277 struct regnode_charclass_class *oclass = NULL;
2278 I32 next_is_eval = 0;
2280 switch (PL_regkind[OP(scan)]) {
2281 case WHILEM: /* End of (?:...)* . */
2282 scan = NEXTOPER(scan);
2285 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2286 next = NEXTOPER(scan);
2287 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2289 maxcount = REG_INFTY;
2290 next = regnext(scan);
2291 scan = NEXTOPER(scan);
2295 if (flags & SCF_DO_SUBSTR)
2300 if (flags & SCF_DO_STCLASS) {
2302 maxcount = REG_INFTY;
2303 next = regnext(scan);
2304 scan = NEXTOPER(scan);
2307 is_inf = is_inf_internal = 1;
2308 scan = regnext(scan);
2309 if (flags & SCF_DO_SUBSTR) {
2310 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2311 data->longest = &(data->longest_float);
2313 goto optimize_curly_tail;
2315 mincount = ARG1(scan);
2316 maxcount = ARG2(scan);
2317 next = regnext(scan);
2318 if (OP(scan) == CURLYX) {
2319 I32 lp = (data ? *(data->last_closep) : 0);
2320 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2322 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2323 next_is_eval = (OP(scan) == EVAL);
2325 if (flags & SCF_DO_SUBSTR) {
2326 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2327 pos_before = data->pos_min;
2331 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2333 data->flags |= SF_IS_INF;
2335 if (flags & SCF_DO_STCLASS) {
2336 cl_init(pRExC_state, &this_class);
2337 oclass = data->start_class;
2338 data->start_class = &this_class;
2339 f |= SCF_DO_STCLASS_AND;
2340 f &= ~SCF_DO_STCLASS_OR;
2342 /* These are the cases when once a subexpression
2343 fails at a particular position, it cannot succeed
2344 even after backtracking at the enclosing scope.
2346 XXXX what if minimal match and we are at the
2347 initial run of {n,m}? */
2348 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2349 f &= ~SCF_WHILEM_VISITED_POS;
2351 /* This will finish on WHILEM, setting scan, or on NULL: */
2352 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2354 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2356 if (flags & SCF_DO_STCLASS)
2357 data->start_class = oclass;
2358 if (mincount == 0 || minnext == 0) {
2359 if (flags & SCF_DO_STCLASS_OR) {
2360 cl_or(pRExC_state, data->start_class, &this_class);
2362 else if (flags & SCF_DO_STCLASS_AND) {
2363 /* Switch to OR mode: cache the old value of
2364 * data->start_class */
2365 StructCopy(data->start_class, &and_with,
2366 struct regnode_charclass_class);
2367 flags &= ~SCF_DO_STCLASS_AND;
2368 StructCopy(&this_class, data->start_class,
2369 struct regnode_charclass_class);
2370 flags |= SCF_DO_STCLASS_OR;
2371 data->start_class->flags |= ANYOF_EOS;
2373 } else { /* Non-zero len */
2374 if (flags & SCF_DO_STCLASS_OR) {
2375 cl_or(pRExC_state, data->start_class, &this_class);
2376 cl_and(data->start_class, &and_with);
2378 else if (flags & SCF_DO_STCLASS_AND)
2379 cl_and(data->start_class, &this_class);
2380 flags &= ~SCF_DO_STCLASS;
2382 if (!scan) /* It was not CURLYX, but CURLY. */
2384 if ( /* ? quantifier ok, except for (?{ ... }) */
2385 (next_is_eval || !(mincount == 0 && maxcount == 1))
2386 && (minnext == 0) && (deltanext == 0)
2387 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2388 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2389 && ckWARN(WARN_REGEXP))
2392 "Quantifier unexpected on zero-length expression");
2395 min += minnext * mincount;
2396 is_inf_internal |= ((maxcount == REG_INFTY
2397 && (minnext + deltanext) > 0)
2398 || deltanext == I32_MAX);
2399 is_inf |= is_inf_internal;
2400 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2402 /* Try powerful optimization CURLYX => CURLYN. */
2403 if ( OP(oscan) == CURLYX && data
2404 && data->flags & SF_IN_PAR
2405 && !(data->flags & SF_HAS_EVAL)
2406 && !deltanext && minnext == 1 ) {
2407 /* Try to optimize to CURLYN. */
2408 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2409 regnode * const nxt1 = nxt;
2416 if (!strchr((const char*)PL_simple,OP(nxt))
2417 && !(PL_regkind[OP(nxt)] == EXACT
2418 && STR_LEN(nxt) == 1))
2424 if (OP(nxt) != CLOSE)
2426 /* Now we know that nxt2 is the only contents: */
2427 oscan->flags = (U8)ARG(nxt);
2429 OP(nxt1) = NOTHING; /* was OPEN. */
2431 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2432 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2433 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2434 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2435 OP(nxt + 1) = OPTIMIZED; /* was count. */
2436 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2441 /* Try optimization CURLYX => CURLYM. */
2442 if ( OP(oscan) == CURLYX && data
2443 && !(data->flags & SF_HAS_PAR)
2444 && !(data->flags & SF_HAS_EVAL)
2445 && !deltanext /* atom is fixed width */
2446 && minnext != 0 /* CURLYM can't handle zero width */
2448 /* XXXX How to optimize if data == 0? */
2449 /* Optimize to a simpler form. */
2450 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2454 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2455 && (OP(nxt2) != WHILEM))
2457 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2458 /* Need to optimize away parenths. */
2459 if (data->flags & SF_IN_PAR) {
2460 /* Set the parenth number. */
2461 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2463 if (OP(nxt) != CLOSE)
2464 FAIL("Panic opt close");
2465 oscan->flags = (U8)ARG(nxt);
2466 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2467 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2469 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2470 OP(nxt + 1) = OPTIMIZED; /* was count. */
2471 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2472 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2475 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2476 regnode *nnxt = regnext(nxt1);
2479 if (reg_off_by_arg[OP(nxt1)])
2480 ARG_SET(nxt1, nxt2 - nxt1);
2481 else if (nxt2 - nxt1 < U16_MAX)
2482 NEXT_OFF(nxt1) = nxt2 - nxt1;
2484 OP(nxt) = NOTHING; /* Cannot beautify */
2489 /* Optimize again: */
2490 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2496 else if ((OP(oscan) == CURLYX)
2497 && (flags & SCF_WHILEM_VISITED_POS)
2498 /* See the comment on a similar expression above.
2499 However, this time it not a subexpression
2500 we care about, but the expression itself. */
2501 && (maxcount == REG_INFTY)
2502 && data && ++data->whilem_c < 16) {
2503 /* This stays as CURLYX, we can put the count/of pair. */
2504 /* Find WHILEM (as in regexec.c) */
2505 regnode *nxt = oscan + NEXT_OFF(oscan);
2507 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2509 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2510 | (RExC_whilem_seen << 4)); /* On WHILEM */
2512 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2514 if (flags & SCF_DO_SUBSTR) {
2515 SV *last_str = NULL;
2516 int counted = mincount != 0;
2518 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2519 #if defined(SPARC64_GCC_WORKAROUND)
2522 const char *s = NULL;
2525 if (pos_before >= data->last_start_min)
2528 b = data->last_start_min;
2531 s = SvPV_const(data->last_found, l);
2532 old = b - data->last_start_min;
2535 I32 b = pos_before >= data->last_start_min
2536 ? pos_before : data->last_start_min;
2538 const char * const s = SvPV_const(data->last_found, l);
2539 I32 old = b - data->last_start_min;
2543 old = utf8_hop((U8*)s, old) - (U8*)s;
2546 /* Get the added string: */
2547 last_str = newSVpvn(s + old, l);
2549 SvUTF8_on(last_str);
2550 if (deltanext == 0 && pos_before == b) {
2551 /* What was added is a constant string */
2553 SvGROW(last_str, (mincount * l) + 1);
2554 repeatcpy(SvPVX(last_str) + l,
2555 SvPVX_const(last_str), l, mincount - 1);
2556 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2557 /* Add additional parts. */
2558 SvCUR_set(data->last_found,
2559 SvCUR(data->last_found) - l);
2560 sv_catsv(data->last_found, last_str);
2562 SV * sv = data->last_found;
2564 SvUTF8(sv) && SvMAGICAL(sv) ?
2565 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2566 if (mg && mg->mg_len >= 0)
2567 mg->mg_len += CHR_SVLEN(last_str);
2569 data->last_end += l * (mincount - 1);
2572 /* start offset must point into the last copy */
2573 data->last_start_min += minnext * (mincount - 1);
2574 data->last_start_max += is_inf ? I32_MAX
2575 : (maxcount - 1) * (minnext + data->pos_delta);
2578 /* It is counted once already... */
2579 data->pos_min += minnext * (mincount - counted);
2580 data->pos_delta += - counted * deltanext +
2581 (minnext + deltanext) * maxcount - minnext * mincount;
2582 if (mincount != maxcount) {
2583 /* Cannot extend fixed substrings found inside
2585 scan_commit(pRExC_state,data);
2586 if (mincount && last_str) {
2587 SV * const sv = data->last_found;
2588 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2589 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2593 sv_setsv(sv, last_str);
2594 data->last_end = data->pos_min;
2595 data->last_start_min =
2596 data->pos_min - CHR_SVLEN(last_str);
2597 data->last_start_max = is_inf
2599 : data->pos_min + data->pos_delta
2600 - CHR_SVLEN(last_str);
2602 data->longest = &(data->longest_float);
2604 SvREFCNT_dec(last_str);
2606 if (data && (fl & SF_HAS_EVAL))
2607 data->flags |= SF_HAS_EVAL;
2608 optimize_curly_tail:
2609 if (OP(oscan) != CURLYX) {
2610 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2612 NEXT_OFF(oscan) += NEXT_OFF(next);
2615 default: /* REF and CLUMP only? */
2616 if (flags & SCF_DO_SUBSTR) {
2617 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2618 data->longest = &(data->longest_float);
2620 is_inf = is_inf_internal = 1;
2621 if (flags & SCF_DO_STCLASS_OR)
2622 cl_anything(pRExC_state, data->start_class);
2623 flags &= ~SCF_DO_STCLASS;
2627 else if (strchr((const char*)PL_simple,OP(scan))) {
2630 if (flags & SCF_DO_SUBSTR) {
2631 scan_commit(pRExC_state,data);
2635 if (flags & SCF_DO_STCLASS) {
2636 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2638 /* Some of the logic below assumes that switching
2639 locale on will only add false positives. */
2640 switch (PL_regkind[OP(scan)]) {
2644 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2645 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2646 cl_anything(pRExC_state, data->start_class);
2649 if (OP(scan) == SANY)
2651 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2652 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2653 || (data->start_class->flags & ANYOF_CLASS));
2654 cl_anything(pRExC_state, data->start_class);
2656 if (flags & SCF_DO_STCLASS_AND || !value)
2657 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2660 if (flags & SCF_DO_STCLASS_AND)
2661 cl_and(data->start_class,
2662 (struct regnode_charclass_class*)scan);
2664 cl_or(pRExC_state, data->start_class,
2665 (struct regnode_charclass_class*)scan);
2668 if (flags & SCF_DO_STCLASS_AND) {
2669 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2670 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2671 for (value = 0; value < 256; value++)
2672 if (!isALNUM(value))
2673 ANYOF_BITMAP_CLEAR(data->start_class, value);
2677 if (data->start_class->flags & ANYOF_LOCALE)
2678 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2680 for (value = 0; value < 256; value++)
2682 ANYOF_BITMAP_SET(data->start_class, value);
2687 if (flags & SCF_DO_STCLASS_AND) {
2688 if (data->start_class->flags & ANYOF_LOCALE)
2689 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2692 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2693 data->start_class->flags |= ANYOF_LOCALE;
2697 if (flags & SCF_DO_STCLASS_AND) {
2698 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2699 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2700 for (value = 0; value < 256; value++)
2702 ANYOF_BITMAP_CLEAR(data->start_class, value);
2706 if (data->start_class->flags & ANYOF_LOCALE)
2707 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2709 for (value = 0; value < 256; value++)
2710 if (!isALNUM(value))
2711 ANYOF_BITMAP_SET(data->start_class, value);
2716 if (flags & SCF_DO_STCLASS_AND) {
2717 if (data->start_class->flags & ANYOF_LOCALE)
2718 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2721 data->start_class->flags |= ANYOF_LOCALE;
2722 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2726 if (flags & SCF_DO_STCLASS_AND) {
2727 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2728 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2729 for (value = 0; value < 256; value++)
2730 if (!isSPACE(value))
2731 ANYOF_BITMAP_CLEAR(data->start_class, value);
2735 if (data->start_class->flags & ANYOF_LOCALE)
2736 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2738 for (value = 0; value < 256; value++)
2740 ANYOF_BITMAP_SET(data->start_class, value);
2745 if (flags & SCF_DO_STCLASS_AND) {
2746 if (data->start_class->flags & ANYOF_LOCALE)
2747 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2750 data->start_class->flags |= ANYOF_LOCALE;
2751 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2755 if (flags & SCF_DO_STCLASS_AND) {
2756 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2757 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2758 for (value = 0; value < 256; value++)
2760 ANYOF_BITMAP_CLEAR(data->start_class, value);
2764 if (data->start_class->flags & ANYOF_LOCALE)
2765 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2767 for (value = 0; value < 256; value++)
2768 if (!isSPACE(value))
2769 ANYOF_BITMAP_SET(data->start_class, value);
2774 if (flags & SCF_DO_STCLASS_AND) {
2775 if (data->start_class->flags & ANYOF_LOCALE) {
2776 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2777 for (value = 0; value < 256; value++)
2778 if (!isSPACE(value))
2779 ANYOF_BITMAP_CLEAR(data->start_class, value);
2783 data->start_class->flags |= ANYOF_LOCALE;
2784 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2788 if (flags & SCF_DO_STCLASS_AND) {
2789 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2790 for (value = 0; value < 256; value++)
2791 if (!isDIGIT(value))
2792 ANYOF_BITMAP_CLEAR(data->start_class, value);
2795 if (data->start_class->flags & ANYOF_LOCALE)
2796 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2798 for (value = 0; value < 256; value++)
2800 ANYOF_BITMAP_SET(data->start_class, value);
2805 if (flags & SCF_DO_STCLASS_AND) {
2806 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2807 for (value = 0; value < 256; value++)
2809 ANYOF_BITMAP_CLEAR(data->start_class, value);
2812 if (data->start_class->flags & ANYOF_LOCALE)
2813 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2815 for (value = 0; value < 256; value++)
2816 if (!isDIGIT(value))
2817 ANYOF_BITMAP_SET(data->start_class, value);
2822 if (flags & SCF_DO_STCLASS_OR)
2823 cl_and(data->start_class, &and_with);
2824 flags &= ~SCF_DO_STCLASS;
2827 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2828 data->flags |= (OP(scan) == MEOL
2832 else if ( PL_regkind[OP(scan)] == BRANCHJ
2833 /* Lookbehind, or need to calculate parens/evals/stclass: */
2834 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2835 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2836 /* Lookahead/lookbehind */
2837 I32 deltanext, minnext, fake = 0;
2839 struct regnode_charclass_class intrnl;
2842 data_fake.flags = 0;
2844 data_fake.whilem_c = data->whilem_c;
2845 data_fake.last_closep = data->last_closep;
2848 data_fake.last_closep = &fake;
2849 if ( flags & SCF_DO_STCLASS && !scan->flags
2850 && OP(scan) == IFMATCH ) { /* Lookahead */
2851 cl_init(pRExC_state, &intrnl);
2852 data_fake.start_class = &intrnl;
2853 f |= SCF_DO_STCLASS_AND;
2855 if (flags & SCF_WHILEM_VISITED_POS)
2856 f |= SCF_WHILEM_VISITED_POS;
2857 next = regnext(scan);
2858 nscan = NEXTOPER(NEXTOPER(scan));
2859 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2862 vFAIL("Variable length lookbehind not implemented");
2864 else if (minnext > U8_MAX) {
2865 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2867 scan->flags = (U8)minnext;
2869 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2871 if (data && (data_fake.flags & SF_HAS_EVAL))
2872 data->flags |= SF_HAS_EVAL;
2874 data->whilem_c = data_fake.whilem_c;
2875 if (f & SCF_DO_STCLASS_AND) {
2876 const int was = (data->start_class->flags & ANYOF_EOS);
2878 cl_and(data->start_class, &intrnl);
2880 data->start_class->flags |= ANYOF_EOS;
2883 else if (OP(scan) == OPEN) {
2886 else if (OP(scan) == CLOSE) {
2887 if ((I32)ARG(scan) == is_par) {
2888 next = regnext(scan);
2890 if ( next && (OP(next) != WHILEM) && next < last)
2891 is_par = 0; /* Disable optimization */
2894 *(data->last_closep) = ARG(scan);
2896 else if (OP(scan) == EVAL) {
2898 data->flags |= SF_HAS_EVAL;
2900 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2901 if (flags & SCF_DO_SUBSTR) {
2902 scan_commit(pRExC_state,data);
2903 data->longest = &(data->longest_float);
2905 is_inf = is_inf_internal = 1;
2906 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2907 cl_anything(pRExC_state, data->start_class);
2908 flags &= ~SCF_DO_STCLASS;
2910 /* Else: zero-length, ignore. */
2911 scan = regnext(scan);
2916 *deltap = is_inf_internal ? I32_MAX : delta;
2917 if (flags & SCF_DO_SUBSTR && is_inf)
2918 data->pos_delta = I32_MAX - data->pos_min;
2919 if (is_par > U8_MAX)
2921 if (is_par && pars==1 && data) {
2922 data->flags |= SF_IN_PAR;
2923 data->flags &= ~SF_HAS_PAR;
2925 else if (pars && data) {
2926 data->flags |= SF_HAS_PAR;
2927 data->flags &= ~SF_IN_PAR;
2929 if (flags & SCF_DO_STCLASS_OR)
2930 cl_and(data->start_class, &and_with);
2935 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2937 if (RExC_rx->data) {
2938 Renewc(RExC_rx->data,
2939 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2940 char, struct reg_data);
2941 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2942 RExC_rx->data->count += n;
2945 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2946 char, struct reg_data);
2947 Newx(RExC_rx->data->what, n, U8);
2948 RExC_rx->data->count = n;
2950 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2951 return RExC_rx->data->count - n;
2954 #ifndef PERL_IN_XSUB_RE
2956 Perl_reginitcolors(pTHX)
2959 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2961 char *t = savepv(s);
2965 t = strchr(t, '\t');
2971 PL_colors[i] = t = (char *)"";
2976 PL_colors[i++] = (char *)"";
2983 - pregcomp - compile a regular expression into internal code
2985 * We can't allocate space until we know how big the compiled form will be,
2986 * but we can't compile it (and thus know how big it is) until we've got a
2987 * place to put the code. So we cheat: we compile it twice, once with code
2988 * generation turned off and size counting turned on, and once "for real".
2989 * This also means that we don't allocate space until we are sure that the
2990 * thing really will compile successfully, and we never have to move the
2991 * code and thus invalidate pointers into it. (Note that it has to be in
2992 * one piece because free() must be able to free it all.) [NB: not true in perl]
2994 * Beware that the optimization-preparation code in here knows about some
2995 * of the structure of the compiled regexp. [I'll say.]
2998 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3009 RExC_state_t RExC_state;
3010 RExC_state_t *pRExC_state = &RExC_state;
3012 GET_RE_DEBUG_FLAGS_DECL;
3015 FAIL("NULL regexp argument");
3017 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3020 DEBUG_r(if (!PL_colorset) reginitcolors());
3022 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3023 PL_colors[4],PL_colors[5],PL_colors[0],
3024 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3026 RExC_flags = pm->op_pmflags;
3030 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3031 RExC_seen_evals = 0;
3034 /* First pass: determine size, legality. */
3041 RExC_emit = &PL_regdummy;
3042 RExC_whilem_seen = 0;
3043 #if 0 /* REGC() is (currently) a NOP at the first pass.
3044 * Clever compilers notice this and complain. --jhi */
3045 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3047 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3048 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3049 RExC_precomp = NULL;
3052 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3053 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3054 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3056 /* Small enough for pointer-storage convention?
3057 If extralen==0, this means that we will not need long jumps. */
3058 if (RExC_size >= 0x10000L && RExC_extralen)
3059 RExC_size += RExC_extralen;
3062 if (RExC_whilem_seen > 15)
3063 RExC_whilem_seen = 15;
3065 /* Allocate space and initialize. */
3066 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3069 FAIL("Regexp out of space");
3072 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3073 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3076 r->prelen = xend - exp;
3077 r->precomp = savepvn(RExC_precomp, r->prelen);
3079 #ifdef PERL_OLD_COPY_ON_WRITE
3080 r->saved_copy = NULL;
3082 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3083 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3084 r->lastparen = 0; /* mg.c reads this. */
3086 r->substrs = 0; /* Useful during FAIL. */
3087 r->startp = 0; /* Useful during FAIL. */
3088 r->endp = 0; /* Useful during FAIL. */
3090 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3092 r->offsets[0] = RExC_size;
3094 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3095 "%s %"UVuf" bytes for offset annotations.\n",
3096 r->offsets ? "Got" : "Couldn't get",
3097 (UV)((2*RExC_size+1) * sizeof(U32))));
3101 /* Second pass: emit code. */
3102 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3107 RExC_emit_start = r->program;
3108 RExC_emit = r->program;
3109 /* Store the count of eval-groups for security checks: */
3110 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3111 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3113 if (reg(pRExC_state, 0, &flags,1) == NULL)
3117 /* Dig out information for optimizations. */
3118 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3119 pm->op_pmflags = RExC_flags;
3121 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3122 r->regstclass = NULL;
3123 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3124 r->reganch |= ROPT_NAUGHTY;
3125 scan = r->program + 1; /* First BRANCH. */
3127 /* XXXX To minimize changes to RE engine we always allocate
3128 3-units-long substrs field. */
3129 Newxz(r->substrs, 1, struct reg_substr_data);
3131 StructCopy(&zero_scan_data, &data, scan_data_t);
3132 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3133 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3135 STRLEN longest_float_length, longest_fixed_length;
3136 struct regnode_charclass_class ch_class;
3141 /* Skip introductions and multiplicators >= 1. */
3142 while ((OP(first) == OPEN && (sawopen = 1)) ||
3143 /* An OR of *one* alternative - should not happen now. */
3144 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3145 (OP(first) == PLUS) ||
3146 (OP(first) == MINMOD) ||
3147 /* An {n,m} with n>0 */
3148 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) {
3149 if (OP(first) == PLUS)
3152 first += regarglen[OP(first)];
3153 first = NEXTOPER(first);
3156 /* Starting-point info. */
3158 if (PL_regkind[OP(first)] == EXACT) {
3159 if (OP(first) == EXACT)
3160 NOOP; /* Empty, get anchored substr later. */
3161 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3162 r->regstclass = first;
3164 else if (strchr((const char*)PL_simple,OP(first)))
3165 r->regstclass = first;
3166 else if (PL_regkind[OP(first)] == BOUND ||
3167 PL_regkind[OP(first)] == NBOUND)
3168 r->regstclass = first;
3169 else if (PL_regkind[OP(first)] == BOL) {
3170 r->reganch |= (OP(first) == MBOL
3172 : (OP(first) == SBOL
3175 first = NEXTOPER(first);
3178 else if (OP(first) == GPOS) {
3179 r->reganch |= ROPT_ANCH_GPOS;
3180 first = NEXTOPER(first);
3183 else if (!sawopen && (OP(first) == STAR &&
3184 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3185 !(r->reganch & ROPT_ANCH) )
3187 /* turn .* into ^.* with an implied $*=1 */
3189 (OP(NEXTOPER(first)) == REG_ANY)
3192 r->reganch |= type | ROPT_IMPLICIT;
3193 first = NEXTOPER(first);
3196 if (sawplus && (!sawopen || !RExC_sawback)
3197 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3198 /* x+ must match at the 1st pos of run of x's */
3199 r->reganch |= ROPT_SKIP;
3201 /* Scan is after the zeroth branch, first is atomic matcher. */
3202 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3203 (IV)(first - scan + 1)));
3205 * If there's something expensive in the r.e., find the
3206 * longest literal string that must appear and make it the
3207 * regmust. Resolve ties in favor of later strings, since
3208 * the regstart check works with the beginning of the r.e.
3209 * and avoiding duplication strengthens checking. Not a
3210 * strong reason, but sufficient in the absence of others.
3211 * [Now we resolve ties in favor of the earlier string if
3212 * it happens that c_offset_min has been invalidated, since the
3213 * earlier string may buy us something the later one won't.]
3217 data.longest_fixed = newSVpvs("");
3218 data.longest_float = newSVpvs("");
3219 data.last_found = newSVpvs("");
3220 data.longest = &(data.longest_fixed);
3222 if (!r->regstclass) {
3223 cl_init(pRExC_state, &ch_class);
3224 data.start_class = &ch_class;
3225 stclass_flag = SCF_DO_STCLASS_AND;
3226 } else /* XXXX Check for BOUND? */
3228 data.last_closep = &last_close;
3230 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3231 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3232 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3233 && data.last_start_min == 0 && data.last_end > 0
3234 && !RExC_seen_zerolen
3235 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3236 r->reganch |= ROPT_CHECK_ALL;
3237 scan_commit(pRExC_state, &data);
3238 SvREFCNT_dec(data.last_found);
3240 longest_float_length = CHR_SVLEN(data.longest_float);
3241 if (longest_float_length
3242 || (data.flags & SF_FL_BEFORE_EOL
3243 && (!(data.flags & SF_FL_BEFORE_MEOL)
3244 || (RExC_flags & PMf_MULTILINE)))) {
3247 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3248 && data.offset_fixed == data.offset_float_min
3249 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3250 goto remove_float; /* As in (a)+. */
3252 if (SvUTF8(data.longest_float)) {
3253 r->float_utf8 = data.longest_float;
3254 r->float_substr = NULL;
3256 r->float_substr = data.longest_float;
3257 r->float_utf8 = NULL;
3259 r->float_min_offset = data.offset_float_min;
3260 r->float_max_offset = data.offset_float_max;
3261 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3262 && (!(data.flags & SF_FL_BEFORE_MEOL)
3263 || (RExC_flags & PMf_MULTILINE)));
3264 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3268 r->float_substr = r->float_utf8 = NULL;
3269 SvREFCNT_dec(data.longest_float);
3270 longest_float_length = 0;
3273 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3274 if (longest_fixed_length
3275 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3276 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3277 || (RExC_flags & PMf_MULTILINE)))) {
3280 if (SvUTF8(data.longest_fixed)) {
3281 r->anchored_utf8 = data.longest_fixed;
3282 r->anchored_substr = NULL;
3284 r->anchored_substr = data.longest_fixed;
3285 r->anchored_utf8 = NULL;
3287 r->anchored_offset = data.offset_fixed;
3288 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3289 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3290 || (RExC_flags & PMf_MULTILINE)));
3291 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3294 r->anchored_substr = r->anchored_utf8 = NULL;
3295 SvREFCNT_dec(data.longest_fixed);
3296 longest_fixed_length = 0;
3299 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3300 r->regstclass = NULL;
3301 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3303 && !(data.start_class->flags & ANYOF_EOS)
3304 && !cl_is_anything(data.start_class))
3306 const I32 n = add_data(pRExC_state, 1, "f");
3308 Newx(RExC_rx->data->data[n], 1,
3309 struct regnode_charclass_class);
3310 StructCopy(data.start_class,
3311 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3312 struct regnode_charclass_class);
3313 r->regstclass = (regnode*)RExC_rx->data->data[n];
3314 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3315 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3316 regprop(r, sv, (regnode*)data.start_class);
3317 PerlIO_printf(Perl_debug_log,
3318 "synthetic stclass \"%s\".\n",
3319 SvPVX_const(sv));});
3322 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3323 if (longest_fixed_length > longest_float_length) {
3324 r->check_substr = r->anchored_substr;
3325 r->check_utf8 = r->anchored_utf8;
3326 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3327 if (r->reganch & ROPT_ANCH_SINGLE)
3328 r->reganch |= ROPT_NOSCAN;
3331 r->check_substr = r->float_substr;
3332 r->check_utf8 = r->float_utf8;
3333 r->check_offset_min = data.offset_float_min;
3334 r->check_offset_max = data.offset_float_max;
3336 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3337 This should be changed ASAP! */
3338 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3339 r->reganch |= RE_USE_INTUIT;
3340 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3341 r->reganch |= RE_INTUIT_TAIL;
3345 /* Several toplevels. Best we can is to set minlen. */
3347 struct regnode_charclass_class ch_class;
3350 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3351 scan = r->program + 1;
3352 cl_init(pRExC_state, &ch_class);
3353 data.start_class = &ch_class;
3354 data.last_closep = &last_close;
3355 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3356 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3357 = r->float_substr = r->float_utf8 = NULL;
3358 if (!(data.start_class->flags & ANYOF_EOS)
3359 && !cl_is_anything(data.start_class))
3361 const I32 n = add_data(pRExC_state, 1, "f");
3363 Newx(RExC_rx->data->data[n], 1,
3364 struct regnode_charclass_class);
3365 StructCopy(data.start_class,
3366 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3367 struct regnode_charclass_class);
3368 r->regstclass = (regnode*)RExC_rx->data->data[n];
3369 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3370 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3371 regprop(r, sv, (regnode*)data.start_class);
3372 PerlIO_printf(Perl_debug_log,
3373 "synthetic stclass \"%s\".\n",
3374 SvPVX_const(sv));});
3379 if (RExC_seen & REG_SEEN_GPOS)
3380 r->reganch |= ROPT_GPOS_SEEN;
3381 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3382 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3383 if (RExC_seen & REG_SEEN_EVAL)
3384 r->reganch |= ROPT_EVAL_SEEN;
3385 if (RExC_seen & REG_SEEN_CANY)
3386 r->reganch |= ROPT_CANY_SEEN;
3387 Newxz(r->startp, RExC_npar, I32);
3388 Newxz(r->endp, RExC_npar, I32);
3390 if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE))
3391 PerlIO_printf(Perl_debug_log,"Final program:\n");
3398 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3399 int rem=(int)(RExC_end - RExC_parse); \
3408 if (RExC_lastparse!=RExC_parse) \
3409 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3412 iscut ? "..." : "<" \
3415 PerlIO_printf(Perl_debug_log,"%16s",""); \
3420 num=REG_NODE_NUM(RExC_emit); \
3421 if (RExC_lastnum!=num) \
3422 PerlIO_printf(Perl_debug_log,"%4d",num); \
3424 PerlIO_printf(Perl_debug_log,"%4s",""); \
3425 PerlIO_printf(Perl_debug_log,"%*s%-4s", \
3426 (int)(10+(depth*2)), "", \
3430 RExC_lastparse=RExC_parse; \
3433 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3434 DEBUG_PARSE_MSG((funcname)); \
3435 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3438 - reg - regular expression, i.e. main body or parenthesized thing
3440 * Caller must absorb opening parenthesis.
3442 * Combining parenthesis handling with the base level of regular expression
3443 * is a trifle forced, but the need to tie the tails of the branches to what
3444 * follows makes it hard to avoid.
3446 #define REGTAIL(x,y,z) regtail(x,y,z,depth+1)
3449 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3450 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3453 register regnode *ret; /* Will be the head of the group. */
3454 register regnode *br;
3455 register regnode *lastbr;
3456 register regnode *ender = NULL;
3457 register I32 parno = 0;
3459 const I32 oregflags = RExC_flags;
3460 bool have_branch = 0;
3463 /* for (?g), (?gc), and (?o) warnings; warning
3464 about (?c) will warn about (?g) -- japhy */
3466 #define WASTED_O 0x01
3467 #define WASTED_G 0x02
3468 #define WASTED_C 0x04
3469 #define WASTED_GC (0x02|0x04)
3470 I32 wastedflags = 0x00;
3472 char * parse_start = RExC_parse; /* MJD */
3473 char * const oregcomp_parse = RExC_parse;
3475 GET_RE_DEBUG_FLAGS_DECL;
3476 DEBUG_PARSE("reg ");
3479 *flagp = 0; /* Tentatively. */
3482 /* Make an OPEN node, if parenthesized. */
3484 if (*RExC_parse == '?') { /* (?...) */
3485 U32 posflags = 0, negflags = 0;
3486 U32 *flagsp = &posflags;
3487 bool is_logical = 0;
3488 const char * const seqstart = RExC_parse;
3491 paren = *RExC_parse++;
3492 ret = NULL; /* For look-ahead/behind. */
3494 case '<': /* (?<...) */
3495 RExC_seen |= REG_SEEN_LOOKBEHIND;
3496 if (*RExC_parse == '!')
3498 if (*RExC_parse != '=' && *RExC_parse != '!')
3501 case '=': /* (?=...) */
3502 case '!': /* (?!...) */
3503 RExC_seen_zerolen++;
3504 case ':': /* (?:...) */
3505 case '>': /* (?>...) */
3507 case '$': /* (?$...) */
3508 case '@': /* (?@...) */
3509 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3511 case '#': /* (?#...) */
3512 while (*RExC_parse && *RExC_parse != ')')
3514 if (*RExC_parse != ')')
3515 FAIL("Sequence (?#... not terminated");
3516 nextchar(pRExC_state);
3519 case 'p': /* (?p...) */
3520 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3521 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3523 case '?': /* (??...) */
3525 if (*RExC_parse != '{')
3527 paren = *RExC_parse++;
3529 case '{': /* (?{...}) */
3531 I32 count = 1, n = 0;
3533 char *s = RExC_parse;
3535 RExC_seen_zerolen++;
3536 RExC_seen |= REG_SEEN_EVAL;
3537 while (count && (c = *RExC_parse)) {
3548 if (*RExC_parse != ')') {
3550 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3554 OP_4tree *sop, *rop;
3555 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3558 Perl_save_re_context(aTHX);
3559 rop = sv_compile_2op(sv, &sop, "re", &pad);
3560 sop->op_private |= OPpREFCOUNTED;
3561 /* re_dup will OpREFCNT_inc */
3562 OpREFCNT_set(sop, 1);
3565 n = add_data(pRExC_state, 3, "nop");
3566 RExC_rx->data->data[n] = (void*)rop;
3567 RExC_rx->data->data[n+1] = (void*)sop;
3568 RExC_rx->data->data[n+2] = (void*)pad;
3571 else { /* First pass */
3572 if (PL_reginterp_cnt < ++RExC_seen_evals
3574 /* No compiled RE interpolated, has runtime
3575 components ===> unsafe. */
3576 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3577 if (PL_tainting && PL_tainted)
3578 FAIL("Eval-group in insecure regular expression");
3579 #if PERL_VERSION > 8
3580 if (IN_PERL_COMPILETIME)
3585 nextchar(pRExC_state);
3587 ret = reg_node(pRExC_state, LOGICAL);
3590 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3591 /* deal with the length of this later - MJD */
3594 ret = reganode(pRExC_state, EVAL, n);
3595 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3596 Set_Node_Offset(ret, parse_start);
3599 case '(': /* (?(?{...})...) and (?(?=...)...) */
3601 if (RExC_parse[0] == '?') { /* (?(?...)) */
3602 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3603 || RExC_parse[1] == '<'
3604 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3607 ret = reg_node(pRExC_state, LOGICAL);
3610 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3614 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3617 parno = atoi(RExC_parse++);
3619 while (isDIGIT(*RExC_parse))
3621 ret = reganode(pRExC_state, GROUPP, parno);
3623 if ((c = *nextchar(pRExC_state)) != ')')
3624 vFAIL("Switch condition not recognized");
3626 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3627 br = regbranch(pRExC_state, &flags, 1,depth+1);
3629 br = reganode(pRExC_state, LONGJMP, 0);
3631 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3632 c = *nextchar(pRExC_state);
3636 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3637 regbranch(pRExC_state, &flags, 1,depth+1);
3638 REGTAIL(pRExC_state, ret, lastbr);
3641 c = *nextchar(pRExC_state);
3646 vFAIL("Switch (?(condition)... contains too many branches");
3647 ender = reg_node(pRExC_state, TAIL);
3648 REGTAIL(pRExC_state, br, ender);
3650 REGTAIL(pRExC_state, lastbr, ender);
3651 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3654 REGTAIL(pRExC_state, ret, ender);
3658 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3662 RExC_parse--; /* for vFAIL to print correctly */
3663 vFAIL("Sequence (? incomplete");
3667 parse_flags: /* (?i) */
3668 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3669 /* (?g), (?gc) and (?o) are useless here
3670 and must be globally applied -- japhy */
3672 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3673 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3674 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3675 if (! (wastedflags & wflagbit) ) {
3676 wastedflags |= wflagbit;
3679 "Useless (%s%c) - %suse /%c modifier",
3680 flagsp == &negflags ? "?-" : "?",
3682 flagsp == &negflags ? "don't " : "",
3688 else if (*RExC_parse == 'c') {
3689 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3690 if (! (wastedflags & WASTED_C) ) {
3691 wastedflags |= WASTED_GC;
3694 "Useless (%sc) - %suse /gc modifier",
3695 flagsp == &negflags ? "?-" : "?",
3696 flagsp == &negflags ? "don't " : ""
3701 else { pmflag(flagsp, *RExC_parse); }
3705 if (*RExC_parse == '-') {
3707 wastedflags = 0; /* reset so (?g-c) warns twice */
3711 RExC_flags |= posflags;
3712 RExC_flags &= ~negflags;
3713 if (*RExC_parse == ':') {
3719 if (*RExC_parse != ')') {
3721 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3723 nextchar(pRExC_state);
3731 ret = reganode(pRExC_state, OPEN, parno);
3732 Set_Node_Length(ret, 1); /* MJD */
3733 Set_Node_Offset(ret, RExC_parse); /* MJD */
3740 /* Pick up the branches, linking them together. */
3741 parse_start = RExC_parse; /* MJD */
3742 br = regbranch(pRExC_state, &flags, 1,depth+1);
3743 /* branch_len = (paren != 0); */
3747 if (*RExC_parse == '|') {
3748 if (!SIZE_ONLY && RExC_extralen) {
3749 reginsert(pRExC_state, BRANCHJ, br);
3752 reginsert(pRExC_state, BRANCH, br);
3753 Set_Node_Length(br, paren != 0);
3754 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3758 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3760 else if (paren == ':') {
3761 *flagp |= flags&SIMPLE;
3763 if (is_open) { /* Starts with OPEN. */
3764 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
3766 else if (paren != '?') /* Not Conditional */
3768 *flagp |= flags & (SPSTART | HASWIDTH);
3770 while (*RExC_parse == '|') {
3771 if (!SIZE_ONLY && RExC_extralen) {
3772 ender = reganode(pRExC_state, LONGJMP,0);
3773 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3776 RExC_extralen += 2; /* Account for LONGJMP. */
3777 nextchar(pRExC_state);
3778 br = regbranch(pRExC_state, &flags, 0, depth+1);
3782 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3786 *flagp |= flags&SPSTART;
3789 if (have_branch || paren != ':') {
3790 /* Make a closing node, and hook it on the end. */
3793 ender = reg_node(pRExC_state, TAIL);
3796 ender = reganode(pRExC_state, CLOSE, parno);
3797 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3798 Set_Node_Length(ender,1); /* MJD */
3804 *flagp &= ~HASWIDTH;
3807 ender = reg_node(pRExC_state, SUCCEED);
3810 ender = reg_node(pRExC_state, END);
3813 REGTAIL(pRExC_state, lastbr, ender);
3815 if (have_branch && !SIZE_ONLY) {
3816 /* Hook the tails of the branches to the closing node. */
3818 for (br = ret; br; br = regnext(br)) {
3819 const U8 op = PL_regkind[OP(br)];
3822 exact_ret=regtail_study(pRExC_state, NEXTOPER(br), ender,depth+1);
3824 else if (op == BRANCHJ) {
3825 exact_ret=regtail_study(pRExC_state, NEXTOPER(NEXTOPER(br)), ender,depth+1);
3827 if ( exact == PSEUDO )
3829 else if ( exact != exact_ret )
3837 static const char parens[] = "=!<,>";
3839 if (paren && (p = strchr(parens, paren))) {
3840 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3841 int flag = (p - parens) > 1;
3844 node = SUSPEND, flag = 0;
3845 reginsert(pRExC_state, node,ret);
3846 Set_Node_Cur_Length(ret);
3847 Set_Node_Offset(ret, parse_start + 1);
3849 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3853 /* Check for proper termination. */
3855 RExC_flags = oregflags;
3856 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3857 RExC_parse = oregcomp_parse;
3858 vFAIL("Unmatched (");
3861 else if (!paren && RExC_parse < RExC_end) {
3862 if (*RExC_parse == ')') {
3864 vFAIL("Unmatched )");
3867 FAIL("Junk on end of regexp"); /* "Can't happen". */
3875 - regbranch - one alternative of an | operator
3877 * Implements the concatenation operator.
3880 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
3883 register regnode *ret;
3884 register regnode *chain = NULL;
3885 register regnode *latest;
3886 I32 flags = 0, c = 0;
3887 GET_RE_DEBUG_FLAGS_DECL;
3888 DEBUG_PARSE("brnc");
3892 if (!SIZE_ONLY && RExC_extralen)
3893 ret = reganode(pRExC_state, BRANCHJ,0);
3895 ret = reg_node(pRExC_state, BRANCH);
3896 Set_Node_Length(ret, 1);
3900 if (!first && SIZE_ONLY)
3901 RExC_extralen += 1; /* BRANCHJ */
3903 *flagp = WORST; /* Tentatively. */
3906 nextchar(pRExC_state);
3907 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3909 latest = regpiece(pRExC_state, &flags,depth+1);
3910 if (latest == NULL) {
3911 if (flags & TRYAGAIN)
3915 else if (ret == NULL)
3917 *flagp |= flags&HASWIDTH;
3918 if (chain == NULL) /* First piece. */
3919 *flagp |= flags&SPSTART;
3922 REGTAIL(pRExC_state, chain, latest);
3927 if (chain == NULL) { /* Loop ran zero times. */
3928 chain = reg_node(pRExC_state, NOTHING);
3933 *flagp |= flags&SIMPLE;
3940 - regpiece - something followed by possible [*+?]
3942 * Note that the branching code sequences used for ? and the general cases
3943 * of * and + are somewhat optimized: they use the same NOTHING node as
3944 * both the endmarker for their branch list and the body of the last branch.
3945 * It might seem that this node could be dispensed with entirely, but the
3946 * endmarker role is not redundant.
3949 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
3952 register regnode *ret;
3954 register char *next;
3956 const char * const origparse = RExC_parse;
3958 I32 max = REG_INFTY;
3960 GET_RE_DEBUG_FLAGS_DECL;
3961 DEBUG_PARSE("piec");
3963 ret = regatom(pRExC_state, &flags,depth+1);
3965 if (flags & TRYAGAIN)
3972 if (op == '{' && regcurly(RExC_parse)) {
3973 const char *maxpos = NULL;
3974 parse_start = RExC_parse; /* MJD */
3975 next = RExC_parse + 1;
3976 while (isDIGIT(*next) || *next == ',') {
3985 if (*next == '}') { /* got one */
3989 min = atoi(RExC_parse);
3993 maxpos = RExC_parse;
3995 if (!max && *maxpos != '0')
3996 max = REG_INFTY; /* meaning "infinity" */
3997 else if (max >= REG_INFTY)
3998 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4000 nextchar(pRExC_state);
4003 if ((flags&SIMPLE)) {
4004 RExC_naughty += 2 + RExC_naughty / 2;
4005 reginsert(pRExC_state, CURLY, ret);
4006 Set_Node_Offset(ret, parse_start+1); /* MJD */
4007 Set_Node_Cur_Length(ret);
4010 regnode * const w = reg_node(pRExC_state, WHILEM);
4013 REGTAIL(pRExC_state, ret, w);
4014 if (!SIZE_ONLY && RExC_extralen) {
4015 reginsert(pRExC_state, LONGJMP,ret);
4016 reginsert(pRExC_state, NOTHING,ret);
4017 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4019 reginsert(pRExC_state, CURLYX,ret);
4021 Set_Node_Offset(ret, parse_start+1);
4022 Set_Node_Length(ret,
4023 op == '{' ? (RExC_parse - parse_start) : 1);
4025 if (!SIZE_ONLY && RExC_extralen)
4026 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4027 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4029 RExC_whilem_seen++, RExC_extralen += 3;
4030 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4038 if (max && max < min)
4039 vFAIL("Can't do {n,m} with n > m");
4041 ARG1_SET(ret, (U16)min);
4042 ARG2_SET(ret, (U16)max);
4054 #if 0 /* Now runtime fix should be reliable. */
4056 /* if this is reinstated, don't forget to put this back into perldiag:
4058 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4060 (F) The part of the regexp subject to either the * or + quantifier
4061 could match an empty string. The {#} shows in the regular
4062 expression about where the problem was discovered.
4066 if (!(flags&HASWIDTH) && op != '?')
4067 vFAIL("Regexp *+ operand could be empty");
4070 parse_start = RExC_parse;
4071 nextchar(pRExC_state);
4073 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4075 if (op == '*' && (flags&SIMPLE)) {
4076 reginsert(pRExC_state, STAR, ret);
4080 else if (op == '*') {
4084 else if (op == '+' && (flags&SIMPLE)) {
4085 reginsert(pRExC_state, PLUS, ret);
4089 else if (op == '+') {
4093 else if (op == '?') {
4098 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4100 "%.*s matches null string many times",
4101 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4105 if (*RExC_parse == '?') {
4106 nextchar(pRExC_state);
4107 reginsert(pRExC_state, MINMOD, ret);
4108 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4110 if (ISMULT2(RExC_parse)) {
4112 vFAIL("Nested quantifiers");
4119 - regatom - the lowest level
4121 * Optimization: gobbles an entire sequence of ordinary characters so that
4122 * it can turn them into a single node, which is smaller to store and
4123 * faster to run. Backslashed characters are exceptions, each becoming a
4124 * separate node; the code is simpler that way and it's not worth fixing.
4126 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4127 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4130 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4133 register regnode *ret = NULL;
4135 char *parse_start = RExC_parse;
4136 GET_RE_DEBUG_FLAGS_DECL;
4137 DEBUG_PARSE("atom");
4138 *flagp = WORST; /* Tentatively. */
4141 switch (*RExC_parse) {
4143 RExC_seen_zerolen++;
4144 nextchar(pRExC_state);
4145 if (RExC_flags & PMf_MULTILINE)
4146 ret = reg_node(pRExC_state, MBOL);
4147 else if (RExC_flags & PMf_SINGLELINE)
4148 ret = reg_node(pRExC_state, SBOL);
4150 ret = reg_node(pRExC_state, BOL);
4151 Set_Node_Length(ret, 1); /* MJD */
4154 nextchar(pRExC_state);
4156 RExC_seen_zerolen++;
4157 if (RExC_flags & PMf_MULTILINE)
4158 ret = reg_node(pRExC_state, MEOL);
4159 else if (RExC_flags & PMf_SINGLELINE)
4160 ret = reg_node(pRExC_state, SEOL);
4162 ret = reg_node(pRExC_state, EOL);
4163 Set_Node_Length(ret, 1); /* MJD */
4166 nextchar(pRExC_state);
4167 if (RExC_flags & PMf_SINGLELINE)
4168 ret = reg_node(pRExC_state, SANY);
4170 ret = reg_node(pRExC_state, REG_ANY);
4171 *flagp |= HASWIDTH|SIMPLE;
4173 Set_Node_Length(ret, 1); /* MJD */
4177 char * const oregcomp_parse = ++RExC_parse;
4178 ret = regclass(pRExC_state,depth+1);
4179 if (*RExC_parse != ']') {
4180 RExC_parse = oregcomp_parse;
4181 vFAIL("Unmatched [");
4183 nextchar(pRExC_state);
4184 *flagp |= HASWIDTH|SIMPLE;
4185 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4189 nextchar(pRExC_state);
4190 ret = reg(pRExC_state, 1, &flags,depth+1);
4192 if (flags & TRYAGAIN) {
4193 if (RExC_parse == RExC_end) {
4194 /* Make parent create an empty node if needed. */
4202 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4206 if (flags & TRYAGAIN) {
4210 vFAIL("Internal urp");
4211 /* Supposed to be caught earlier. */
4214 if (!regcurly(RExC_parse)) {
4223 vFAIL("Quantifier follows nothing");
4226 switch (*++RExC_parse) {
4228 RExC_seen_zerolen++;
4229 ret = reg_node(pRExC_state, SBOL);
4231 nextchar(pRExC_state);
4232 Set_Node_Length(ret, 2); /* MJD */
4235 ret = reg_node(pRExC_state, GPOS);
4236 RExC_seen |= REG_SEEN_GPOS;
4238 nextchar(pRExC_state);
4239 Set_Node_Length(ret, 2); /* MJD */
4242 ret = reg_node(pRExC_state, SEOL);
4244 RExC_seen_zerolen++; /* Do not optimize RE away */
4245 nextchar(pRExC_state);
4248 ret = reg_node(pRExC_state, EOS);
4250 RExC_seen_zerolen++; /* Do not optimize RE away */
4251 nextchar(pRExC_state);
4252 Set_Node_Length(ret, 2); /* MJD */
4255 ret = reg_node(pRExC_state, CANY);
4256 RExC_seen |= REG_SEEN_CANY;
4257 *flagp |= HASWIDTH|SIMPLE;
4258 nextchar(pRExC_state);
4259 Set_Node_Length(ret, 2); /* MJD */
4262 ret = reg_node(pRExC_state, CLUMP);
4264 nextchar(pRExC_state);
4265 Set_Node_Length(ret, 2); /* MJD */
4268 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4269 *flagp |= HASWIDTH|SIMPLE;
4270 nextchar(pRExC_state);
4271 Set_Node_Length(ret, 2); /* MJD */
4274 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4275 *flagp |= HASWIDTH|SIMPLE;
4276 nextchar(pRExC_state);
4277 Set_Node_Length(ret, 2); /* MJD */
4280 RExC_seen_zerolen++;
4281 RExC_seen |= REG_SEEN_LOOKBEHIND;
4282 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4284 nextchar(pRExC_state);
4285 Set_Node_Length(ret, 2); /* MJD */
4288 RExC_seen_zerolen++;
4289 RExC_seen |= REG_SEEN_LOOKBEHIND;
4290 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4292 nextchar(pRExC_state);
4293 Set_Node_Length(ret, 2); /* MJD */
4296 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4297 *flagp |= HASWIDTH|SIMPLE;
4298 nextchar(pRExC_state);
4299 Set_Node_Length(ret, 2); /* MJD */
4302 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4303 *flagp |= HASWIDTH|SIMPLE;
4304 nextchar(pRExC_state);
4305 Set_Node_Length(ret, 2); /* MJD */
4308 ret = reg_node(pRExC_state, DIGIT);
4309 *flagp |= HASWIDTH|SIMPLE;
4310 nextchar(pRExC_state);
4311 Set_Node_Length(ret, 2); /* MJD */
4314 ret = reg_node(pRExC_state, NDIGIT);
4315 *flagp |= HASWIDTH|SIMPLE;
4316 nextchar(pRExC_state);
4317 Set_Node_Length(ret, 2); /* MJD */
4322 char* const oldregxend = RExC_end;
4323 char* parse_start = RExC_parse - 2;
4325 if (RExC_parse[1] == '{') {
4326 /* a lovely hack--pretend we saw [\pX] instead */
4327 RExC_end = strchr(RExC_parse, '}');
4329 const U8 c = (U8)*RExC_parse;
4331 RExC_end = oldregxend;
4332 vFAIL2("Missing right brace on \\%c{}", c);
4337 RExC_end = RExC_parse + 2;
4338 if (RExC_end > oldregxend)
4339 RExC_end = oldregxend;
4343 ret = regclass(pRExC_state,depth+1);
4345 RExC_end = oldregxend;
4348 Set_Node_Offset(ret, parse_start + 2);
4349 Set_Node_Cur_Length(ret);
4350 nextchar(pRExC_state);
4351 *flagp |= HASWIDTH|SIMPLE;
4364 case '1': case '2': case '3': case '4':
4365 case '5': case '6': case '7': case '8': case '9':
4367 const I32 num = atoi(RExC_parse);
4369 if (num > 9 && num >= RExC_npar)
4372 char * const parse_start = RExC_parse - 1; /* MJD */
4373 while (isDIGIT(*RExC_parse))
4376 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4377 vFAIL("Reference to nonexistent group");
4379 ret = reganode(pRExC_state,
4380 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4384 /* override incorrect value set in reganode MJD */
4385 Set_Node_Offset(ret, parse_start+1);
4386 Set_Node_Cur_Length(ret); /* MJD */
4388 nextchar(pRExC_state);
4393 if (RExC_parse >= RExC_end)
4394 FAIL("Trailing \\");
4397 /* Do not generate "unrecognized" warnings here, we fall
4398 back into the quick-grab loop below */
4405 if (RExC_flags & PMf_EXTENDED) {
4406 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4408 if (RExC_parse < RExC_end)
4414 register STRLEN len;
4419 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4421 parse_start = RExC_parse - 1;
4427 ret = reg_node(pRExC_state,
4428 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4430 for (len = 0, p = RExC_parse - 1;
4431 len < 127 && p < RExC_end;
4434 char * const oldp = p;
4436 if (RExC_flags & PMf_EXTENDED)
4437 p = regwhite(p, RExC_end);
4484 ender = ASCII_TO_NATIVE('\033');
4488 ender = ASCII_TO_NATIVE('\007');
4493 char* const e = strchr(p, '}');
4497 vFAIL("Missing right brace on \\x{}");
4500 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4501 | PERL_SCAN_DISALLOW_PREFIX;
4502 STRLEN numlen = e - p - 1;
4503 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4510 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4512 ender = grok_hex(p, &numlen, &flags, NULL);
4518 ender = UCHARAT(p++);
4519 ender = toCTRL(ender);
4521 case '0': case '1': case '2': case '3':case '4':
4522 case '5': case '6': case '7': case '8':case '9':
4524 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4527 ender = grok_oct(p, &numlen, &flags, NULL);
4537 FAIL("Trailing \\");
4540 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4541 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4542 goto normal_default;
4547 if (UTF8_IS_START(*p) && UTF) {
4549 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4550 &numlen, UTF8_ALLOW_DEFAULT);
4557 if (RExC_flags & PMf_EXTENDED)
4558 p = regwhite(p, RExC_end);
4560 /* Prime the casefolded buffer. */
4561 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4563 if (ISMULT2(p)) { /* Back off on ?+*. */
4568 /* Emit all the Unicode characters. */
4570 for (foldbuf = tmpbuf;
4572 foldlen -= numlen) {
4573 ender = utf8_to_uvchr(foldbuf, &numlen);
4575 const STRLEN unilen = reguni(pRExC_state, ender, s);
4578 /* In EBCDIC the numlen
4579 * and unilen can differ. */
4581 if (numlen >= foldlen)
4585 break; /* "Can't happen." */
4589 const STRLEN unilen = reguni(pRExC_state, ender, s);
4598 REGC((char)ender, s++);
4604 /* Emit all the Unicode characters. */
4606 for (foldbuf = tmpbuf;
4608 foldlen -= numlen) {
4609 ender = utf8_to_uvchr(foldbuf, &numlen);
4611 const STRLEN unilen = reguni(pRExC_state, ender, s);
4614 /* In EBCDIC the numlen
4615 * and unilen can differ. */
4617 if (numlen >= foldlen)
4625 const STRLEN unilen = reguni(pRExC_state, ender, s);
4634 REGC((char)ender, s++);
4638 Set_Node_Cur_Length(ret); /* MJD */
4639 nextchar(pRExC_state);
4641 /* len is STRLEN which is unsigned, need to copy to signed */
4644 vFAIL("Internal disaster");
4648 if (len == 1 && UNI_IS_INVARIANT(ender))
4652 RExC_size += STR_SZ(len);
4655 RExC_emit += STR_SZ(len);
4661 /* If the encoding pragma is in effect recode the text of
4662 * any EXACT-kind nodes. */
4663 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4664 const STRLEN oldlen = STR_LEN(ret);
4665 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4669 if (sv_utf8_downgrade(sv, TRUE)) {
4670 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4671 const STRLEN newlen = SvCUR(sv);
4676 GET_RE_DEBUG_FLAGS_DECL;
4677 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4678 (int)oldlen, STRING(ret),
4680 Copy(s, STRING(ret), newlen, char);
4681 STR_LEN(ret) += newlen - oldlen;
4682 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4684 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4692 S_regwhite(char *p, const char *e)
4697 else if (*p == '#') {
4700 } while (p < e && *p != '\n');
4708 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4709 Character classes ([:foo:]) can also be negated ([:^foo:]).
4710 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4711 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4712 but trigger failures because they are currently unimplemented. */
4714 #define POSIXCC_DONE(c) ((c) == ':')
4715 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4716 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4719 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4722 I32 namedclass = OOB_NAMEDCLASS;
4724 if (value == '[' && RExC_parse + 1 < RExC_end &&
4725 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4726 POSIXCC(UCHARAT(RExC_parse))) {
4727 const char c = UCHARAT(RExC_parse);
4728 char* const s = RExC_parse++;
4730 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4732 if (RExC_parse == RExC_end)
4733 /* Grandfather lone [:, [=, [. */
4736 const char* const t = RExC_parse++; /* skip over the c */
4739 if (UCHARAT(RExC_parse) == ']') {
4740 const char *posixcc = s + 1;
4741 RExC_parse++; /* skip over the ending ] */
4744 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4745 const I32 skip = t - posixcc;
4747 /* Initially switch on the length of the name. */
4750 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
4751 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4754 /* Names all of length 5. */
4755 /* alnum alpha ascii blank cntrl digit graph lower
4756 print punct space upper */
4757 /* Offset 4 gives the best switch position. */
4758 switch (posixcc[4]) {
4760 if (memEQ(posixcc, "alph", 4)) /* alpha */
4761 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4764 if (memEQ(posixcc, "spac", 4)) /* space */
4765 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4768 if (memEQ(posixcc, "grap", 4)) /* graph */
4769 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4772 if (memEQ(posixcc, "asci", 4)) /* ascii */
4773 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
4776 if (memEQ(posixcc, "blan", 4)) /* blank */
4777 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4780 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
4781 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4784 if (memEQ(posixcc, "alnu", 4)) /* alnum */
4785 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4788 if (memEQ(posixcc, "lowe", 4)) /* lower */
4789 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4790 else if (memEQ(posixcc, "uppe", 4)) /* upper */
4791 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4794 if (memEQ(posixcc, "digi", 4)) /* digit */
4795 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4796 else if (memEQ(posixcc, "prin", 4)) /* print */
4797 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4798 else if (memEQ(posixcc, "punc", 4)) /* punct */
4799 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4804 if (memEQ(posixcc, "xdigit", 6))
4805 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4809 if (namedclass == OOB_NAMEDCLASS)
4810 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4812 assert (posixcc[skip] == ':');
4813 assert (posixcc[skip+1] == ']');
4814 } else if (!SIZE_ONLY) {
4815 /* [[=foo=]] and [[.foo.]] are still future. */
4817 /* adjust RExC_parse so the warning shows after
4819 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4821 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4824 /* Maternal grandfather:
4825 * "[:" ending in ":" but not in ":]" */
4835 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4838 if (POSIXCC(UCHARAT(RExC_parse))) {
4839 const char *s = RExC_parse;
4840 const char c = *s++;
4844 if (*s && c == *s && s[1] == ']') {
4845 if (ckWARN(WARN_REGEXP))
4847 "POSIX syntax [%c %c] belongs inside character classes",
4850 /* [[=foo=]] and [[.foo.]] are still future. */
4851 if (POSIXCC_NOTYET(c)) {
4852 /* adjust RExC_parse so the error shows after
4854 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4856 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4864 parse a class specification and produce either an ANYOF node that
4865 matches the pattern. If the pattern matches a single char only and
4866 that char is < 256 then we produce an EXACT node instead.
4869 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
4872 register UV value = 0;
4873 register UV nextvalue;
4874 register IV prevvalue = OOB_UNICODE;
4875 register IV range = 0;
4876 register regnode *ret;
4879 char *rangebegin = NULL;
4880 bool need_class = 0;
4883 bool optimize_invert = TRUE;
4884 AV* unicode_alternate = NULL;
4886 UV literal_endpoint = 0;
4888 UV stored = 0; /* number of chars stored in the class */
4890 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
4891 case we need to change the emitted regop to an EXACT. */
4892 GET_RE_DEBUG_FLAGS_DECL;
4893 DEBUG_PARSE("clas");
4895 /* Assume we are going to generate an ANYOF node. */
4896 ret = reganode(pRExC_state, ANYOF, 0);
4899 ANYOF_FLAGS(ret) = 0;
4901 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4905 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4909 RExC_size += ANYOF_SKIP;
4910 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4913 RExC_emit += ANYOF_SKIP;
4915 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4917 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4918 ANYOF_BITMAP_ZERO(ret);
4919 listsv = newSVpvs("# comment\n");
4922 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4924 if (!SIZE_ONLY && POSIXCC(nextvalue))
4925 checkposixcc(pRExC_state);
4927 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4928 if (UCHARAT(RExC_parse) == ']')
4931 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4935 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4938 rangebegin = RExC_parse;
4940 value = utf8n_to_uvchr((U8*)RExC_parse,
4941 RExC_end - RExC_parse,
4942 &numlen, UTF8_ALLOW_DEFAULT);
4943 RExC_parse += numlen;
4946 value = UCHARAT(RExC_parse++);
4948 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4949 if (value == '[' && POSIXCC(nextvalue))
4950 namedclass = regpposixcc(pRExC_state, value);
4951 else if (value == '\\') {
4953 value = utf8n_to_uvchr((U8*)RExC_parse,
4954 RExC_end - RExC_parse,
4955 &numlen, UTF8_ALLOW_DEFAULT);
4956 RExC_parse += numlen;
4959 value = UCHARAT(RExC_parse++);
4960 /* Some compilers cannot handle switching on 64-bit integer
4961 * values, therefore value cannot be an UV. Yes, this will
4962 * be a problem later if we want switch on Unicode.
4963 * A similar issue a little bit later when switching on
4964 * namedclass. --jhi */
4965 switch ((I32)value) {
4966 case 'w': namedclass = ANYOF_ALNUM; break;
4967 case 'W': namedclass = ANYOF_NALNUM; break;
4968 case 's': namedclass = ANYOF_SPACE; break;
4969 case 'S': namedclass = ANYOF_NSPACE; break;
4970 case 'd': namedclass = ANYOF_DIGIT; break;
4971 case 'D': namedclass = ANYOF_NDIGIT; break;
4976 if (RExC_parse >= RExC_end)
4977 vFAIL2("Empty \\%c{}", (U8)value);
4978 if (*RExC_parse == '{') {
4979 const U8 c = (U8)value;
4980 e = strchr(RExC_parse++, '}');
4982 vFAIL2("Missing right brace on \\%c{}", c);
4983 while (isSPACE(UCHARAT(RExC_parse)))
4985 if (e == RExC_parse)
4986 vFAIL2("Empty \\%c{}", c);
4988 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4996 if (UCHARAT(RExC_parse) == '^') {
4999 value = value == 'p' ? 'P' : 'p'; /* toggle */
5000 while (isSPACE(UCHARAT(RExC_parse))) {
5005 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5006 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5009 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5010 namedclass = ANYOF_MAX; /* no official name, but it's named */
5013 case 'n': value = '\n'; break;
5014 case 'r': value = '\r'; break;
5015 case 't': value = '\t'; break;
5016 case 'f': value = '\f'; break;
5017 case 'b': value = '\b'; break;
5018 case 'e': value = ASCII_TO_NATIVE('\033');break;
5019 case 'a': value = ASCII_TO_NATIVE('\007');break;
5021 if (*RExC_parse == '{') {
5022 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5023 | PERL_SCAN_DISALLOW_PREFIX;
5024 char * const e = strchr(RExC_parse++, '}');
5026 vFAIL("Missing right brace on \\x{}");
5028 numlen = e - RExC_parse;
5029 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5033 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5035 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5036 RExC_parse += numlen;
5040 value = UCHARAT(RExC_parse++);
5041 value = toCTRL(value);
5043 case '0': case '1': case '2': case '3': case '4':
5044 case '5': case '6': case '7': case '8': case '9':
5048 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5049 RExC_parse += numlen;
5053 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5055 "Unrecognized escape \\%c in character class passed through",
5059 } /* end of \blah */
5065 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5067 if (!SIZE_ONLY && !need_class)
5068 ANYOF_CLASS_ZERO(ret);
5072 /* a bad range like a-\d, a-[:digit:] ? */
5075 if (ckWARN(WARN_REGEXP)) {
5077 RExC_parse >= rangebegin ?
5078 RExC_parse - rangebegin : 0;
5080 "False [] range \"%*.*s\"",
5083 if (prevvalue < 256) {
5084 ANYOF_BITMAP_SET(ret, prevvalue);
5085 ANYOF_BITMAP_SET(ret, '-');
5088 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5089 Perl_sv_catpvf(aTHX_ listsv,
5090 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5094 range = 0; /* this was not a true range */
5098 const char *what = NULL;
5101 if (namedclass > OOB_NAMEDCLASS)
5102 optimize_invert = FALSE;
5103 /* Possible truncation here but in some 64-bit environments
5104 * the compiler gets heartburn about switch on 64-bit values.
5105 * A similar issue a little earlier when switching on value.
5107 switch ((I32)namedclass) {
5110 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5112 for (value = 0; value < 256; value++)
5114 ANYOF_BITMAP_SET(ret, value);
5121 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5123 for (value = 0; value < 256; value++)
5124 if (!isALNUM(value))
5125 ANYOF_BITMAP_SET(ret, value);
5132 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5134 for (value = 0; value < 256; value++)
5135 if (isALNUMC(value))
5136 ANYOF_BITMAP_SET(ret, value);
5143 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5145 for (value = 0; value < 256; value++)
5146 if (!isALNUMC(value))
5147 ANYOF_BITMAP_SET(ret, value);
5154 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5156 for (value = 0; value < 256; value++)
5158 ANYOF_BITMAP_SET(ret, value);
5165 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5167 for (value = 0; value < 256; value++)
5168 if (!isALPHA(value))
5169 ANYOF_BITMAP_SET(ret, value);
5176 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5179 for (value = 0; value < 128; value++)
5180 ANYOF_BITMAP_SET(ret, value);
5182 for (value = 0; value < 256; value++) {
5184 ANYOF_BITMAP_SET(ret, value);
5193 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5196 for (value = 128; value < 256; value++)
5197 ANYOF_BITMAP_SET(ret, value);
5199 for (value = 0; value < 256; value++) {
5200 if (!isASCII(value))
5201 ANYOF_BITMAP_SET(ret, value);
5210 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5212 for (value = 0; value < 256; value++)
5214 ANYOF_BITMAP_SET(ret, value);
5221 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5223 for (value = 0; value < 256; value++)
5224 if (!isBLANK(value))
5225 ANYOF_BITMAP_SET(ret, value);
5232 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5234 for (value = 0; value < 256; value++)
5236 ANYOF_BITMAP_SET(ret, value);
5243 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5245 for (value = 0; value < 256; value++)
5246 if (!isCNTRL(value))
5247 ANYOF_BITMAP_SET(ret, value);
5254 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5256 /* consecutive digits assumed */
5257 for (value = '0'; value <= '9'; value++)
5258 ANYOF_BITMAP_SET(ret, value);
5265 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5267 /* consecutive digits assumed */
5268 for (value = 0; value < '0'; value++)
5269 ANYOF_BITMAP_SET(ret, value);
5270 for (value = '9' + 1; value < 256; value++)
5271 ANYOF_BITMAP_SET(ret, value);
5278 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5280 for (value = 0; value < 256; value++)
5282 ANYOF_BITMAP_SET(ret, value);
5289 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5291 for (value = 0; value < 256; value++)
5292 if (!isGRAPH(value))
5293 ANYOF_BITMAP_SET(ret, value);
5300 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5302 for (value = 0; value < 256; value++)
5304 ANYOF_BITMAP_SET(ret, value);
5311 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5313 for (value = 0; value < 256; value++)
5314 if (!isLOWER(value))
5315 ANYOF_BITMAP_SET(ret, value);
5322 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5324 for (value = 0; value < 256; value++)
5326 ANYOF_BITMAP_SET(ret, value);
5333 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5335 for (value = 0; value < 256; value++)
5336 if (!isPRINT(value))
5337 ANYOF_BITMAP_SET(ret, value);
5344 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5346 for (value = 0; value < 256; value++)
5347 if (isPSXSPC(value))
5348 ANYOF_BITMAP_SET(ret, value);
5355 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5357 for (value = 0; value < 256; value++)
5358 if (!isPSXSPC(value))
5359 ANYOF_BITMAP_SET(ret, value);
5366 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5368 for (value = 0; value < 256; value++)
5370 ANYOF_BITMAP_SET(ret, value);
5377 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5379 for (value = 0; value < 256; value++)
5380 if (!isPUNCT(value))
5381 ANYOF_BITMAP_SET(ret, value);
5388 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5390 for (value = 0; value < 256; value++)
5392 ANYOF_BITMAP_SET(ret, value);
5399 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5401 for (value = 0; value < 256; value++)
5402 if (!isSPACE(value))
5403 ANYOF_BITMAP_SET(ret, value);
5410 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5412 for (value = 0; value < 256; value++)
5414 ANYOF_BITMAP_SET(ret, value);
5421 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5423 for (value = 0; value < 256; value++)
5424 if (!isUPPER(value))
5425 ANYOF_BITMAP_SET(ret, value);
5432 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5434 for (value = 0; value < 256; value++)
5435 if (isXDIGIT(value))
5436 ANYOF_BITMAP_SET(ret, value);
5443 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5445 for (value = 0; value < 256; value++)
5446 if (!isXDIGIT(value))
5447 ANYOF_BITMAP_SET(ret, value);
5453 /* this is to handle \p and \P */
5456 vFAIL("Invalid [::] class");
5460 /* Strings such as "+utf8::isWord\n" */
5461 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5464 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5467 } /* end of namedclass \blah */
5470 if (prevvalue > (IV)value) /* b-a */ {
5471 const int w = RExC_parse - rangebegin;
5472 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5473 range = 0; /* not a valid range */
5477 prevvalue = value; /* save the beginning of the range */
5478 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5479 RExC_parse[1] != ']') {
5482 /* a bad range like \w-, [:word:]- ? */
5483 if (namedclass > OOB_NAMEDCLASS) {
5484 if (ckWARN(WARN_REGEXP)) {
5486 RExC_parse >= rangebegin ?
5487 RExC_parse - rangebegin : 0;
5489 "False [] range \"%*.*s\"",
5493 ANYOF_BITMAP_SET(ret, '-');
5495 range = 1; /* yeah, it's a range! */
5496 continue; /* but do it the next time */
5500 /* now is the next time */
5501 stored += (value - prevvalue + 1);
5503 if (prevvalue < 256) {
5504 const IV ceilvalue = value < 256 ? value : 255;
5507 /* In EBCDIC [\x89-\x91] should include
5508 * the \x8e but [i-j] should not. */
5509 if (literal_endpoint == 2 &&
5510 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5511 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5513 if (isLOWER(prevvalue)) {
5514 for (i = prevvalue; i <= ceilvalue; i++)
5516 ANYOF_BITMAP_SET(ret, i);
5518 for (i = prevvalue; i <= ceilvalue; i++)
5520 ANYOF_BITMAP_SET(ret, i);
5525 for (i = prevvalue; i <= ceilvalue; i++)
5526 ANYOF_BITMAP_SET(ret, i);
5528 if (value > 255 || UTF) {
5529 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5530 const UV natvalue = NATIVE_TO_UNI(value);
5532 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5533 if (prevnatvalue < natvalue) { /* what about > ? */
5534 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5535 prevnatvalue, natvalue);
5537 else if (prevnatvalue == natvalue) {
5538 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5540 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5542 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5544 /* If folding and foldable and a single
5545 * character, insert also the folded version
5546 * to the charclass. */
5548 if (foldlen == (STRLEN)UNISKIP(f))
5549 Perl_sv_catpvf(aTHX_ listsv,
5552 /* Any multicharacter foldings
5553 * require the following transform:
5554 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5555 * where E folds into "pq" and F folds
5556 * into "rst", all other characters
5557 * fold to single characters. We save
5558 * away these multicharacter foldings,
5559 * to be later saved as part of the
5560 * additional "s" data. */
5563 if (!unicode_alternate)
5564 unicode_alternate = newAV();
5565 sv = newSVpvn((char*)foldbuf, foldlen);
5567 av_push(unicode_alternate, sv);
5571 /* If folding and the value is one of the Greek
5572 * sigmas insert a few more sigmas to make the
5573 * folding rules of the sigmas to work right.
5574 * Note that not all the possible combinations
5575 * are handled here: some of them are handled
5576 * by the standard folding rules, and some of
5577 * them (literal or EXACTF cases) are handled
5578 * during runtime in regexec.c:S_find_byclass(). */
5579 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5580 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5581 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5582 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5583 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5585 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5586 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5587 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5592 literal_endpoint = 0;
5596 range = 0; /* this range (if it was one) is done now */
5600 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5602 RExC_size += ANYOF_CLASS_ADD_SKIP;
5604 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5610 /****** !SIZE_ONLY AFTER HERE *********/
5612 if( stored == 1 && value < 256
5613 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5615 /* optimize single char class to an EXACT node
5616 but *only* when its not a UTF/high char */
5617 RExC_emit = orig_emit;
5618 ret = reg_node(pRExC_state,
5619 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5620 *STRING(ret)= (char)value;
5622 RExC_emit += STR_SZ(1);
5625 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5626 if ( /* If the only flag is folding (plus possibly inversion). */
5627 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5629 for (value = 0; value < 256; ++value) {
5630 if (ANYOF_BITMAP_TEST(ret, value)) {
5631 UV fold = PL_fold[value];
5634 ANYOF_BITMAP_SET(ret, fold);
5637 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5640 /* optimize inverted simple patterns (e.g. [^a-z]) */
5641 if (optimize_invert &&
5642 /* If the only flag is inversion. */
5643 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5644 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5645 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5646 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5649 AV * const av = newAV();
5651 /* The 0th element stores the character class description
5652 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5653 * to initialize the appropriate swash (which gets stored in
5654 * the 1st element), and also useful for dumping the regnode.
5655 * The 2nd element stores the multicharacter foldings,
5656 * used later (regexec.c:S_reginclass()). */
5657 av_store(av, 0, listsv);
5658 av_store(av, 1, NULL);
5659 av_store(av, 2, (SV*)unicode_alternate);
5660 rv = newRV_noinc((SV*)av);
5661 n = add_data(pRExC_state, 1, "s");
5662 RExC_rx->data->data[n] = (void*)rv;
5669 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5671 char* const retval = RExC_parse++;
5674 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5675 RExC_parse[2] == '#') {
5676 while (*RExC_parse != ')') {
5677 if (RExC_parse == RExC_end)
5678 FAIL("Sequence (?#... not terminated");
5684 if (RExC_flags & PMf_EXTENDED) {
5685 if (isSPACE(*RExC_parse)) {
5689 else if (*RExC_parse == '#') {
5690 while (RExC_parse < RExC_end)
5691 if (*RExC_parse++ == '\n') break;
5700 - reg_node - emit a node
5702 STATIC regnode * /* Location. */
5703 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5706 register regnode *ptr;
5707 regnode * const ret = RExC_emit;
5710 SIZE_ALIGN(RExC_size);
5714 NODE_ALIGN_FILL(ret);
5716 FILL_ADVANCE_NODE(ptr, op);
5717 if (RExC_offsets) { /* MJD */
5718 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5719 "reg_node", __LINE__,
5721 RExC_emit - RExC_emit_start > RExC_offsets[0]
5722 ? "Overwriting end of array!\n" : "OK",
5723 RExC_emit - RExC_emit_start,
5724 RExC_parse - RExC_start,
5726 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5735 - reganode - emit a node with an argument
5737 STATIC regnode * /* Location. */
5738 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5741 register regnode *ptr;
5742 regnode * const ret = RExC_emit;
5745 SIZE_ALIGN(RExC_size);
5750 NODE_ALIGN_FILL(ret);
5752 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5753 if (RExC_offsets) { /* MJD */
5754 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5758 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5759 "Overwriting end of array!\n" : "OK",
5760 RExC_emit - RExC_emit_start,
5761 RExC_parse - RExC_start,
5763 Set_Cur_Node_Offset;
5772 - reguni - emit (if appropriate) a Unicode character
5775 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
5778 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5782 - reginsert - insert an operator in front of already-emitted operand
5784 * Means relocating the operand.
5787 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5790 register regnode *src;
5791 register regnode *dst;
5792 register regnode *place;
5793 const int offset = regarglen[(U8)op];
5795 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5798 RExC_size += NODE_STEP_REGNODE + offset;
5803 RExC_emit += NODE_STEP_REGNODE + offset;
5805 while (src > opnd) {
5806 StructCopy(--src, --dst, regnode);
5807 if (RExC_offsets) { /* MJD 20010112 */
5808 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5812 dst - RExC_emit_start > RExC_offsets[0]
5813 ? "Overwriting end of array!\n" : "OK",
5814 src - RExC_emit_start,
5815 dst - RExC_emit_start,
5817 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5818 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5823 place = opnd; /* Op node, where operand used to be. */
5824 if (RExC_offsets) { /* MJD */
5825 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5829 place - RExC_emit_start > RExC_offsets[0]
5830 ? "Overwriting end of array!\n" : "OK",
5831 place - RExC_emit_start,
5832 RExC_parse - RExC_start,
5834 Set_Node_Offset(place, RExC_parse);
5835 Set_Node_Length(place, 1);
5837 src = NEXTOPER(place);
5838 FILL_ADVANCE_NODE(place, op);
5839 Zero(src, offset, regnode);
5843 - regtail - set the next-pointer at the end of a node chain of p to val.
5844 - SEE ALSO: regtail_study
5846 /* TODO: All three parms should be const */
5848 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
5851 register regnode *scan;
5852 GET_RE_DEBUG_FLAGS_DECL;
5857 /* Find last node. */
5860 regnode * const temp = regnext(scan);
5862 SV * const mysv=sv_newmortal();
5863 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
5864 regprop(RExC_rx, mysv, scan);
5865 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
5866 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
5873 if (reg_off_by_arg[OP(scan)]) {
5874 ARG_SET(scan, val - scan);
5877 NEXT_OFF(scan) = val - scan;
5882 - regtail_study - set the next-pointer at the end of a node chain of p to val.
5883 - Look for optimizable sequences at the same time.
5884 - currently only looks for EXACT chains.
5886 /* TODO: All four parms should be const */
5888 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
5891 register regnode *scan;
5893 GET_RE_DEBUG_FLAGS_DECL;
5898 /* Find last node. */
5902 regnode * const temp = regnext(scan);
5908 if( exact == PSEUDO )
5910 else if ( exact != OP(scan) )
5919 SV * const mysv=sv_newmortal();
5920 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
5921 regprop(RExC_rx, mysv, scan);
5922 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
5923 SvPV_nolen_const(mysv),
5925 REG_NODE_NUM(scan));
5932 if (reg_off_by_arg[OP(scan)]) {
5933 ARG_SET(scan, val - scan);
5936 NEXT_OFF(scan) = val - scan;
5943 - regcurly - a little FSA that accepts {\d+,?\d*}
5946 S_regcurly(register const char *s)
5965 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5968 Perl_regdump(pTHX_ const regexp *r)
5972 SV * const sv = sv_newmortal();
5974 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
5976 /* Header fields of interest. */
5977 if (r->anchored_substr)
5978 PerlIO_printf(Perl_debug_log,
5979 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5981 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5982 SvPVX_const(r->anchored_substr),
5984 SvTAIL(r->anchored_substr) ? "$" : "",
5985 (IV)r->anchored_offset);
5986 else if (r->anchored_utf8)
5987 PerlIO_printf(Perl_debug_log,
5988 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5990 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5991 SvPVX_const(r->anchored_utf8),
5993 SvTAIL(r->anchored_utf8) ? "$" : "",
5994 (IV)r->anchored_offset);
5995 if (r->float_substr)
5996 PerlIO_printf(Perl_debug_log,
5997 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5999 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6000 SvPVX_const(r->float_substr),
6002 SvTAIL(r->float_substr) ? "$" : "",
6003 (IV)r->float_min_offset, (UV)r->float_max_offset);
6004 else if (r->float_utf8)
6005 PerlIO_printf(Perl_debug_log,
6006 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6008 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6009 SvPVX_const(r->float_utf8),
6011 SvTAIL(r->float_utf8) ? "$" : "",
6012 (IV)r->float_min_offset, (UV)r->float_max_offset);
6013 if (r->check_substr || r->check_utf8)
6014 PerlIO_printf(Perl_debug_log,
6015 r->check_substr == r->float_substr
6016 && r->check_utf8 == r->float_utf8
6017 ? "(checking floating" : "(checking anchored");
6018 if (r->reganch & ROPT_NOSCAN)
6019 PerlIO_printf(Perl_debug_log, " noscan");
6020 if (r->reganch & ROPT_CHECK_ALL)
6021 PerlIO_printf(Perl_debug_log, " isall");
6022 if (r->check_substr || r->check_utf8)
6023 PerlIO_printf(Perl_debug_log, ") ");
6025 if (r->regstclass) {
6026 regprop(r, sv, r->regstclass);
6027 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6029 if (r->reganch & ROPT_ANCH) {
6030 PerlIO_printf(Perl_debug_log, "anchored");
6031 if (r->reganch & ROPT_ANCH_BOL)
6032 PerlIO_printf(Perl_debug_log, "(BOL)");
6033 if (r->reganch & ROPT_ANCH_MBOL)
6034 PerlIO_printf(Perl_debug_log, "(MBOL)");
6035 if (r->reganch & ROPT_ANCH_SBOL)
6036 PerlIO_printf(Perl_debug_log, "(SBOL)");
6037 if (r->reganch & ROPT_ANCH_GPOS)
6038 PerlIO_printf(Perl_debug_log, "(GPOS)");
6039 PerlIO_putc(Perl_debug_log, ' ');
6041 if (r->reganch & ROPT_GPOS_SEEN)
6042 PerlIO_printf(Perl_debug_log, "GPOS ");
6043 if (r->reganch & ROPT_SKIP)
6044 PerlIO_printf(Perl_debug_log, "plus ");
6045 if (r->reganch & ROPT_IMPLICIT)
6046 PerlIO_printf(Perl_debug_log, "implicit ");
6047 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6048 if (r->reganch & ROPT_EVAL_SEEN)
6049 PerlIO_printf(Perl_debug_log, "with eval ");
6050 PerlIO_printf(Perl_debug_log, "\n");
6052 const U32 len = r->offsets[0];
6053 GET_RE_DEBUG_FLAGS_DECL;
6056 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
6057 for (i = 1; i <= len; i++)
6058 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
6059 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
6060 PerlIO_printf(Perl_debug_log, "\n");
6064 PERL_UNUSED_CONTEXT;
6066 #endif /* DEBUGGING */
6070 - regprop - printable representation of opcode
6073 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6079 sv_setpvn(sv, "", 0);
6080 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6081 /* It would be nice to FAIL() here, but this may be called from
6082 regexec.c, and it would be hard to supply pRExC_state. */
6083 Perl_croak(aTHX_ "Corrupted regexp opcode");
6084 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6086 k = PL_regkind[OP(o)];
6089 SV * const dsv = sv_2mortal(newSVpvs(""));
6090 /* Using is_utf8_string() is a crude hack but it may
6091 * be the best for now since we have no flag "this EXACTish
6092 * node was UTF-8" --jhi */
6093 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6094 const char * const s = do_utf8 ?
6095 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6096 UNI_DISPLAY_REGEX) :
6098 const int len = do_utf8 ?
6101 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6105 } else if (k == TRIE) {
6106 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6107 /* print the details of the trie in dumpuntil instead, as
6108 * prog->data isn't available here */
6109 } else if (k == CURLY) {
6110 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6111 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6112 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6114 else if (k == WHILEM && o->flags) /* Ordinal/of */
6115 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6116 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6117 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6118 else if (k == LOGICAL)
6119 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6120 else if (k == ANYOF) {
6121 int i, rangestart = -1;
6122 const U8 flags = ANYOF_FLAGS(o);
6124 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6125 static const char * const anyofs[] = {
6158 if (flags & ANYOF_LOCALE)
6159 sv_catpvs(sv, "{loc}");
6160 if (flags & ANYOF_FOLD)
6161 sv_catpvs(sv, "{i}");
6162 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6163 if (flags & ANYOF_INVERT)
6165 for (i = 0; i <= 256; i++) {
6166 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6167 if (rangestart == -1)
6169 } else if (rangestart != -1) {
6170 if (i <= rangestart + 3)
6171 for (; rangestart < i; rangestart++)
6172 put_byte(sv, rangestart);
6174 put_byte(sv, rangestart);
6176 put_byte(sv, i - 1);
6182 if (o->flags & ANYOF_CLASS)
6183 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6184 if (ANYOF_CLASS_TEST(o,i))
6185 sv_catpv(sv, anyofs[i]);
6187 if (flags & ANYOF_UNICODE)
6188 sv_catpvs(sv, "{unicode}");
6189 else if (flags & ANYOF_UNICODE_ALL)
6190 sv_catpvs(sv, "{unicode_all}");
6194 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6198 U8 s[UTF8_MAXBYTES_CASE+1];
6200 for (i = 0; i <= 256; i++) { /* just the first 256 */
6201 uvchr_to_utf8(s, i);
6203 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6204 if (rangestart == -1)
6206 } else if (rangestart != -1) {
6207 if (i <= rangestart + 3)
6208 for (; rangestart < i; rangestart++) {
6209 const U8 * const e = uvchr_to_utf8(s,rangestart);
6211 for(p = s; p < e; p++)
6215 const U8 *e = uvchr_to_utf8(s,rangestart);
6217 for (p = s; p < e; p++)
6220 e = uvchr_to_utf8(s, i-1);
6221 for (p = s; p < e; p++)
6228 sv_catpvs(sv, "..."); /* et cetera */
6232 char *s = savesvpv(lv);
6233 char * const origs = s;
6235 while (*s && *s != '\n')
6239 const char * const t = ++s;
6257 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6259 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6260 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
6262 PERL_UNUSED_CONTEXT;
6263 PERL_UNUSED_ARG(sv);
6265 #endif /* DEBUGGING */
6269 Perl_re_intuit_string(pTHX_ regexp *prog)
6270 { /* Assume that RE_INTUIT is set */
6272 GET_RE_DEBUG_FLAGS_DECL;
6273 PERL_UNUSED_CONTEXT;
6277 const char * const s = SvPV_nolen_const(prog->check_substr
6278 ? prog->check_substr : prog->check_utf8);
6280 if (!PL_colorset) reginitcolors();
6281 PerlIO_printf(Perl_debug_log,
6282 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6284 prog->check_substr ? "" : "utf8 ",
6285 PL_colors[5],PL_colors[0],
6288 (strlen(s) > 60 ? "..." : ""));
6291 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6295 Perl_pregfree(pTHX_ struct regexp *r)
6299 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6301 GET_RE_DEBUG_FLAGS_DECL;
6303 if (!r || (--r->refcnt > 0))
6305 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6306 const char * const s = (r->reganch & ROPT_UTF8)
6307 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6308 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6309 const int len = SvCUR(dsv);
6312 PerlIO_printf(Perl_debug_log,
6313 "%sFreeing REx:%s %s%*.*s%s%s\n",
6314 PL_colors[4],PL_colors[5],PL_colors[0],
6317 len > 60 ? "..." : "");
6320 /* gcov results gave these as non-null 100% of the time, so there's no
6321 optimisation in checking them before calling Safefree */
6322 Safefree(r->precomp);
6323 Safefree(r->offsets); /* 20010421 MJD */
6324 RX_MATCH_COPY_FREE(r);
6325 #ifdef PERL_OLD_COPY_ON_WRITE
6327 SvREFCNT_dec(r->saved_copy);
6330 if (r->anchored_substr)
6331 SvREFCNT_dec(r->anchored_substr);
6332 if (r->anchored_utf8)
6333 SvREFCNT_dec(r->anchored_utf8);
6334 if (r->float_substr)
6335 SvREFCNT_dec(r->float_substr);
6337 SvREFCNT_dec(r->float_utf8);
6338 Safefree(r->substrs);
6341 int n = r->data->count;
6342 PAD* new_comppad = NULL;
6347 /* If you add a ->what type here, update the comment in regcomp.h */
6348 switch (r->data->what[n]) {
6350 SvREFCNT_dec((SV*)r->data->data[n]);
6353 Safefree(r->data->data[n]);
6356 new_comppad = (AV*)r->data->data[n];
6359 if (new_comppad == NULL)
6360 Perl_croak(aTHX_ "panic: pregfree comppad");
6361 PAD_SAVE_LOCAL(old_comppad,
6362 /* Watch out for global destruction's random ordering. */
6363 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6366 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6369 op_free((OP_4tree*)r->data->data[n]);
6371 PAD_RESTORE_LOCAL(old_comppad);
6372 SvREFCNT_dec((SV*)new_comppad);
6379 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6382 refcount = --trie->refcount;
6385 Safefree(trie->charmap);
6386 if (trie->widecharmap)
6387 SvREFCNT_dec((SV*)trie->widecharmap);
6388 Safefree(trie->states);
6389 Safefree(trie->trans);
6391 Safefree(trie->bitmap);
6394 SvREFCNT_dec((SV*)trie->words);
6395 if (trie->revcharmap)
6396 SvREFCNT_dec((SV*)trie->revcharmap);
6398 Safefree(r->data->data[n]); /* do this last!!!! */
6403 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6406 Safefree(r->data->what);
6409 Safefree(r->startp);
6414 #ifndef PERL_IN_XSUB_RE
6416 - regnext - dig the "next" pointer out of a node
6419 Perl_regnext(pTHX_ register regnode *p)
6422 register I32 offset;
6424 if (p == &PL_regdummy)
6427 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6436 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6439 STRLEN l1 = strlen(pat1);
6440 STRLEN l2 = strlen(pat2);
6443 const char *message;
6449 Copy(pat1, buf, l1 , char);
6450 Copy(pat2, buf + l1, l2 , char);
6451 buf[l1 + l2] = '\n';
6452 buf[l1 + l2 + 1] = '\0';
6454 /* ANSI variant takes additional second argument */
6455 va_start(args, pat2);
6459 msv = vmess(buf, &args);
6461 message = SvPV_const(msv,l1);
6464 Copy(message, buf, l1 , char);
6465 buf[l1-1] = '\0'; /* Overwrite \n */
6466 Perl_croak(aTHX_ "%s", buf);
6469 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6471 #ifndef PERL_IN_XSUB_RE
6473 Perl_save_re_context(pTHX)
6477 struct re_save_state *state;
6479 SAVEVPTR(PL_curcop);
6480 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6482 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6483 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6484 SSPUSHINT(SAVEt_RE_STATE);
6486 Copy(&PL_reg_state, state, 1, struct re_save_state);
6488 PL_reg_start_tmp = 0;
6489 PL_reg_start_tmpl = 0;
6490 PL_reg_oldsaved = NULL;
6491 PL_reg_oldsavedlen = 0;
6493 PL_reg_leftiter = 0;
6494 PL_reg_poscache = NULL;
6495 PL_reg_poscache_size = 0;
6496 #ifdef PERL_OLD_COPY_ON_WRITE
6500 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6502 const REGEXP * const rx = PM_GETRE(PL_curpm);
6505 for (i = 1; i <= rx->nparens; i++) {
6506 char digits[TYPE_CHARS(long)];
6507 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6508 GV *const *const gvp
6509 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6512 GV * const gv = *gvp;
6513 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6523 clear_re(pTHX_ void *r)
6526 ReREFCNT_dec((regexp *)r);
6532 S_put_byte(pTHX_ SV *sv, int c)
6534 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6535 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6536 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6537 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6539 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6543 #define CLEAR_OPTSTART \
6544 if (optstart) STMT_START { \
6545 PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart); \
6549 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6551 STATIC const regnode *
6552 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6553 const regnode *last, SV* sv, I32 l)
6556 register U8 op = EXACT; /* Arbitrary non-END op. */
6557 register const regnode *next;
6558 const regnode *optstart= NULL;
6559 GET_RE_DEBUG_FLAGS_DECL;
6561 while (op != END && (!last || node < last)) {
6562 /* While that wasn't END last time... */
6568 next = regnext((regnode *)node);
6570 if (OP(node) == OPTIMIZED) {
6571 if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_OPTIMISE))
6577 regprop(r, sv, node);
6578 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6579 (int)(2*l + 1), "", SvPVX_const(sv));
6581 if (OP(node) != OPTIMIZED) {
6582 if (next == NULL) /* Next ptr. */
6583 PerlIO_printf(Perl_debug_log, "(0)");
6585 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6586 (void)PerlIO_putc(Perl_debug_log, '\n');
6590 if (PL_regkind[(U8)op] == BRANCHJ) {
6591 register const regnode *nnode = (OP(next) == LONGJMP
6592 ? regnext((regnode *)next)
6594 if (last && nnode > last)
6596 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6598 else if (PL_regkind[(U8)op] == BRANCH) {
6599 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6601 else if ( PL_regkind[(U8)op] == TRIE ) {
6602 const I32 n = ARG(node);
6603 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6604 const I32 arry_len = av_len(trie->words)+1;
6606 PerlIO_printf(Perl_debug_log,
6607 "%*s[Start:%"UVuf" Words:%d Chars:%d Unique:%d States:%"IVdf" Minlen:%d Maxlen:%d",
6611 TRIE_WORDCOUNT(trie),
6612 (int)TRIE_CHARCOUNT(trie),
6613 trie->uniquecharcount,
6614 (IV)TRIE_LASTSTATE(trie)-1,
6615 trie->minlen, trie->maxlen
6620 sv_setpvn(sv, "", 0);
6621 for (i = 0; i <= 256; i++) {
6622 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6623 if (rangestart == -1)
6625 } else if (rangestart != -1) {
6626 if (i <= rangestart + 3)
6627 for (; rangestart < i; rangestart++)
6628 put_byte(sv, rangestart);
6630 put_byte(sv, rangestart);
6632 put_byte(sv, i - 1);
6637 PerlIO_printf(Perl_debug_log, " Start-Class:%s]\n", SvPVX_const(sv));
6639 PerlIO_printf(Perl_debug_log, " No Start-Class]\n");
6641 for (word_idx=0; word_idx < arry_len; word_idx++) {
6642 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6644 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6647 SvPV_nolen_const(*elem_ptr),
6652 PerlIO_printf(Perl_debug_log, "(0)\n");
6654 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6660 node = NEXTOPER(node);
6661 node += regarglen[(U8)op];
6664 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6665 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6666 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6668 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6669 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6672 else if ( op == PLUS || op == STAR) {
6673 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6675 else if (op == ANYOF) {
6676 /* arglen 1 + class block */
6677 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6678 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6679 node = NEXTOPER(node);
6681 else if (PL_regkind[(U8)op] == EXACT) {
6682 /* Literal string, where present. */
6683 node += NODE_SZ_STR(node) - 1;
6684 node = NEXTOPER(node);
6687 node = NEXTOPER(node);
6688 node += regarglen[(U8)op];
6690 if (op == CURLYX || op == OPEN)
6692 else if (op == WHILEM)
6699 #endif /* DEBUGGING */
6703 * c-indentation-style: bsd
6705 * indent-tabs-mode: t
6708 * ex: set ts=8 sts=4 sw=4 noet: