5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
760 #define TRIE_READ_CHAR STMT_START { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
785 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
801 #define TRIE_LIST_NEW(state) STMT_START { \
802 Newxz( trie->states[ state ].trans.list, \
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
808 #define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
812 trie->wordlen[ curword ] = wordlen; \
813 trie->states[ state ].wordnum = ++curword; \
815 /* store the word for dumping */ \
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
820 tmp = newSVpvn( "", 0 ); \
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
825 NOOP; /* It's a dupe. So ignore it. */ \
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
835 These routines dump out a trie in a somewhat readable format.
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
852 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
855 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV **tmp = av_fetch( trie->revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
909 PerlIO_printf( Perl_debug_log, "]");
912 PerlIO_printf( Perl_debug_log, "\n" );
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
923 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
932 for( state=1 ; state < next_alloc ; state ++ ) {
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
964 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
968 GET_RE_DEBUG_FLAGS_DECL;
971 print out the table precompression so that we can do a visual check
972 that they are identical.
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1013 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1023 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1025 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1045 U32 ucharcount = trie->uniquecharcount;
1046 U32 numstates = trie->laststate;
1047 U32 ubound = trie->lasttrans + ucharcount;
1051 U32 base = trie->states[ 1 ].trans.base;
1055 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1056 GET_RE_DEBUG_FLAGS_DECL;
1058 ARG_SET( stclass, data_slot );
1059 Newxz( aho, 1, reg_ac_data );
1060 RExC_rx->data->data[ data_slot ] = (void*)aho;
1062 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1063 (trie->laststate+1)*sizeof(reg_trie_state));
1064 Newxz( q, numstates, U32);
1065 Newxz( aho->fail, numstates, U32 );
1067 fail[ 0 ] = fail[ 1 ] = 1;
1069 for ( charid = 0; charid < ucharcount ; charid++ ) {
1070 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1073 q[ q_write ] = newstate;
1074 /* set to point at the root */
1075 fail[ q[ q_write++ ] ]=1;
1078 while ( q_read < q_write) {
1079 U32 cur = q[ q_read++ % numstates ];
1081 base = trie->states[ cur ].trans.base;
1083 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1084 if ( ( ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ) ) ) {
1085 U32 fail_state = cur;
1088 fail_state = fail[ fail_state ];
1089 fail_base = aho->states[ fail_state ].trans.base;
1090 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1092 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1093 fail[ ch_state ] = fail_state;
1094 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1096 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1098 q[ q_write++ % numstates] = ch_state;
1103 DEBUG_TRIE_COMPILE_MORE_r({
1104 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1105 for( q_read=2; q_read<numstates; q_read++ ) {
1106 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1108 PerlIO_printf(Perl_debug_log, "\n");
1111 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1117 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1120 /* first pass, loop through and scan words */
1121 reg_trie_data *trie;
1123 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1128 /* we just use folder as a flag in utf8 */
1129 const U8 * const folder = ( flags == EXACTF
1131 : ( flags == EXACTFL
1137 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1138 SV *re_trie_maxbuff;
1140 /* these are only used during construction but are useful during
1141 * debugging so we store them in the struct when debugging.
1142 * Wordcount is actually superfluous in debugging as we have
1143 * (AV*)trie->words to use for it, but that's not available when
1144 * not debugging... We could make the macro use the AV during
1145 * debugging though...
1147 U16 trie_wordcount=0;
1148 STRLEN trie_charcount=0;
1149 /*U32 trie_laststate=0;*/
1150 AV *trie_revcharmap;
1152 GET_RE_DEBUG_FLAGS_DECL;
1154 Newxz( trie, 1, reg_trie_data );
1156 trie->startstate = 1;
1157 RExC_rx->data->data[ data_slot ] = (void*)trie;
1158 Newxz( trie->charmap, 256, U16 );
1159 if (!(UTF && folder))
1160 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1162 trie->words = newAV();
1164 TRIE_REVCHARMAP(trie) = newAV();
1166 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1167 if (!SvIOK(re_trie_maxbuff)) {
1168 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1171 PerlIO_printf( Perl_debug_log,
1172 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1173 (int)depth * 2 + 2, "",
1174 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1175 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1177 /* -- First loop and Setup --
1179 We first traverse the branches and scan each word to determine if it
1180 contains widechars, and how many unique chars there are, this is
1181 important as we have to build a table with at least as many columns as we
1184 We use an array of integers to represent the character codes 0..255
1185 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1186 native representation of the character value as the key and IV's for the
1189 *TODO* If we keep track of how many times each character is used we can
1190 remap the columns so that the table compression later on is more
1191 efficient in terms of memory by ensuring most common value is in the
1192 middle and the least common are on the outside. IMO this would be better
1193 than a most to least common mapping as theres a decent chance the most
1194 common letter will share a node with the least common, meaning the node
1195 will not be compressable. With a middle is most common approach the worst
1196 case is when we have the least common nodes twice.
1200 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1201 regnode * const noper = NEXTOPER( cur );
1202 const U8 *uc = (U8*)STRING( noper );
1203 const U8 * const e = uc + STR_LEN( noper );
1205 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1206 const U8 *scan = (U8*)NULL;
1207 U32 wordlen = 0; /* required init */
1210 TRIE_WORDCOUNT(trie)++;
1211 if (OP(noper) == NOTHING) {
1216 TRIE_BITMAP_SET(trie,*uc);
1217 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1219 for ( ; uc < e ; uc += len ) {
1220 TRIE_CHARCOUNT(trie)++;
1224 if ( !trie->charmap[ uvc ] ) {
1225 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1227 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1232 if ( !trie->widecharmap )
1233 trie->widecharmap = newHV();
1235 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1238 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1240 if ( !SvTRUE( *svpp ) ) {
1241 sv_setiv( *svpp, ++trie->uniquecharcount );
1246 if( cur == first ) {
1249 } else if (chars < trie->minlen) {
1251 } else if (chars > trie->maxlen) {
1255 } /* end first pass */
1256 DEBUG_TRIE_COMPILE_r(
1257 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1258 (int)depth * 2 + 2,"",
1259 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1260 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, trie->minlen, trie->maxlen )
1262 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1265 We now know what we are dealing with in terms of unique chars and
1266 string sizes so we can calculate how much memory a naive
1267 representation using a flat table will take. If it's over a reasonable
1268 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1269 conservative but potentially much slower representation using an array
1272 At the end we convert both representations into the same compressed
1273 form that will be used in regexec.c for matching with. The latter
1274 is a form that cannot be used to construct with but has memory
1275 properties similar to the list form and access properties similar
1276 to the table form making it both suitable for fast searches and
1277 small enough that its feasable to store for the duration of a program.
1279 See the comment in the code where the compressed table is produced
1280 inplace from the flat tabe representation for an explanation of how
1281 the compression works.
1286 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1288 Second Pass -- Array Of Lists Representation
1290 Each state will be represented by a list of charid:state records
1291 (reg_trie_trans_le) the first such element holds the CUR and LEN
1292 points of the allocated array. (See defines above).
1294 We build the initial structure using the lists, and then convert
1295 it into the compressed table form which allows faster lookups
1296 (but cant be modified once converted).
1299 STRLEN transcount = 1;
1301 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1305 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1307 regnode * const noper = NEXTOPER( cur );
1308 U8 *uc = (U8*)STRING( noper );
1309 const U8 * const e = uc + STR_LEN( noper );
1310 U32 state = 1; /* required init */
1311 U16 charid = 0; /* sanity init */
1312 U8 *scan = (U8*)NULL; /* sanity init */
1313 STRLEN foldlen = 0; /* required init */
1314 U32 wordlen = 0; /* required init */
1315 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1317 if (OP(noper) != NOTHING) {
1318 for ( ; uc < e ; uc += len ) {
1323 charid = trie->charmap[ uvc ];
1325 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1329 charid=(U16)SvIV( *svpp );
1338 if ( !trie->states[ state ].trans.list ) {
1339 TRIE_LIST_NEW( state );
1341 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1342 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1343 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1348 newstate = next_alloc++;
1349 TRIE_LIST_PUSH( state, charid, newstate );
1354 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1356 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1359 TRIE_HANDLE_WORD(state);
1361 } /* end second pass */
1363 TRIE_LASTSTATE(trie) = next_alloc;
1364 Renew( trie->states, next_alloc, reg_trie_state );
1366 /* and now dump it out before we compress it */
1367 DEBUG_TRIE_COMPILE_MORE_r(
1368 dump_trie_interim_list(trie,next_alloc,depth+1)
1371 Newxz( trie->trans, transcount ,reg_trie_trans );
1378 for( state=1 ; state < next_alloc ; state ++ ) {
1382 DEBUG_TRIE_COMPILE_MORE_r(
1383 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1387 if (trie->states[state].trans.list) {
1388 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1392 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1393 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1394 if ( forid < minid ) {
1396 } else if ( forid > maxid ) {
1400 if ( transcount < tp + maxid - minid + 1) {
1402 Renew( trie->trans, transcount, reg_trie_trans );
1403 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1405 base = trie->uniquecharcount + tp - minid;
1406 if ( maxid == minid ) {
1408 for ( ; zp < tp ; zp++ ) {
1409 if ( ! trie->trans[ zp ].next ) {
1410 base = trie->uniquecharcount + zp - minid;
1411 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1412 trie->trans[ zp ].check = state;
1418 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ tp ].check = state;
1424 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1425 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1426 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1427 trie->trans[ tid ].check = state;
1429 tp += ( maxid - minid + 1 );
1431 Safefree(trie->states[ state ].trans.list);
1434 DEBUG_TRIE_COMPILE_MORE_r(
1435 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1438 trie->states[ state ].trans.base=base;
1440 trie->lasttrans = tp + 1;
1444 Second Pass -- Flat Table Representation.
1446 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1447 We know that we will need Charcount+1 trans at most to store the data
1448 (one row per char at worst case) So we preallocate both structures
1449 assuming worst case.
1451 We then construct the trie using only the .next slots of the entry
1454 We use the .check field of the first entry of the node temporarily to
1455 make compression both faster and easier by keeping track of how many non
1456 zero fields are in the node.
1458 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1461 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1462 number representing the first entry of the node, and state as a
1463 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1464 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1465 are 2 entrys per node. eg:
1473 The table is internally in the right hand, idx form. However as we also
1474 have to deal with the states array which is indexed by nodenum we have to
1475 use TRIE_NODENUM() to convert.
1480 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1482 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1483 next_alloc = trie->uniquecharcount + 1;
1486 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1488 regnode * const noper = NEXTOPER( cur );
1489 const U8 *uc = (U8*)STRING( noper );
1490 const U8 * const e = uc + STR_LEN( noper );
1492 U32 state = 1; /* required init */
1494 U16 charid = 0; /* sanity init */
1495 U32 accept_state = 0; /* sanity init */
1496 U8 *scan = (U8*)NULL; /* sanity init */
1498 STRLEN foldlen = 0; /* required init */
1499 U32 wordlen = 0; /* required init */
1500 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1502 if ( OP(noper) != NOTHING ) {
1503 for ( ; uc < e ; uc += len ) {
1508 charid = trie->charmap[ uvc ];
1510 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1511 charid = svpp ? (U16)SvIV(*svpp) : 0;
1515 if ( !trie->trans[ state + charid ].next ) {
1516 trie->trans[ state + charid ].next = next_alloc;
1517 trie->trans[ state ].check++;
1518 next_alloc += trie->uniquecharcount;
1520 state = trie->trans[ state + charid ].next;
1522 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1524 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1527 accept_state = TRIE_NODENUM( state );
1528 TRIE_HANDLE_WORD(accept_state);
1530 } /* end second pass */
1532 /* and now dump it out before we compress it */
1533 DEBUG_TRIE_COMPILE_MORE_r(
1534 dump_trie_interim_table(trie,next_alloc,depth+1)
1539 * Inplace compress the table.*
1541 For sparse data sets the table constructed by the trie algorithm will
1542 be mostly 0/FAIL transitions or to put it another way mostly empty.
1543 (Note that leaf nodes will not contain any transitions.)
1545 This algorithm compresses the tables by eliminating most such
1546 transitions, at the cost of a modest bit of extra work during lookup:
1548 - Each states[] entry contains a .base field which indicates the
1549 index in the state[] array wheres its transition data is stored.
1551 - If .base is 0 there are no valid transitions from that node.
1553 - If .base is nonzero then charid is added to it to find an entry in
1556 -If trans[states[state].base+charid].check!=state then the
1557 transition is taken to be a 0/Fail transition. Thus if there are fail
1558 transitions at the front of the node then the .base offset will point
1559 somewhere inside the previous nodes data (or maybe even into a node
1560 even earlier), but the .check field determines if the transition is
1563 The following process inplace converts the table to the compressed
1564 table: We first do not compress the root node 1,and mark its all its
1565 .check pointers as 1 and set its .base pointer as 1 as well. This
1566 allows to do a DFA construction from the compressed table later, and
1567 ensures that any .base pointers we calculate later are greater than
1570 - We set 'pos' to indicate the first entry of the second node.
1572 - We then iterate over the columns of the node, finding the first and
1573 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1574 and set the .check pointers accordingly, and advance pos
1575 appropriately and repreat for the next node. Note that when we copy
1576 the next pointers we have to convert them from the original
1577 NODEIDX form to NODENUM form as the former is not valid post
1580 - If a node has no transitions used we mark its base as 0 and do not
1581 advance the pos pointer.
1583 - If a node only has one transition we use a second pointer into the
1584 structure to fill in allocated fail transitions from other states.
1585 This pointer is independent of the main pointer and scans forward
1586 looking for null transitions that are allocated to a state. When it
1587 finds one it writes the single transition into the "hole". If the
1588 pointer doesnt find one the single transition is appeneded as normal.
1590 - Once compressed we can Renew/realloc the structures to release the
1593 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1594 specifically Fig 3.47 and the associated pseudocode.
1598 const U32 laststate = TRIE_NODENUM( next_alloc );
1601 TRIE_LASTSTATE(trie) = laststate;
1603 for ( state = 1 ; state < laststate ; state++ ) {
1605 const U32 stateidx = TRIE_NODEIDX( state );
1606 const U32 o_used = trie->trans[ stateidx ].check;
1607 U32 used = trie->trans[ stateidx ].check;
1608 trie->trans[ stateidx ].check = 0;
1610 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1611 if ( flag || trie->trans[ stateidx + charid ].next ) {
1612 if ( trie->trans[ stateidx + charid ].next ) {
1614 for ( ; zp < pos ; zp++ ) {
1615 if ( ! trie->trans[ zp ].next ) {
1619 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1620 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1621 trie->trans[ zp ].check = state;
1622 if ( ++zp > pos ) pos = zp;
1629 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1631 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1632 trie->trans[ pos ].check = state;
1637 trie->lasttrans = pos + 1;
1638 Renew( trie->states, laststate + 1, reg_trie_state);
1639 DEBUG_TRIE_COMPILE_MORE_r(
1640 PerlIO_printf( Perl_debug_log,
1641 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1642 (int)depth * 2 + 2,"",
1643 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1646 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1649 } /* end table compress */
1651 /* resize the trans array to remove unused space */
1652 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1654 /* and now dump out the compressed format */
1655 DEBUG_TRIE_COMPILE_r(
1656 dump_trie(trie,depth+1)
1659 { /* Modify the program and insert the new TRIE node*/
1661 U8 nodetype =(U8)(flags & 0xFF);
1668 This means we convert either the first branch or the first Exact,
1669 depending on whether the thing following (in 'last') is a branch
1670 or not and whther first is the startbranch (ie is it a sub part of
1671 the alternation or is it the whole thing.)
1672 Assuming its a sub part we conver the EXACT otherwise we convert
1673 the whole branch sequence, including the first.
1675 /* Find the node we are going to overwrite */
1676 if ( first == startbranch && OP( last ) != BRANCH ) {
1677 /* whole branch chain */
1680 const regnode *nop = NEXTOPER( convert );
1681 mjd_offset= Node_Offset((nop));
1682 mjd_nodelen= Node_Length((nop));
1685 /* branch sub-chain */
1686 convert = NEXTOPER( first );
1687 NEXT_OFF( first ) = (U16)(last - first);
1689 mjd_offset= Node_Offset((convert));
1690 mjd_nodelen= Node_Length((convert));
1694 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1695 (int)depth * 2 + 2, "",
1696 mjd_offset,mjd_nodelen)
1699 /* But first we check to see if there is a common prefix we can
1700 split out as an EXACT and put in front of the TRIE node. */
1701 trie->startstate= 1;
1702 if ( trie->bitmap && !trie->widecharmap ) {
1705 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1706 (int)depth * 2 + 2, "",
1707 TRIE_LASTSTATE(trie))
1709 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1713 const U32 base = trie->states[ state ].trans.base;
1715 if ( trie->states[state].wordnum )
1718 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1719 if ( ( base + ofs >= trie->uniquecharcount ) &&
1720 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1721 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1723 if ( ++count > 1 ) {
1724 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1725 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1726 if ( state == 1 ) break;
1728 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1730 PerlIO_printf(Perl_debug_log,
1731 "%*sNew Start State=%"UVuf" Class: [",
1732 (int)depth * 2 + 2, "",
1735 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1736 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1738 TRIE_BITMAP_SET(trie,*ch);
1740 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1742 PerlIO_printf(Perl_debug_log, (char*)ch)
1746 TRIE_BITMAP_SET(trie,*ch);
1748 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1755 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1756 const char *ch = SvPV_nolen_const( *tmp );
1758 PerlIO_printf( Perl_debug_log,
1759 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1760 (int)depth * 2 + 2, "",
1764 OP( convert ) = nodetype;
1765 str=STRING(convert);
1773 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1778 regnode *n = convert+NODE_SZ_STR(convert);
1779 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1780 trie->startstate = state;
1781 trie->minlen -= (state - 1);
1782 trie->maxlen -= (state - 1);
1784 regnode *fix = convert;
1786 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1787 while( ++fix < n ) {
1788 Set_Node_Offset_Length(fix, 0, 0);
1794 NEXT_OFF(convert) = (U16)(tail - convert);
1798 if ( trie->maxlen ) {
1799 OP( convert ) = TRIE;
1800 NEXT_OFF( convert ) = (U16)(tail - convert);
1801 ARG_SET( convert, data_slot );
1803 /* store the type in the flags */
1804 convert->flags = nodetype;
1805 /* XXX We really should free up the resource in trie now, as we wont use them */
1807 /* needed for dumping*/
1809 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1810 regnode *opt = convert;
1811 while (++opt<optimize) {
1812 Set_Node_Offset_Length(opt,0,0);
1814 /* We now need to mark all of the space originally used by the
1815 branches as optimized away. This keeps the dumpuntil from
1816 throwing a wobbly as it doesnt use regnext() to traverse the
1818 We also "fix" the offsets
1820 while( optimize < last ) {
1821 mjd_nodelen += Node_Length((optimize));
1822 OP( optimize ) = OPTIMIZED;
1823 Set_Node_Offset_Length(optimize,0,0);
1826 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1828 } /* end node insert */
1830 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1836 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1837 * These need to be revisited when a newer toolchain becomes available.
1839 #if defined(__sparc64__) && defined(__GNUC__)
1840 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1841 # undef SPARC64_GCC_WORKAROUND
1842 # define SPARC64_GCC_WORKAROUND 1
1846 #define DEBUG_PEEP(str,scan,depth) \
1847 DEBUG_OPTIMISE_r({ \
1848 SV * const mysv=sv_newmortal(); \
1849 regnode *Next = regnext(scan); \
1850 regprop(RExC_rx, mysv, scan); \
1851 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1852 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1853 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1856 #define JOIN_EXACT(scan,min,flags) \
1857 if (PL_regkind[OP(scan)] == EXACT) \
1858 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1861 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1862 /* Merge several consecutive EXACTish nodes into one. */
1863 regnode *n = regnext(scan);
1865 regnode *next = scan + NODE_SZ_STR(scan);
1869 regnode *stop = scan;
1871 GET_RE_DEBUG_FLAGS_DECL;
1872 DEBUG_PEEP("join",scan,depth);
1874 /* Skip NOTHING, merge EXACT*. */
1876 ( PL_regkind[OP(n)] == NOTHING ||
1877 (stringok && (OP(n) == OP(scan))))
1879 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1881 if (OP(n) == TAIL || n > next)
1883 if (PL_regkind[OP(n)] == NOTHING) {
1885 DEBUG_PEEP("skip:",n,depth);
1886 NEXT_OFF(scan) += NEXT_OFF(n);
1887 next = n + NODE_STEP_REGNODE;
1894 else if (stringok) {
1895 const int oldl = STR_LEN(scan);
1896 regnode * const nnext = regnext(n);
1898 DEBUG_PEEP("merg",n,depth);
1901 if (oldl + STR_LEN(n) > U8_MAX)
1903 NEXT_OFF(scan) += NEXT_OFF(n);
1904 STR_LEN(scan) += STR_LEN(n);
1905 next = n + NODE_SZ_STR(n);
1906 /* Now we can overwrite *n : */
1907 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1915 #ifdef EXPERIMENTAL_INPLACESCAN
1916 if (flags && !NEXT_OFF(n)) {
1917 DEBUG_PEEP("atch",val,depth);
1918 if (reg_off_by_arg[OP(n)]) {
1919 ARG_SET(n, val - n);
1922 NEXT_OFF(n) = val - n;
1929 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1931 Two problematic code points in Unicode casefolding of EXACT nodes:
1933 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1934 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1940 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1941 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1943 This means that in case-insensitive matching (or "loose matching",
1944 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1945 length of the above casefolded versions) can match a target string
1946 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1947 This would rather mess up the minimum length computation.
1949 What we'll do is to look for the tail four bytes, and then peek
1950 at the preceding two bytes to see whether we need to decrease
1951 the minimum length by four (six minus two).
1953 Thanks to the design of UTF-8, there cannot be false matches:
1954 A sequence of valid UTF-8 bytes cannot be a subsequence of
1955 another valid sequence of UTF-8 bytes.
1958 char * const s0 = STRING(scan), *s, *t;
1959 char * const s1 = s0 + STR_LEN(scan) - 1;
1960 char * const s2 = s1 - 4;
1961 const char t0[] = "\xcc\x88\xcc\x81";
1962 const char * const t1 = t0 + 3;
1965 s < s2 && (t = ninstr(s, s1, t0, t1));
1967 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1968 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1975 n = scan + NODE_SZ_STR(scan);
1977 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1984 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1988 /* REx optimizer. Converts nodes into quickier variants "in place".
1989 Finds fixed substrings. */
1991 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1992 to the position after last scanned or to NULL. */
1997 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1998 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1999 /* scanp: Start here (read-write). */
2000 /* deltap: Write maxlen-minlen here. */
2001 /* last: Stop before this one. */
2004 I32 min = 0, pars = 0, code;
2005 regnode *scan = *scanp, *next;
2007 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2008 int is_inf_internal = 0; /* The studied chunk is infinite */
2009 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2010 scan_data_t data_fake;
2011 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2012 SV *re_trie_maxbuff = NULL;
2014 GET_RE_DEBUG_FLAGS_DECL;
2016 while (scan && OP(scan) != END && scan < last) {
2017 /* Peephole optimizer: */
2018 DEBUG_PEEP("Peep",scan,depth);
2020 JOIN_EXACT(scan,&min,0);
2022 /* Follow the next-chain of the current node and optimize
2023 away all the NOTHINGs from it. */
2024 if (OP(scan) != CURLYX) {
2025 const int max = (reg_off_by_arg[OP(scan)]
2027 /* I32 may be smaller than U16 on CRAYs! */
2028 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2029 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2033 /* Skip NOTHING and LONGJMP. */
2034 while ((n = regnext(n))
2035 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2036 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2037 && off + noff < max)
2039 if (reg_off_by_arg[OP(scan)])
2042 NEXT_OFF(scan) = off;
2047 /* The principal pseudo-switch. Cannot be a switch, since we
2048 look into several different things. */
2049 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2050 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2051 next = regnext(scan);
2053 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2055 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2056 I32 max1 = 0, min1 = I32_MAX, num = 0;
2057 struct regnode_charclass_class accum;
2058 regnode * const startbranch=scan;
2060 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2061 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2062 if (flags & SCF_DO_STCLASS)
2063 cl_init_zero(pRExC_state, &accum);
2065 while (OP(scan) == code) {
2066 I32 deltanext, minnext, f = 0, fake;
2067 struct regnode_charclass_class this_class;
2070 data_fake.flags = 0;
2072 data_fake.whilem_c = data->whilem_c;
2073 data_fake.last_closep = data->last_closep;
2076 data_fake.last_closep = &fake;
2077 next = regnext(scan);
2078 scan = NEXTOPER(scan);
2080 scan = NEXTOPER(scan);
2081 if (flags & SCF_DO_STCLASS) {
2082 cl_init(pRExC_state, &this_class);
2083 data_fake.start_class = &this_class;
2084 f = SCF_DO_STCLASS_AND;
2086 if (flags & SCF_WHILEM_VISITED_POS)
2087 f |= SCF_WHILEM_VISITED_POS;
2089 /* we suppose the run is continuous, last=next...*/
2090 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2091 next, &data_fake, f,depth+1);
2094 if (max1 < minnext + deltanext)
2095 max1 = minnext + deltanext;
2096 if (deltanext == I32_MAX)
2097 is_inf = is_inf_internal = 1;
2099 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2102 if (data_fake.flags & SF_HAS_EVAL)
2103 data->flags |= SF_HAS_EVAL;
2104 data->whilem_c = data_fake.whilem_c;
2106 if (flags & SCF_DO_STCLASS)
2107 cl_or(pRExC_state, &accum, &this_class);
2108 if (code == SUSPEND)
2111 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2113 if (flags & SCF_DO_SUBSTR) {
2114 data->pos_min += min1;
2115 data->pos_delta += max1 - min1;
2116 if (max1 != min1 || is_inf)
2117 data->longest = &(data->longest_float);
2120 delta += max1 - min1;
2121 if (flags & SCF_DO_STCLASS_OR) {
2122 cl_or(pRExC_state, data->start_class, &accum);
2124 cl_and(data->start_class, &and_with);
2125 flags &= ~SCF_DO_STCLASS;
2128 else if (flags & SCF_DO_STCLASS_AND) {
2130 cl_and(data->start_class, &accum);
2131 flags &= ~SCF_DO_STCLASS;
2134 /* Switch to OR mode: cache the old value of
2135 * data->start_class */
2136 StructCopy(data->start_class, &and_with,
2137 struct regnode_charclass_class);
2138 flags &= ~SCF_DO_STCLASS_AND;
2139 StructCopy(&accum, data->start_class,
2140 struct regnode_charclass_class);
2141 flags |= SCF_DO_STCLASS_OR;
2142 data->start_class->flags |= ANYOF_EOS;
2148 Assuming this was/is a branch we are dealing with: 'scan' now
2149 points at the item that follows the branch sequence, whatever
2150 it is. We now start at the beginning of the sequence and look
2156 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2158 If we can find such a subseqence we need to turn the first
2159 element into a trie and then add the subsequent branch exact
2160 strings to the trie.
2164 1. patterns where the whole set of branch can be converted to a trie,
2166 2. patterns where only a subset of the alternations can be
2167 converted to a trie.
2169 In case 1 we can replace the whole set with a single regop
2170 for the trie. In case 2 we need to keep the start and end
2173 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2174 becomes BRANCH TRIE; BRANCH X;
2176 Hypthetically when we know the regex isnt anchored we can
2177 turn a case 1 into a DFA and let it rip... Every time it finds a match
2178 it would just call its tail, no WHILEM/CURLY needed.
2181 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2183 if (!re_trie_maxbuff) {
2184 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2185 if (!SvIOK(re_trie_maxbuff))
2186 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2188 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2190 regnode *first = (regnode *)NULL;
2191 regnode *last = (regnode *)NULL;
2192 regnode *tail = scan;
2197 SV * const mysv = sv_newmortal(); /* for dumping */
2199 /* var tail is used because there may be a TAIL
2200 regop in the way. Ie, the exacts will point to the
2201 thing following the TAIL, but the last branch will
2202 point at the TAIL. So we advance tail. If we
2203 have nested (?:) we may have to move through several
2207 while ( OP( tail ) == TAIL ) {
2208 /* this is the TAIL generated by (?:) */
2209 tail = regnext( tail );
2214 regprop(RExC_rx, mysv, tail );
2215 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2216 (int)depth * 2 + 2, "",
2217 "Looking for TRIE'able sequences. Tail node is: ",
2218 SvPV_nolen_const( mysv )
2224 step through the branches, cur represents each
2225 branch, noper is the first thing to be matched
2226 as part of that branch and noper_next is the
2227 regnext() of that node. if noper is an EXACT
2228 and noper_next is the same as scan (our current
2229 position in the regex) then the EXACT branch is
2230 a possible optimization target. Once we have
2231 two or more consequetive such branches we can
2232 create a trie of the EXACT's contents and stich
2233 it in place. If the sequence represents all of
2234 the branches we eliminate the whole thing and
2235 replace it with a single TRIE. If it is a
2236 subsequence then we need to stitch it in. This
2237 means the first branch has to remain, and needs
2238 to be repointed at the item on the branch chain
2239 following the last branch optimized. This could
2240 be either a BRANCH, in which case the
2241 subsequence is internal, or it could be the
2242 item following the branch sequence in which
2243 case the subsequence is at the end.
2247 /* dont use tail as the end marker for this traverse */
2248 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2249 regnode * const noper = NEXTOPER( cur );
2250 regnode * const noper_next = regnext( noper );
2253 regprop(RExC_rx, mysv, cur);
2254 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2255 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2257 regprop(RExC_rx, mysv, noper);
2258 PerlIO_printf( Perl_debug_log, " -> %s",
2259 SvPV_nolen_const(mysv));
2262 regprop(RExC_rx, mysv, noper_next );
2263 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2264 SvPV_nolen_const(mysv));
2266 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2267 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2269 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2270 : PL_regkind[ OP( noper ) ] == EXACT )
2271 || OP(noper) == NOTHING )
2272 && noper_next == tail && count<U16_MAX)
2275 if ( !first || optype == NOTHING ) {
2276 if (!first) first = cur;
2277 optype = OP( noper );
2283 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2285 if ( PL_regkind[ OP( noper ) ] == EXACT
2286 && noper_next == tail )
2290 optype = OP( noper );
2300 regprop(RExC_rx, mysv, cur);
2301 PerlIO_printf( Perl_debug_log,
2302 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2303 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2307 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2308 #ifdef TRIE_STUDY_OPT
2309 if ( made && startbranch == first ) {
2310 if ( OP(first)!=TRIE )
2311 flags |= SCF_EXACT_TRIE;
2313 regnode *chk=*scanp;
2314 while ( OP( chk ) == OPEN )
2315 chk = regnext( chk );
2317 flags |= SCF_EXACT_TRIE;
2326 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2327 scan = NEXTOPER(NEXTOPER(scan));
2328 } else /* single branch is optimized. */
2329 scan = NEXTOPER(scan);
2332 else if (OP(scan) == EXACT) {
2333 I32 l = STR_LEN(scan);
2336 const U8 * const s = (U8*)STRING(scan);
2337 l = utf8_length(s, s + l);
2338 uc = utf8_to_uvchr(s, NULL);
2340 uc = *((U8*)STRING(scan));
2343 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2344 /* The code below prefers earlier match for fixed
2345 offset, later match for variable offset. */
2346 if (data->last_end == -1) { /* Update the start info. */
2347 data->last_start_min = data->pos_min;
2348 data->last_start_max = is_inf
2349 ? I32_MAX : data->pos_min + data->pos_delta;
2351 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2353 SvUTF8_on(data->last_found);
2355 SV * const sv = data->last_found;
2356 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2357 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2358 if (mg && mg->mg_len >= 0)
2359 mg->mg_len += utf8_length((U8*)STRING(scan),
2360 (U8*)STRING(scan)+STR_LEN(scan));
2362 data->last_end = data->pos_min + l;
2363 data->pos_min += l; /* As in the first entry. */
2364 data->flags &= ~SF_BEFORE_EOL;
2366 if (flags & SCF_DO_STCLASS_AND) {
2367 /* Check whether it is compatible with what we know already! */
2371 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2372 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2373 && (!(data->start_class->flags & ANYOF_FOLD)
2374 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2377 ANYOF_CLASS_ZERO(data->start_class);
2378 ANYOF_BITMAP_ZERO(data->start_class);
2380 ANYOF_BITMAP_SET(data->start_class, uc);
2381 data->start_class->flags &= ~ANYOF_EOS;
2383 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2385 else if (flags & SCF_DO_STCLASS_OR) {
2386 /* false positive possible if the class is case-folded */
2388 ANYOF_BITMAP_SET(data->start_class, uc);
2390 data->start_class->flags |= ANYOF_UNICODE_ALL;
2391 data->start_class->flags &= ~ANYOF_EOS;
2392 cl_and(data->start_class, &and_with);
2394 flags &= ~SCF_DO_STCLASS;
2396 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2397 I32 l = STR_LEN(scan);
2398 UV uc = *((U8*)STRING(scan));
2400 /* Search for fixed substrings supports EXACT only. */
2401 if (flags & SCF_DO_SUBSTR) {
2403 scan_commit(pRExC_state, data);
2406 const U8 * const s = (U8 *)STRING(scan);
2407 l = utf8_length(s, s + l);
2408 uc = utf8_to_uvchr(s, NULL);
2411 if (flags & SCF_DO_SUBSTR)
2413 if (flags & SCF_DO_STCLASS_AND) {
2414 /* Check whether it is compatible with what we know already! */
2418 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2419 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2420 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2422 ANYOF_CLASS_ZERO(data->start_class);
2423 ANYOF_BITMAP_ZERO(data->start_class);
2425 ANYOF_BITMAP_SET(data->start_class, uc);
2426 data->start_class->flags &= ~ANYOF_EOS;
2427 data->start_class->flags |= ANYOF_FOLD;
2428 if (OP(scan) == EXACTFL)
2429 data->start_class->flags |= ANYOF_LOCALE;
2432 else if (flags & SCF_DO_STCLASS_OR) {
2433 if (data->start_class->flags & ANYOF_FOLD) {
2434 /* false positive possible if the class is case-folded.
2435 Assume that the locale settings are the same... */
2437 ANYOF_BITMAP_SET(data->start_class, uc);
2438 data->start_class->flags &= ~ANYOF_EOS;
2440 cl_and(data->start_class, &and_with);
2442 flags &= ~SCF_DO_STCLASS;
2444 #ifdef TRIE_STUDY_OPT
2445 else if (OP(scan) == TRIE) {
2446 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2447 min += trie->minlen;
2448 delta += (trie->maxlen - trie->minlen);
2449 flags &= ~SCF_DO_STCLASS; /* xxx */
2450 if (flags & SCF_DO_SUBSTR) {
2451 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2452 data->pos_min += trie->minlen;
2453 data->pos_delta += (trie->maxlen - trie->minlen);
2454 if (trie->maxlen != trie->minlen)
2455 data->longest = &(data->longest_float);
2459 else if (strchr((const char*)PL_varies,OP(scan))) {
2460 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2461 I32 f = flags, pos_before = 0;
2462 regnode * const oscan = scan;
2463 struct regnode_charclass_class this_class;
2464 struct regnode_charclass_class *oclass = NULL;
2465 I32 next_is_eval = 0;
2467 switch (PL_regkind[OP(scan)]) {
2468 case WHILEM: /* End of (?:...)* . */
2469 scan = NEXTOPER(scan);
2472 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2473 next = NEXTOPER(scan);
2474 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2476 maxcount = REG_INFTY;
2477 next = regnext(scan);
2478 scan = NEXTOPER(scan);
2482 if (flags & SCF_DO_SUBSTR)
2487 if (flags & SCF_DO_STCLASS) {
2489 maxcount = REG_INFTY;
2490 next = regnext(scan);
2491 scan = NEXTOPER(scan);
2494 is_inf = is_inf_internal = 1;
2495 scan = regnext(scan);
2496 if (flags & SCF_DO_SUBSTR) {
2497 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2498 data->longest = &(data->longest_float);
2500 goto optimize_curly_tail;
2502 mincount = ARG1(scan);
2503 maxcount = ARG2(scan);
2504 next = regnext(scan);
2505 if (OP(scan) == CURLYX) {
2506 I32 lp = (data ? *(data->last_closep) : 0);
2507 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2509 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2510 next_is_eval = (OP(scan) == EVAL);
2512 if (flags & SCF_DO_SUBSTR) {
2513 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2514 pos_before = data->pos_min;
2518 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2520 data->flags |= SF_IS_INF;
2522 if (flags & SCF_DO_STCLASS) {
2523 cl_init(pRExC_state, &this_class);
2524 oclass = data->start_class;
2525 data->start_class = &this_class;
2526 f |= SCF_DO_STCLASS_AND;
2527 f &= ~SCF_DO_STCLASS_OR;
2529 /* These are the cases when once a subexpression
2530 fails at a particular position, it cannot succeed
2531 even after backtracking at the enclosing scope.
2533 XXXX what if minimal match and we are at the
2534 initial run of {n,m}? */
2535 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2536 f &= ~SCF_WHILEM_VISITED_POS;
2538 /* This will finish on WHILEM, setting scan, or on NULL: */
2539 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2541 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2543 if (flags & SCF_DO_STCLASS)
2544 data->start_class = oclass;
2545 if (mincount == 0 || minnext == 0) {
2546 if (flags & SCF_DO_STCLASS_OR) {
2547 cl_or(pRExC_state, data->start_class, &this_class);
2549 else if (flags & SCF_DO_STCLASS_AND) {
2550 /* Switch to OR mode: cache the old value of
2551 * data->start_class */
2552 StructCopy(data->start_class, &and_with,
2553 struct regnode_charclass_class);
2554 flags &= ~SCF_DO_STCLASS_AND;
2555 StructCopy(&this_class, data->start_class,
2556 struct regnode_charclass_class);
2557 flags |= SCF_DO_STCLASS_OR;
2558 data->start_class->flags |= ANYOF_EOS;
2560 } else { /* Non-zero len */
2561 if (flags & SCF_DO_STCLASS_OR) {
2562 cl_or(pRExC_state, data->start_class, &this_class);
2563 cl_and(data->start_class, &and_with);
2565 else if (flags & SCF_DO_STCLASS_AND)
2566 cl_and(data->start_class, &this_class);
2567 flags &= ~SCF_DO_STCLASS;
2569 if (!scan) /* It was not CURLYX, but CURLY. */
2571 if ( /* ? quantifier ok, except for (?{ ... }) */
2572 (next_is_eval || !(mincount == 0 && maxcount == 1))
2573 && (minnext == 0) && (deltanext == 0)
2574 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2575 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2576 && ckWARN(WARN_REGEXP))
2579 "Quantifier unexpected on zero-length expression");
2582 min += minnext * mincount;
2583 is_inf_internal |= ((maxcount == REG_INFTY
2584 && (minnext + deltanext) > 0)
2585 || deltanext == I32_MAX);
2586 is_inf |= is_inf_internal;
2587 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2589 /* Try powerful optimization CURLYX => CURLYN. */
2590 if ( OP(oscan) == CURLYX && data
2591 && data->flags & SF_IN_PAR
2592 && !(data->flags & SF_HAS_EVAL)
2593 && !deltanext && minnext == 1 ) {
2594 /* Try to optimize to CURLYN. */
2595 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2596 regnode * const nxt1 = nxt;
2603 if (!strchr((const char*)PL_simple,OP(nxt))
2604 && !(PL_regkind[OP(nxt)] == EXACT
2605 && STR_LEN(nxt) == 1))
2611 if (OP(nxt) != CLOSE)
2613 /* Now we know that nxt2 is the only contents: */
2614 oscan->flags = (U8)ARG(nxt);
2616 OP(nxt1) = NOTHING; /* was OPEN. */
2618 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2619 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2620 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2621 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2622 OP(nxt + 1) = OPTIMIZED; /* was count. */
2623 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2628 /* Try optimization CURLYX => CURLYM. */
2629 if ( OP(oscan) == CURLYX && data
2630 && !(data->flags & SF_HAS_PAR)
2631 && !(data->flags & SF_HAS_EVAL)
2632 && !deltanext /* atom is fixed width */
2633 && minnext != 0 /* CURLYM can't handle zero width */
2635 /* XXXX How to optimize if data == 0? */
2636 /* Optimize to a simpler form. */
2637 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2641 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2642 && (OP(nxt2) != WHILEM))
2644 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2645 /* Need to optimize away parenths. */
2646 if (data->flags & SF_IN_PAR) {
2647 /* Set the parenth number. */
2648 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2650 if (OP(nxt) != CLOSE)
2651 FAIL("Panic opt close");
2652 oscan->flags = (U8)ARG(nxt);
2653 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2654 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2656 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2657 OP(nxt + 1) = OPTIMIZED; /* was count. */
2658 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2659 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2662 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2663 regnode *nnxt = regnext(nxt1);
2666 if (reg_off_by_arg[OP(nxt1)])
2667 ARG_SET(nxt1, nxt2 - nxt1);
2668 else if (nxt2 - nxt1 < U16_MAX)
2669 NEXT_OFF(nxt1) = nxt2 - nxt1;
2671 OP(nxt) = NOTHING; /* Cannot beautify */
2676 /* Optimize again: */
2677 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2683 else if ((OP(oscan) == CURLYX)
2684 && (flags & SCF_WHILEM_VISITED_POS)
2685 /* See the comment on a similar expression above.
2686 However, this time it not a subexpression
2687 we care about, but the expression itself. */
2688 && (maxcount == REG_INFTY)
2689 && data && ++data->whilem_c < 16) {
2690 /* This stays as CURLYX, we can put the count/of pair. */
2691 /* Find WHILEM (as in regexec.c) */
2692 regnode *nxt = oscan + NEXT_OFF(oscan);
2694 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2696 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2697 | (RExC_whilem_seen << 4)); /* On WHILEM */
2699 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2701 if (flags & SCF_DO_SUBSTR) {
2702 SV *last_str = NULL;
2703 int counted = mincount != 0;
2705 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2706 #if defined(SPARC64_GCC_WORKAROUND)
2709 const char *s = NULL;
2712 if (pos_before >= data->last_start_min)
2715 b = data->last_start_min;
2718 s = SvPV_const(data->last_found, l);
2719 old = b - data->last_start_min;
2722 I32 b = pos_before >= data->last_start_min
2723 ? pos_before : data->last_start_min;
2725 const char * const s = SvPV_const(data->last_found, l);
2726 I32 old = b - data->last_start_min;
2730 old = utf8_hop((U8*)s, old) - (U8*)s;
2733 /* Get the added string: */
2734 last_str = newSVpvn(s + old, l);
2736 SvUTF8_on(last_str);
2737 if (deltanext == 0 && pos_before == b) {
2738 /* What was added is a constant string */
2740 SvGROW(last_str, (mincount * l) + 1);
2741 repeatcpy(SvPVX(last_str) + l,
2742 SvPVX_const(last_str), l, mincount - 1);
2743 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2744 /* Add additional parts. */
2745 SvCUR_set(data->last_found,
2746 SvCUR(data->last_found) - l);
2747 sv_catsv(data->last_found, last_str);
2749 SV * sv = data->last_found;
2751 SvUTF8(sv) && SvMAGICAL(sv) ?
2752 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2753 if (mg && mg->mg_len >= 0)
2754 mg->mg_len += CHR_SVLEN(last_str);
2756 data->last_end += l * (mincount - 1);
2759 /* start offset must point into the last copy */
2760 data->last_start_min += minnext * (mincount - 1);
2761 data->last_start_max += is_inf ? I32_MAX
2762 : (maxcount - 1) * (minnext + data->pos_delta);
2765 /* It is counted once already... */
2766 data->pos_min += minnext * (mincount - counted);
2767 data->pos_delta += - counted * deltanext +
2768 (minnext + deltanext) * maxcount - minnext * mincount;
2769 if (mincount != maxcount) {
2770 /* Cannot extend fixed substrings found inside
2772 scan_commit(pRExC_state,data);
2773 if (mincount && last_str) {
2774 SV * const sv = data->last_found;
2775 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2776 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2780 sv_setsv(sv, last_str);
2781 data->last_end = data->pos_min;
2782 data->last_start_min =
2783 data->pos_min - CHR_SVLEN(last_str);
2784 data->last_start_max = is_inf
2786 : data->pos_min + data->pos_delta
2787 - CHR_SVLEN(last_str);
2789 data->longest = &(data->longest_float);
2791 SvREFCNT_dec(last_str);
2793 if (data && (fl & SF_HAS_EVAL))
2794 data->flags |= SF_HAS_EVAL;
2795 optimize_curly_tail:
2796 if (OP(oscan) != CURLYX) {
2797 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2799 NEXT_OFF(oscan) += NEXT_OFF(next);
2802 default: /* REF and CLUMP only? */
2803 if (flags & SCF_DO_SUBSTR) {
2804 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2805 data->longest = &(data->longest_float);
2807 is_inf = is_inf_internal = 1;
2808 if (flags & SCF_DO_STCLASS_OR)
2809 cl_anything(pRExC_state, data->start_class);
2810 flags &= ~SCF_DO_STCLASS;
2814 else if (strchr((const char*)PL_simple,OP(scan))) {
2817 if (flags & SCF_DO_SUBSTR) {
2818 scan_commit(pRExC_state,data);
2822 if (flags & SCF_DO_STCLASS) {
2823 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2825 /* Some of the logic below assumes that switching
2826 locale on will only add false positives. */
2827 switch (PL_regkind[OP(scan)]) {
2831 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2832 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2833 cl_anything(pRExC_state, data->start_class);
2836 if (OP(scan) == SANY)
2838 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2839 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2840 || (data->start_class->flags & ANYOF_CLASS));
2841 cl_anything(pRExC_state, data->start_class);
2843 if (flags & SCF_DO_STCLASS_AND || !value)
2844 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2847 if (flags & SCF_DO_STCLASS_AND)
2848 cl_and(data->start_class,
2849 (struct regnode_charclass_class*)scan);
2851 cl_or(pRExC_state, data->start_class,
2852 (struct regnode_charclass_class*)scan);
2855 if (flags & SCF_DO_STCLASS_AND) {
2856 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2857 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2858 for (value = 0; value < 256; value++)
2859 if (!isALNUM(value))
2860 ANYOF_BITMAP_CLEAR(data->start_class, value);
2864 if (data->start_class->flags & ANYOF_LOCALE)
2865 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2867 for (value = 0; value < 256; value++)
2869 ANYOF_BITMAP_SET(data->start_class, value);
2874 if (flags & SCF_DO_STCLASS_AND) {
2875 if (data->start_class->flags & ANYOF_LOCALE)
2876 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2879 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2880 data->start_class->flags |= ANYOF_LOCALE;
2884 if (flags & SCF_DO_STCLASS_AND) {
2885 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2886 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2887 for (value = 0; value < 256; value++)
2889 ANYOF_BITMAP_CLEAR(data->start_class, value);
2893 if (data->start_class->flags & ANYOF_LOCALE)
2894 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2896 for (value = 0; value < 256; value++)
2897 if (!isALNUM(value))
2898 ANYOF_BITMAP_SET(data->start_class, value);
2903 if (flags & SCF_DO_STCLASS_AND) {
2904 if (data->start_class->flags & ANYOF_LOCALE)
2905 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2908 data->start_class->flags |= ANYOF_LOCALE;
2909 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2913 if (flags & SCF_DO_STCLASS_AND) {
2914 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2915 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2916 for (value = 0; value < 256; value++)
2917 if (!isSPACE(value))
2918 ANYOF_BITMAP_CLEAR(data->start_class, value);
2922 if (data->start_class->flags & ANYOF_LOCALE)
2923 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2925 for (value = 0; value < 256; value++)
2927 ANYOF_BITMAP_SET(data->start_class, value);
2932 if (flags & SCF_DO_STCLASS_AND) {
2933 if (data->start_class->flags & ANYOF_LOCALE)
2934 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2937 data->start_class->flags |= ANYOF_LOCALE;
2938 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2942 if (flags & SCF_DO_STCLASS_AND) {
2943 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2944 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2945 for (value = 0; value < 256; value++)
2947 ANYOF_BITMAP_CLEAR(data->start_class, value);
2951 if (data->start_class->flags & ANYOF_LOCALE)
2952 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2954 for (value = 0; value < 256; value++)
2955 if (!isSPACE(value))
2956 ANYOF_BITMAP_SET(data->start_class, value);
2961 if (flags & SCF_DO_STCLASS_AND) {
2962 if (data->start_class->flags & ANYOF_LOCALE) {
2963 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2964 for (value = 0; value < 256; value++)
2965 if (!isSPACE(value))
2966 ANYOF_BITMAP_CLEAR(data->start_class, value);
2970 data->start_class->flags |= ANYOF_LOCALE;
2971 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2975 if (flags & SCF_DO_STCLASS_AND) {
2976 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2977 for (value = 0; value < 256; value++)
2978 if (!isDIGIT(value))
2979 ANYOF_BITMAP_CLEAR(data->start_class, value);
2982 if (data->start_class->flags & ANYOF_LOCALE)
2983 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2985 for (value = 0; value < 256; value++)
2987 ANYOF_BITMAP_SET(data->start_class, value);
2992 if (flags & SCF_DO_STCLASS_AND) {
2993 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2994 for (value = 0; value < 256; value++)
2996 ANYOF_BITMAP_CLEAR(data->start_class, value);
2999 if (data->start_class->flags & ANYOF_LOCALE)
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3002 for (value = 0; value < 256; value++)
3003 if (!isDIGIT(value))
3004 ANYOF_BITMAP_SET(data->start_class, value);
3009 if (flags & SCF_DO_STCLASS_OR)
3010 cl_and(data->start_class, &and_with);
3011 flags &= ~SCF_DO_STCLASS;
3014 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3015 data->flags |= (OP(scan) == MEOL
3019 else if ( PL_regkind[OP(scan)] == BRANCHJ
3020 /* Lookbehind, or need to calculate parens/evals/stclass: */
3021 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3022 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3023 /* Lookahead/lookbehind */
3024 I32 deltanext, minnext, fake = 0;
3026 struct regnode_charclass_class intrnl;
3029 data_fake.flags = 0;
3031 data_fake.whilem_c = data->whilem_c;
3032 data_fake.last_closep = data->last_closep;
3035 data_fake.last_closep = &fake;
3036 if ( flags & SCF_DO_STCLASS && !scan->flags
3037 && OP(scan) == IFMATCH ) { /* Lookahead */
3038 cl_init(pRExC_state, &intrnl);
3039 data_fake.start_class = &intrnl;
3040 f |= SCF_DO_STCLASS_AND;
3042 if (flags & SCF_WHILEM_VISITED_POS)
3043 f |= SCF_WHILEM_VISITED_POS;
3044 next = regnext(scan);
3045 nscan = NEXTOPER(NEXTOPER(scan));
3046 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3049 vFAIL("Variable length lookbehind not implemented");
3051 else if (minnext > U8_MAX) {
3052 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3054 scan->flags = (U8)minnext;
3056 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3058 if (data && (data_fake.flags & SF_HAS_EVAL))
3059 data->flags |= SF_HAS_EVAL;
3061 data->whilem_c = data_fake.whilem_c;
3062 if (f & SCF_DO_STCLASS_AND) {
3063 const int was = (data->start_class->flags & ANYOF_EOS);
3065 cl_and(data->start_class, &intrnl);
3067 data->start_class->flags |= ANYOF_EOS;
3070 else if (OP(scan) == OPEN) {
3073 else if (OP(scan) == CLOSE) {
3074 if ((I32)ARG(scan) == is_par) {
3075 next = regnext(scan);
3077 if ( next && (OP(next) != WHILEM) && next < last)
3078 is_par = 0; /* Disable optimization */
3081 *(data->last_closep) = ARG(scan);
3083 else if (OP(scan) == EVAL) {
3085 data->flags |= SF_HAS_EVAL;
3087 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3088 if (flags & SCF_DO_SUBSTR) {
3089 scan_commit(pRExC_state,data);
3090 data->longest = &(data->longest_float);
3092 is_inf = is_inf_internal = 1;
3093 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3094 cl_anything(pRExC_state, data->start_class);
3095 flags &= ~SCF_DO_STCLASS;
3097 /* Else: zero-length, ignore. */
3098 scan = regnext(scan);
3103 *deltap = is_inf_internal ? I32_MAX : delta;
3104 if (flags & SCF_DO_SUBSTR && is_inf)
3105 data->pos_delta = I32_MAX - data->pos_min;
3106 if (is_par > U8_MAX)
3108 if (is_par && pars==1 && data) {
3109 data->flags |= SF_IN_PAR;
3110 data->flags &= ~SF_HAS_PAR;
3112 else if (pars && data) {
3113 data->flags |= SF_HAS_PAR;
3114 data->flags &= ~SF_IN_PAR;
3116 if (flags & SCF_DO_STCLASS_OR)
3117 cl_and(data->start_class, &and_with);
3118 if (flags & SCF_EXACT_TRIE)
3119 data->flags |= SCF_EXACT_TRIE;
3124 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3126 if (RExC_rx->data) {
3127 Renewc(RExC_rx->data,
3128 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3129 char, struct reg_data);
3130 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3131 RExC_rx->data->count += n;
3134 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3135 char, struct reg_data);
3136 Newx(RExC_rx->data->what, n, U8);
3137 RExC_rx->data->count = n;
3139 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3140 return RExC_rx->data->count - n;
3143 #ifndef PERL_IN_XSUB_RE
3145 Perl_reginitcolors(pTHX)
3148 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3150 char *t = savepv(s);
3154 t = strchr(t, '\t');
3160 PL_colors[i] = t = (char *)"";
3165 PL_colors[i++] = (char *)"";
3173 - pregcomp - compile a regular expression into internal code
3175 * We can't allocate space until we know how big the compiled form will be,
3176 * but we can't compile it (and thus know how big it is) until we've got a
3177 * place to put the code. So we cheat: we compile it twice, once with code
3178 * generation turned off and size counting turned on, and once "for real".
3179 * This also means that we don't allocate space until we are sure that the
3180 * thing really will compile successfully, and we never have to move the
3181 * code and thus invalidate pointers into it. (Note that it has to be in
3182 * one piece because free() must be able to free it all.) [NB: not true in perl]
3184 * Beware that the optimization-preparation code in here knows about some
3185 * of the structure of the compiled regexp. [I'll say.]
3188 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3199 RExC_state_t RExC_state;
3200 RExC_state_t *pRExC_state = &RExC_state;
3201 #ifdef TRIE_STUDY_OPT
3203 RExC_state_t copyRExC_state;
3206 GET_RE_DEBUG_FLAGS_DECL;
3209 FAIL("NULL regexp argument");
3211 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3214 DEBUG_r(if (!PL_colorset) reginitcolors());
3216 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3217 PL_colors[4],PL_colors[5],PL_colors[0],
3218 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3220 RExC_flags = pm->op_pmflags;
3224 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3225 RExC_seen_evals = 0;
3228 /* First pass: determine size, legality. */
3235 RExC_emit = &PL_regdummy;
3236 RExC_whilem_seen = 0;
3237 #if 0 /* REGC() is (currently) a NOP at the first pass.
3238 * Clever compilers notice this and complain. --jhi */
3239 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3241 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3242 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3243 RExC_precomp = NULL;
3246 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3247 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3248 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3251 RExC_lastparse=NULL;
3255 /* Small enough for pointer-storage convention?
3256 If extralen==0, this means that we will not need long jumps. */
3257 if (RExC_size >= 0x10000L && RExC_extralen)
3258 RExC_size += RExC_extralen;
3261 if (RExC_whilem_seen > 15)
3262 RExC_whilem_seen = 15;
3264 /* Allocate space and initialize. */
3265 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3268 FAIL("Regexp out of space");
3271 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3272 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3275 r->prelen = xend - exp;
3276 r->precomp = savepvn(RExC_precomp, r->prelen);
3278 #ifdef PERL_OLD_COPY_ON_WRITE
3279 r->saved_copy = NULL;
3281 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3282 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3283 r->lastparen = 0; /* mg.c reads this. */
3285 r->substrs = 0; /* Useful during FAIL. */
3286 r->startp = 0; /* Useful during FAIL. */
3287 r->endp = 0; /* Useful during FAIL. */
3289 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3291 r->offsets[0] = RExC_size;
3293 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3294 "%s %"UVuf" bytes for offset annotations.\n",
3295 r->offsets ? "Got" : "Couldn't get",
3296 (UV)((2*RExC_size+1) * sizeof(U32))));
3300 /* Second pass: emit code. */
3301 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3306 RExC_emit_start = r->program;
3307 RExC_emit = r->program;
3308 /* Store the count of eval-groups for security checks: */
3309 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3310 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3312 if (reg(pRExC_state, 0, &flags,1) == NULL)
3314 /* XXXX To minimize changes to RE engine we always allocate
3315 3-units-long substrs field. */
3316 Newx(r->substrs, 1, struct reg_substr_data);
3319 Zero(r->substrs, 1, struct reg_substr_data);
3320 StructCopy(&zero_scan_data, &data, scan_data_t);
3322 #ifdef TRIE_STUDY_OPT
3324 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3325 RExC_state=copyRExC_state;
3326 if (data.longest_fixed)
3327 SvREFCNT_dec(data.longest_fixed);
3328 if (data.longest_float)
3329 SvREFCNT_dec(data.longest_float);
3330 if (data.last_found)
3331 SvREFCNT_dec(data.last_found);
3333 copyRExC_state=RExC_state;
3336 /* Dig out information for optimizations. */
3337 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3338 pm->op_pmflags = RExC_flags;
3340 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3341 r->regstclass = NULL;
3342 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3343 r->reganch |= ROPT_NAUGHTY;
3344 scan = r->program + 1; /* First BRANCH. */
3346 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3347 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3349 STRLEN longest_float_length, longest_fixed_length;
3350 struct regnode_charclass_class ch_class; /* pointed to by data */
3352 I32 last_close = 0; /* pointed to by data */
3355 /* Skip introductions and multiplicators >= 1. */
3356 while ((OP(first) == OPEN && (sawopen = 1)) ||
3357 /* An OR of *one* alternative - should not happen now. */
3358 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3359 /* for now we can't handle lookbehind IFMATCH*/
3360 (OP(first) == IFMATCH && !first->flags) ||
3361 (OP(first) == PLUS) ||
3362 (OP(first) == MINMOD) ||
3363 /* An {n,m} with n>0 */
3364 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3366 DEBUG_PEEP("first:",first,0);
3367 if (OP(first) == PLUS)
3370 first += regarglen[OP(first)];
3371 if (OP(first) == IFMATCH) {
3372 first = NEXTOPER(first);
3373 first += EXTRA_STEP_2ARGS;
3374 } else /*xxx possible optimisation for /(?=)/*/
3375 first = NEXTOPER(first);
3378 /* Starting-point info. */
3380 /* Ignore EXACT as we deal with it later. */
3381 if (PL_regkind[OP(first)] == EXACT) {
3382 if (OP(first) == EXACT)
3383 NOOP; /* Empty, get anchored substr later. */
3384 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3385 r->regstclass = first;
3388 else if (OP(first) == TRIE &&
3389 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3391 /* this can happen only on restudy */
3392 struct regnode_1 *trie_op;
3393 Newxz(trie_op,1,struct regnode_1);
3394 StructCopy(first,trie_op,struct regnode_1);
3395 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3396 r->regstclass = (regnode *)trie_op;
3399 else if (strchr((const char*)PL_simple,OP(first)))
3400 r->regstclass = first;
3401 else if (PL_regkind[OP(first)] == BOUND ||
3402 PL_regkind[OP(first)] == NBOUND)
3403 r->regstclass = first;
3404 else if (PL_regkind[OP(first)] == BOL) {
3405 r->reganch |= (OP(first) == MBOL
3407 : (OP(first) == SBOL
3410 first = NEXTOPER(first);
3413 else if (OP(first) == GPOS) {
3414 r->reganch |= ROPT_ANCH_GPOS;
3415 first = NEXTOPER(first);
3418 else if (!sawopen && (OP(first) == STAR &&
3419 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3420 !(r->reganch & ROPT_ANCH) )
3422 /* turn .* into ^.* with an implied $*=1 */
3424 (OP(NEXTOPER(first)) == REG_ANY)
3427 r->reganch |= type | ROPT_IMPLICIT;
3428 first = NEXTOPER(first);
3431 if (sawplus && (!sawopen || !RExC_sawback)
3432 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3433 /* x+ must match at the 1st pos of run of x's */
3434 r->reganch |= ROPT_SKIP;
3436 /* Scan is after the zeroth branch, first is atomic matcher. */
3437 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3438 (IV)(first - scan + 1)));
3440 * If there's something expensive in the r.e., find the
3441 * longest literal string that must appear and make it the
3442 * regmust. Resolve ties in favor of later strings, since
3443 * the regstart check works with the beginning of the r.e.
3444 * and avoiding duplication strengthens checking. Not a
3445 * strong reason, but sufficient in the absence of others.
3446 * [Now we resolve ties in favor of the earlier string if
3447 * it happens that c_offset_min has been invalidated, since the
3448 * earlier string may buy us something the later one won't.]
3452 data.longest_fixed = newSVpvs("");
3453 data.longest_float = newSVpvs("");
3454 data.last_found = newSVpvs("");
3455 data.longest = &(data.longest_fixed);
3457 if (!r->regstclass) {
3458 cl_init(pRExC_state, &ch_class);
3459 data.start_class = &ch_class;
3460 stclass_flag = SCF_DO_STCLASS_AND;
3461 } else /* XXXX Check for BOUND? */
3463 data.last_closep = &last_close;
3465 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3466 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3468 #ifdef TRIE_STUDY_OPT
3469 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3474 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3475 && data.last_start_min == 0 && data.last_end > 0
3476 && !RExC_seen_zerolen
3477 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3478 r->reganch |= ROPT_CHECK_ALL;
3479 scan_commit(pRExC_state, &data);
3480 SvREFCNT_dec(data.last_found);
3482 longest_float_length = CHR_SVLEN(data.longest_float);
3483 if (longest_float_length
3484 || (data.flags & SF_FL_BEFORE_EOL
3485 && (!(data.flags & SF_FL_BEFORE_MEOL)
3486 || (RExC_flags & PMf_MULTILINE)))) {
3489 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3490 && data.offset_fixed == data.offset_float_min
3491 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3492 goto remove_float; /* As in (a)+. */
3494 if (SvUTF8(data.longest_float)) {
3495 r->float_utf8 = data.longest_float;
3496 r->float_substr = NULL;
3498 r->float_substr = data.longest_float;
3499 r->float_utf8 = NULL;
3501 r->float_min_offset = data.offset_float_min;
3502 r->float_max_offset = data.offset_float_max;
3503 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3504 && (!(data.flags & SF_FL_BEFORE_MEOL)
3505 || (RExC_flags & PMf_MULTILINE)));
3506 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3510 r->float_substr = r->float_utf8 = NULL;
3511 SvREFCNT_dec(data.longest_float);
3512 longest_float_length = 0;
3515 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3516 if (longest_fixed_length
3517 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3518 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3519 || (RExC_flags & PMf_MULTILINE)))) {
3522 if (SvUTF8(data.longest_fixed)) {
3523 r->anchored_utf8 = data.longest_fixed;
3524 r->anchored_substr = NULL;
3526 r->anchored_substr = data.longest_fixed;
3527 r->anchored_utf8 = NULL;
3529 r->anchored_offset = data.offset_fixed;
3530 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3531 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3532 || (RExC_flags & PMf_MULTILINE)));
3533 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3536 r->anchored_substr = r->anchored_utf8 = NULL;
3537 SvREFCNT_dec(data.longest_fixed);
3538 longest_fixed_length = 0;
3541 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3542 r->regstclass = NULL;
3543 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3545 && !(data.start_class->flags & ANYOF_EOS)
3546 && !cl_is_anything(data.start_class))
3548 const I32 n = add_data(pRExC_state, 1, "f");
3550 Newx(RExC_rx->data->data[n], 1,
3551 struct regnode_charclass_class);
3552 StructCopy(data.start_class,
3553 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3554 struct regnode_charclass_class);
3555 r->regstclass = (regnode*)RExC_rx->data->data[n];
3556 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3557 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3558 regprop(r, sv, (regnode*)data.start_class);
3559 PerlIO_printf(Perl_debug_log,
3560 "synthetic stclass \"%s\".\n",
3561 SvPVX_const(sv));});
3564 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3565 if (longest_fixed_length > longest_float_length) {
3566 r->check_substr = r->anchored_substr;
3567 r->check_utf8 = r->anchored_utf8;
3568 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3569 if (r->reganch & ROPT_ANCH_SINGLE)
3570 r->reganch |= ROPT_NOSCAN;
3573 r->check_substr = r->float_substr;
3574 r->check_utf8 = r->float_utf8;
3575 r->check_offset_min = data.offset_float_min;
3576 r->check_offset_max = data.offset_float_max;
3578 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3579 This should be changed ASAP! */
3580 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3581 r->reganch |= RE_USE_INTUIT;
3582 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3583 r->reganch |= RE_INTUIT_TAIL;
3587 /* Several toplevels. Best we can is to set minlen. */
3589 struct regnode_charclass_class ch_class;
3592 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3594 scan = r->program + 1;
3595 cl_init(pRExC_state, &ch_class);
3596 data.start_class = &ch_class;
3597 data.last_closep = &last_close;
3599 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3600 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3602 #ifdef TRIE_STUDY_OPT
3603 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3608 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3609 = r->float_substr = r->float_utf8 = NULL;
3610 if (!(data.start_class->flags & ANYOF_EOS)
3611 && !cl_is_anything(data.start_class))
3613 const I32 n = add_data(pRExC_state, 1, "f");
3615 Newx(RExC_rx->data->data[n], 1,
3616 struct regnode_charclass_class);
3617 StructCopy(data.start_class,
3618 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3619 struct regnode_charclass_class);
3620 r->regstclass = (regnode*)RExC_rx->data->data[n];
3621 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3622 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3623 regprop(r, sv, (regnode*)data.start_class);
3624 PerlIO_printf(Perl_debug_log,
3625 "synthetic stclass \"%s\".\n",
3626 SvPVX_const(sv));});
3631 if (RExC_seen & REG_SEEN_GPOS)
3632 r->reganch |= ROPT_GPOS_SEEN;
3633 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3634 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3635 if (RExC_seen & REG_SEEN_EVAL)
3636 r->reganch |= ROPT_EVAL_SEEN;
3637 if (RExC_seen & REG_SEEN_CANY)
3638 r->reganch |= ROPT_CANY_SEEN;
3639 Newxz(r->startp, RExC_npar, I32);
3640 Newxz(r->endp, RExC_npar, I32);
3643 if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE))
3644 PerlIO_printf(Perl_debug_log,"Final program:\n");
3651 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3652 int rem=(int)(RExC_end - RExC_parse); \
3661 if (RExC_lastparse!=RExC_parse) \
3662 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3665 iscut ? "..." : "<" \
3668 PerlIO_printf(Perl_debug_log,"%16s",""); \
3673 num=REG_NODE_NUM(RExC_emit); \
3674 if (RExC_lastnum!=num) \
3675 PerlIO_printf(Perl_debug_log,"%4d",num); \
3677 PerlIO_printf(Perl_debug_log,"%4s",""); \
3678 PerlIO_printf(Perl_debug_log,"%*s%-4s", \
3679 (int)(10+(depth*2)), "", \
3683 RExC_lastparse=RExC_parse; \
3688 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3689 DEBUG_PARSE_MSG((funcname)); \
3690 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3693 - reg - regular expression, i.e. main body or parenthesized thing
3695 * Caller must absorb opening parenthesis.
3697 * Combining parenthesis handling with the base level of regular expression
3698 * is a trifle forced, but the need to tie the tails of the branches to what
3699 * follows makes it hard to avoid.
3701 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3703 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3705 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3709 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3710 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3713 register regnode *ret; /* Will be the head of the group. */
3714 register regnode *br;
3715 register regnode *lastbr;
3716 register regnode *ender = NULL;
3717 register I32 parno = 0;
3719 const I32 oregflags = RExC_flags;
3720 bool have_branch = 0;
3723 /* for (?g), (?gc), and (?o) warnings; warning
3724 about (?c) will warn about (?g) -- japhy */
3726 #define WASTED_O 0x01
3727 #define WASTED_G 0x02
3728 #define WASTED_C 0x04
3729 #define WASTED_GC (0x02|0x04)
3730 I32 wastedflags = 0x00;
3732 char * parse_start = RExC_parse; /* MJD */
3733 char * const oregcomp_parse = RExC_parse;
3735 GET_RE_DEBUG_FLAGS_DECL;
3736 DEBUG_PARSE("reg ");
3739 *flagp = 0; /* Tentatively. */
3742 /* Make an OPEN node, if parenthesized. */
3744 if (*RExC_parse == '?') { /* (?...) */
3745 U32 posflags = 0, negflags = 0;
3746 U32 *flagsp = &posflags;
3747 bool is_logical = 0;
3748 const char * const seqstart = RExC_parse;
3751 paren = *RExC_parse++;
3752 ret = NULL; /* For look-ahead/behind. */
3754 case '<': /* (?<...) */
3755 RExC_seen |= REG_SEEN_LOOKBEHIND;
3756 if (*RExC_parse == '!')
3758 if (*RExC_parse != '=' && *RExC_parse != '!')
3761 case '=': /* (?=...) */
3762 case '!': /* (?!...) */
3763 RExC_seen_zerolen++;
3764 case ':': /* (?:...) */
3765 case '>': /* (?>...) */
3767 case '$': /* (?$...) */
3768 case '@': /* (?@...) */
3769 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3771 case '#': /* (?#...) */
3772 while (*RExC_parse && *RExC_parse != ')')
3774 if (*RExC_parse != ')')
3775 FAIL("Sequence (?#... not terminated");
3776 nextchar(pRExC_state);
3779 case 'p': /* (?p...) */
3780 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3781 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3783 case '?': /* (??...) */
3785 if (*RExC_parse != '{')
3787 paren = *RExC_parse++;
3789 case '{': /* (?{...}) */
3791 I32 count = 1, n = 0;
3793 char *s = RExC_parse;
3795 RExC_seen_zerolen++;
3796 RExC_seen |= REG_SEEN_EVAL;
3797 while (count && (c = *RExC_parse)) {
3808 if (*RExC_parse != ')') {
3810 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3814 OP_4tree *sop, *rop;
3815 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3818 Perl_save_re_context(aTHX);
3819 rop = sv_compile_2op(sv, &sop, "re", &pad);
3820 sop->op_private |= OPpREFCOUNTED;
3821 /* re_dup will OpREFCNT_inc */
3822 OpREFCNT_set(sop, 1);
3825 n = add_data(pRExC_state, 3, "nop");
3826 RExC_rx->data->data[n] = (void*)rop;
3827 RExC_rx->data->data[n+1] = (void*)sop;
3828 RExC_rx->data->data[n+2] = (void*)pad;
3831 else { /* First pass */
3832 if (PL_reginterp_cnt < ++RExC_seen_evals
3834 /* No compiled RE interpolated, has runtime
3835 components ===> unsafe. */
3836 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3837 if (PL_tainting && PL_tainted)
3838 FAIL("Eval-group in insecure regular expression");
3839 #if PERL_VERSION > 8
3840 if (IN_PERL_COMPILETIME)
3845 nextchar(pRExC_state);
3847 ret = reg_node(pRExC_state, LOGICAL);
3850 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3851 /* deal with the length of this later - MJD */
3854 ret = reganode(pRExC_state, EVAL, n);
3855 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3856 Set_Node_Offset(ret, parse_start);
3859 case '(': /* (?(?{...})...) and (?(?=...)...) */
3861 if (RExC_parse[0] == '?') { /* (?(?...)) */
3862 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3863 || RExC_parse[1] == '<'
3864 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3867 ret = reg_node(pRExC_state, LOGICAL);
3870 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3874 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3877 parno = atoi(RExC_parse++);
3879 while (isDIGIT(*RExC_parse))
3881 ret = reganode(pRExC_state, GROUPP, parno);
3883 if ((c = *nextchar(pRExC_state)) != ')')
3884 vFAIL("Switch condition not recognized");
3886 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3887 br = regbranch(pRExC_state, &flags, 1,depth+1);
3889 br = reganode(pRExC_state, LONGJMP, 0);
3891 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3892 c = *nextchar(pRExC_state);
3896 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3897 regbranch(pRExC_state, &flags, 1,depth+1);
3898 REGTAIL(pRExC_state, ret, lastbr);
3901 c = *nextchar(pRExC_state);
3906 vFAIL("Switch (?(condition)... contains too many branches");
3907 ender = reg_node(pRExC_state, TAIL);
3908 REGTAIL(pRExC_state, br, ender);
3910 REGTAIL(pRExC_state, lastbr, ender);
3911 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3914 REGTAIL(pRExC_state, ret, ender);
3918 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3922 RExC_parse--; /* for vFAIL to print correctly */
3923 vFAIL("Sequence (? incomplete");
3927 parse_flags: /* (?i) */
3928 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3929 /* (?g), (?gc) and (?o) are useless here
3930 and must be globally applied -- japhy */
3932 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3933 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3934 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3935 if (! (wastedflags & wflagbit) ) {
3936 wastedflags |= wflagbit;
3939 "Useless (%s%c) - %suse /%c modifier",
3940 flagsp == &negflags ? "?-" : "?",
3942 flagsp == &negflags ? "don't " : "",
3948 else if (*RExC_parse == 'c') {
3949 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3950 if (! (wastedflags & WASTED_C) ) {
3951 wastedflags |= WASTED_GC;
3954 "Useless (%sc) - %suse /gc modifier",
3955 flagsp == &negflags ? "?-" : "?",
3956 flagsp == &negflags ? "don't " : ""
3961 else { pmflag(flagsp, *RExC_parse); }
3965 if (*RExC_parse == '-') {
3967 wastedflags = 0; /* reset so (?g-c) warns twice */
3971 RExC_flags |= posflags;
3972 RExC_flags &= ~negflags;
3973 if (*RExC_parse == ':') {
3979 if (*RExC_parse != ')') {
3981 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3983 nextchar(pRExC_state);
3991 ret = reganode(pRExC_state, OPEN, parno);
3992 Set_Node_Length(ret, 1); /* MJD */
3993 Set_Node_Offset(ret, RExC_parse); /* MJD */
4000 /* Pick up the branches, linking them together. */
4001 parse_start = RExC_parse; /* MJD */
4002 br = regbranch(pRExC_state, &flags, 1,depth+1);
4003 /* branch_len = (paren != 0); */
4007 if (*RExC_parse == '|') {
4008 if (!SIZE_ONLY && RExC_extralen) {
4009 reginsert(pRExC_state, BRANCHJ, br);
4012 reginsert(pRExC_state, BRANCH, br);
4013 Set_Node_Length(br, paren != 0);
4014 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4018 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4020 else if (paren == ':') {
4021 *flagp |= flags&SIMPLE;
4023 if (is_open) { /* Starts with OPEN. */
4024 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4026 else if (paren != '?') /* Not Conditional */
4028 *flagp |= flags & (SPSTART | HASWIDTH);
4030 while (*RExC_parse == '|') {
4031 if (!SIZE_ONLY && RExC_extralen) {
4032 ender = reganode(pRExC_state, LONGJMP,0);
4033 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4036 RExC_extralen += 2; /* Account for LONGJMP. */
4037 nextchar(pRExC_state);
4038 br = regbranch(pRExC_state, &flags, 0, depth+1);
4042 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4046 *flagp |= flags&SPSTART;
4049 if (have_branch || paren != ':') {
4050 /* Make a closing node, and hook it on the end. */
4053 ender = reg_node(pRExC_state, TAIL);
4056 ender = reganode(pRExC_state, CLOSE, parno);
4057 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4058 Set_Node_Length(ender,1); /* MJD */
4064 *flagp &= ~HASWIDTH;
4067 ender = reg_node(pRExC_state, SUCCEED);
4070 ender = reg_node(pRExC_state, END);
4073 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4075 if (have_branch && !SIZE_ONLY) {
4076 /* Hook the tails of the branches to the closing node. */
4077 for (br = ret; br; br = regnext(br)) {
4078 const U8 op = PL_regkind[OP(br)];
4080 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4082 else if (op == BRANCHJ) {
4083 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4091 static const char parens[] = "=!<,>";
4093 if (paren && (p = strchr(parens, paren))) {
4094 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4095 int flag = (p - parens) > 1;
4098 node = SUSPEND, flag = 0;
4099 reginsert(pRExC_state, node,ret);
4100 Set_Node_Cur_Length(ret);
4101 Set_Node_Offset(ret, parse_start + 1);
4103 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4107 /* Check for proper termination. */
4109 RExC_flags = oregflags;
4110 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4111 RExC_parse = oregcomp_parse;
4112 vFAIL("Unmatched (");
4115 else if (!paren && RExC_parse < RExC_end) {
4116 if (*RExC_parse == ')') {
4118 vFAIL("Unmatched )");
4121 FAIL("Junk on end of regexp"); /* "Can't happen". */
4129 - regbranch - one alternative of an | operator
4131 * Implements the concatenation operator.
4134 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4137 register regnode *ret;
4138 register regnode *chain = NULL;
4139 register regnode *latest;
4140 I32 flags = 0, c = 0;
4141 GET_RE_DEBUG_FLAGS_DECL;
4142 DEBUG_PARSE("brnc");
4146 if (!SIZE_ONLY && RExC_extralen)
4147 ret = reganode(pRExC_state, BRANCHJ,0);
4149 ret = reg_node(pRExC_state, BRANCH);
4150 Set_Node_Length(ret, 1);
4154 if (!first && SIZE_ONLY)
4155 RExC_extralen += 1; /* BRANCHJ */
4157 *flagp = WORST; /* Tentatively. */
4160 nextchar(pRExC_state);
4161 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4163 latest = regpiece(pRExC_state, &flags,depth+1);
4164 if (latest == NULL) {
4165 if (flags & TRYAGAIN)
4169 else if (ret == NULL)
4171 *flagp |= flags&HASWIDTH;
4172 if (chain == NULL) /* First piece. */
4173 *flagp |= flags&SPSTART;
4176 REGTAIL(pRExC_state, chain, latest);
4181 if (chain == NULL) { /* Loop ran zero times. */
4182 chain = reg_node(pRExC_state, NOTHING);
4187 *flagp |= flags&SIMPLE;
4194 - regpiece - something followed by possible [*+?]
4196 * Note that the branching code sequences used for ? and the general cases
4197 * of * and + are somewhat optimized: they use the same NOTHING node as
4198 * both the endmarker for their branch list and the body of the last branch.
4199 * It might seem that this node could be dispensed with entirely, but the
4200 * endmarker role is not redundant.
4203 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4206 register regnode *ret;
4208 register char *next;
4210 const char * const origparse = RExC_parse;
4212 I32 max = REG_INFTY;
4214 GET_RE_DEBUG_FLAGS_DECL;
4215 DEBUG_PARSE("piec");
4217 ret = regatom(pRExC_state, &flags,depth+1);
4219 if (flags & TRYAGAIN)
4226 if (op == '{' && regcurly(RExC_parse)) {
4227 const char *maxpos = NULL;
4228 parse_start = RExC_parse; /* MJD */
4229 next = RExC_parse + 1;
4230 while (isDIGIT(*next) || *next == ',') {
4239 if (*next == '}') { /* got one */
4243 min = atoi(RExC_parse);
4247 maxpos = RExC_parse;
4249 if (!max && *maxpos != '0')
4250 max = REG_INFTY; /* meaning "infinity" */
4251 else if (max >= REG_INFTY)
4252 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4254 nextchar(pRExC_state);
4257 if ((flags&SIMPLE)) {
4258 RExC_naughty += 2 + RExC_naughty / 2;
4259 reginsert(pRExC_state, CURLY, ret);
4260 Set_Node_Offset(ret, parse_start+1); /* MJD */
4261 Set_Node_Cur_Length(ret);
4264 regnode * const w = reg_node(pRExC_state, WHILEM);
4267 REGTAIL(pRExC_state, ret, w);
4268 if (!SIZE_ONLY && RExC_extralen) {
4269 reginsert(pRExC_state, LONGJMP,ret);
4270 reginsert(pRExC_state, NOTHING,ret);
4271 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4273 reginsert(pRExC_state, CURLYX,ret);
4275 Set_Node_Offset(ret, parse_start+1);
4276 Set_Node_Length(ret,
4277 op == '{' ? (RExC_parse - parse_start) : 1);
4279 if (!SIZE_ONLY && RExC_extralen)
4280 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4281 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4283 RExC_whilem_seen++, RExC_extralen += 3;
4284 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4292 if (max && max < min)
4293 vFAIL("Can't do {n,m} with n > m");
4295 ARG1_SET(ret, (U16)min);
4296 ARG2_SET(ret, (U16)max);
4308 #if 0 /* Now runtime fix should be reliable. */
4310 /* if this is reinstated, don't forget to put this back into perldiag:
4312 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4314 (F) The part of the regexp subject to either the * or + quantifier
4315 could match an empty string. The {#} shows in the regular
4316 expression about where the problem was discovered.
4320 if (!(flags&HASWIDTH) && op != '?')
4321 vFAIL("Regexp *+ operand could be empty");
4324 parse_start = RExC_parse;
4325 nextchar(pRExC_state);
4327 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4329 if (op == '*' && (flags&SIMPLE)) {
4330 reginsert(pRExC_state, STAR, ret);
4334 else if (op == '*') {
4338 else if (op == '+' && (flags&SIMPLE)) {
4339 reginsert(pRExC_state, PLUS, ret);
4343 else if (op == '+') {
4347 else if (op == '?') {
4352 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4354 "%.*s matches null string many times",
4355 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4359 if (*RExC_parse == '?') {
4360 nextchar(pRExC_state);
4361 reginsert(pRExC_state, MINMOD, ret);
4362 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4364 if (ISMULT2(RExC_parse)) {
4366 vFAIL("Nested quantifiers");
4373 - regatom - the lowest level
4375 * Optimization: gobbles an entire sequence of ordinary characters so that
4376 * it can turn them into a single node, which is smaller to store and
4377 * faster to run. Backslashed characters are exceptions, each becoming a
4378 * separate node; the code is simpler that way and it's not worth fixing.
4380 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4381 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4384 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4387 register regnode *ret = NULL;
4389 char *parse_start = RExC_parse;
4390 GET_RE_DEBUG_FLAGS_DECL;
4391 DEBUG_PARSE("atom");
4392 *flagp = WORST; /* Tentatively. */
4395 switch (*RExC_parse) {
4397 RExC_seen_zerolen++;
4398 nextchar(pRExC_state);
4399 if (RExC_flags & PMf_MULTILINE)
4400 ret = reg_node(pRExC_state, MBOL);
4401 else if (RExC_flags & PMf_SINGLELINE)
4402 ret = reg_node(pRExC_state, SBOL);
4404 ret = reg_node(pRExC_state, BOL);
4405 Set_Node_Length(ret, 1); /* MJD */
4408 nextchar(pRExC_state);
4410 RExC_seen_zerolen++;
4411 if (RExC_flags & PMf_MULTILINE)
4412 ret = reg_node(pRExC_state, MEOL);
4413 else if (RExC_flags & PMf_SINGLELINE)
4414 ret = reg_node(pRExC_state, SEOL);
4416 ret = reg_node(pRExC_state, EOL);
4417 Set_Node_Length(ret, 1); /* MJD */
4420 nextchar(pRExC_state);
4421 if (RExC_flags & PMf_SINGLELINE)
4422 ret = reg_node(pRExC_state, SANY);
4424 ret = reg_node(pRExC_state, REG_ANY);
4425 *flagp |= HASWIDTH|SIMPLE;
4427 Set_Node_Length(ret, 1); /* MJD */
4431 char * const oregcomp_parse = ++RExC_parse;
4432 ret = regclass(pRExC_state,depth+1);
4433 if (*RExC_parse != ']') {
4434 RExC_parse = oregcomp_parse;
4435 vFAIL("Unmatched [");
4437 nextchar(pRExC_state);
4438 *flagp |= HASWIDTH|SIMPLE;
4439 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4443 nextchar(pRExC_state);
4444 ret = reg(pRExC_state, 1, &flags,depth+1);
4446 if (flags & TRYAGAIN) {
4447 if (RExC_parse == RExC_end) {
4448 /* Make parent create an empty node if needed. */
4456 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4460 if (flags & TRYAGAIN) {
4464 vFAIL("Internal urp");
4465 /* Supposed to be caught earlier. */
4468 if (!regcurly(RExC_parse)) {
4477 vFAIL("Quantifier follows nothing");
4480 switch (*++RExC_parse) {
4482 RExC_seen_zerolen++;
4483 ret = reg_node(pRExC_state, SBOL);
4485 nextchar(pRExC_state);
4486 Set_Node_Length(ret, 2); /* MJD */
4489 ret = reg_node(pRExC_state, GPOS);
4490 RExC_seen |= REG_SEEN_GPOS;
4492 nextchar(pRExC_state);
4493 Set_Node_Length(ret, 2); /* MJD */
4496 ret = reg_node(pRExC_state, SEOL);
4498 RExC_seen_zerolen++; /* Do not optimize RE away */
4499 nextchar(pRExC_state);
4502 ret = reg_node(pRExC_state, EOS);
4504 RExC_seen_zerolen++; /* Do not optimize RE away */
4505 nextchar(pRExC_state);
4506 Set_Node_Length(ret, 2); /* MJD */
4509 ret = reg_node(pRExC_state, CANY);
4510 RExC_seen |= REG_SEEN_CANY;
4511 *flagp |= HASWIDTH|SIMPLE;
4512 nextchar(pRExC_state);
4513 Set_Node_Length(ret, 2); /* MJD */
4516 ret = reg_node(pRExC_state, CLUMP);
4518 nextchar(pRExC_state);
4519 Set_Node_Length(ret, 2); /* MJD */
4522 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4523 *flagp |= HASWIDTH|SIMPLE;
4524 nextchar(pRExC_state);
4525 Set_Node_Length(ret, 2); /* MJD */
4528 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4529 *flagp |= HASWIDTH|SIMPLE;
4530 nextchar(pRExC_state);
4531 Set_Node_Length(ret, 2); /* MJD */
4534 RExC_seen_zerolen++;
4535 RExC_seen |= REG_SEEN_LOOKBEHIND;
4536 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4538 nextchar(pRExC_state);
4539 Set_Node_Length(ret, 2); /* MJD */
4542 RExC_seen_zerolen++;
4543 RExC_seen |= REG_SEEN_LOOKBEHIND;
4544 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4546 nextchar(pRExC_state);
4547 Set_Node_Length(ret, 2); /* MJD */
4550 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4551 *flagp |= HASWIDTH|SIMPLE;
4552 nextchar(pRExC_state);
4553 Set_Node_Length(ret, 2); /* MJD */
4556 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4557 *flagp |= HASWIDTH|SIMPLE;
4558 nextchar(pRExC_state);
4559 Set_Node_Length(ret, 2); /* MJD */
4562 ret = reg_node(pRExC_state, DIGIT);
4563 *flagp |= HASWIDTH|SIMPLE;
4564 nextchar(pRExC_state);
4565 Set_Node_Length(ret, 2); /* MJD */
4568 ret = reg_node(pRExC_state, NDIGIT);
4569 *flagp |= HASWIDTH|SIMPLE;
4570 nextchar(pRExC_state);
4571 Set_Node_Length(ret, 2); /* MJD */
4576 char* const oldregxend = RExC_end;
4577 char* parse_start = RExC_parse - 2;
4579 if (RExC_parse[1] == '{') {
4580 /* a lovely hack--pretend we saw [\pX] instead */
4581 RExC_end = strchr(RExC_parse, '}');
4583 const U8 c = (U8)*RExC_parse;
4585 RExC_end = oldregxend;
4586 vFAIL2("Missing right brace on \\%c{}", c);
4591 RExC_end = RExC_parse + 2;
4592 if (RExC_end > oldregxend)
4593 RExC_end = oldregxend;
4597 ret = regclass(pRExC_state,depth+1);
4599 RExC_end = oldregxend;
4602 Set_Node_Offset(ret, parse_start + 2);
4603 Set_Node_Cur_Length(ret);
4604 nextchar(pRExC_state);
4605 *flagp |= HASWIDTH|SIMPLE;
4618 case '1': case '2': case '3': case '4':
4619 case '5': case '6': case '7': case '8': case '9':
4621 const I32 num = atoi(RExC_parse);
4623 if (num > 9 && num >= RExC_npar)
4626 char * const parse_start = RExC_parse - 1; /* MJD */
4627 while (isDIGIT(*RExC_parse))
4630 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4631 vFAIL("Reference to nonexistent group");
4633 ret = reganode(pRExC_state,
4634 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4638 /* override incorrect value set in reganode MJD */
4639 Set_Node_Offset(ret, parse_start+1);
4640 Set_Node_Cur_Length(ret); /* MJD */
4642 nextchar(pRExC_state);
4647 if (RExC_parse >= RExC_end)
4648 FAIL("Trailing \\");
4651 /* Do not generate "unrecognized" warnings here, we fall
4652 back into the quick-grab loop below */
4659 if (RExC_flags & PMf_EXTENDED) {
4660 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4662 if (RExC_parse < RExC_end)
4668 register STRLEN len;
4673 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4675 parse_start = RExC_parse - 1;
4681 ret = reg_node(pRExC_state,
4682 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4684 for (len = 0, p = RExC_parse - 1;
4685 len < 127 && p < RExC_end;
4688 char * const oldp = p;
4690 if (RExC_flags & PMf_EXTENDED)
4691 p = regwhite(p, RExC_end);
4738 ender = ASCII_TO_NATIVE('\033');
4742 ender = ASCII_TO_NATIVE('\007');
4747 char* const e = strchr(p, '}');
4751 vFAIL("Missing right brace on \\x{}");
4754 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4755 | PERL_SCAN_DISALLOW_PREFIX;
4756 STRLEN numlen = e - p - 1;
4757 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4764 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4766 ender = grok_hex(p, &numlen, &flags, NULL);
4772 ender = UCHARAT(p++);
4773 ender = toCTRL(ender);
4775 case '0': case '1': case '2': case '3':case '4':
4776 case '5': case '6': case '7': case '8':case '9':
4778 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4781 ender = grok_oct(p, &numlen, &flags, NULL);
4791 FAIL("Trailing \\");
4794 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4795 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4796 goto normal_default;
4801 if (UTF8_IS_START(*p) && UTF) {
4803 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4804 &numlen, UTF8_ALLOW_DEFAULT);
4811 if (RExC_flags & PMf_EXTENDED)
4812 p = regwhite(p, RExC_end);
4814 /* Prime the casefolded buffer. */
4815 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4817 if (ISMULT2(p)) { /* Back off on ?+*. */
4822 /* Emit all the Unicode characters. */
4824 for (foldbuf = tmpbuf;
4826 foldlen -= numlen) {
4827 ender = utf8_to_uvchr(foldbuf, &numlen);
4829 const STRLEN unilen = reguni(pRExC_state, ender, s);
4832 /* In EBCDIC the numlen
4833 * and unilen can differ. */
4835 if (numlen >= foldlen)
4839 break; /* "Can't happen." */
4843 const STRLEN unilen = reguni(pRExC_state, ender, s);
4852 REGC((char)ender, s++);
4858 /* Emit all the Unicode characters. */
4860 for (foldbuf = tmpbuf;
4862 foldlen -= numlen) {
4863 ender = utf8_to_uvchr(foldbuf, &numlen);
4865 const STRLEN unilen = reguni(pRExC_state, ender, s);
4868 /* In EBCDIC the numlen
4869 * and unilen can differ. */
4871 if (numlen >= foldlen)
4879 const STRLEN unilen = reguni(pRExC_state, ender, s);
4888 REGC((char)ender, s++);
4892 Set_Node_Cur_Length(ret); /* MJD */
4893 nextchar(pRExC_state);
4895 /* len is STRLEN which is unsigned, need to copy to signed */
4898 vFAIL("Internal disaster");
4902 if (len == 1 && UNI_IS_INVARIANT(ender))
4906 RExC_size += STR_SZ(len);
4909 RExC_emit += STR_SZ(len);
4915 /* If the encoding pragma is in effect recode the text of
4916 * any EXACT-kind nodes. */
4917 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4918 const STRLEN oldlen = STR_LEN(ret);
4919 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4923 if (sv_utf8_downgrade(sv, TRUE)) {
4924 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4925 const STRLEN newlen = SvCUR(sv);
4930 GET_RE_DEBUG_FLAGS_DECL;
4931 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4932 (int)oldlen, STRING(ret),
4934 Copy(s, STRING(ret), newlen, char);
4935 STR_LEN(ret) += newlen - oldlen;
4936 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4938 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4946 S_regwhite(char *p, const char *e)
4951 else if (*p == '#') {
4954 } while (p < e && *p != '\n');
4962 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4963 Character classes ([:foo:]) can also be negated ([:^foo:]).
4964 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4965 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4966 but trigger failures because they are currently unimplemented. */
4968 #define POSIXCC_DONE(c) ((c) == ':')
4969 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4970 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4973 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4976 I32 namedclass = OOB_NAMEDCLASS;
4978 if (value == '[' && RExC_parse + 1 < RExC_end &&
4979 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4980 POSIXCC(UCHARAT(RExC_parse))) {
4981 const char c = UCHARAT(RExC_parse);
4982 char* const s = RExC_parse++;
4984 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4986 if (RExC_parse == RExC_end)
4987 /* Grandfather lone [:, [=, [. */
4990 const char* const t = RExC_parse++; /* skip over the c */
4993 if (UCHARAT(RExC_parse) == ']') {
4994 const char *posixcc = s + 1;
4995 RExC_parse++; /* skip over the ending ] */
4998 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4999 const I32 skip = t - posixcc;
5001 /* Initially switch on the length of the name. */
5004 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5005 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5008 /* Names all of length 5. */
5009 /* alnum alpha ascii blank cntrl digit graph lower
5010 print punct space upper */
5011 /* Offset 4 gives the best switch position. */
5012 switch (posixcc[4]) {
5014 if (memEQ(posixcc, "alph", 4)) /* alpha */
5015 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5018 if (memEQ(posixcc, "spac", 4)) /* space */
5019 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5022 if (memEQ(posixcc, "grap", 4)) /* graph */
5023 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5026 if (memEQ(posixcc, "asci", 4)) /* ascii */
5027 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5030 if (memEQ(posixcc, "blan", 4)) /* blank */
5031 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5034 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5035 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5038 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5039 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5042 if (memEQ(posixcc, "lowe", 4)) /* lower */
5043 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5044 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5045 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5048 if (memEQ(posixcc, "digi", 4)) /* digit */
5049 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5050 else if (memEQ(posixcc, "prin", 4)) /* print */
5051 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5052 else if (memEQ(posixcc, "punc", 4)) /* punct */
5053 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5058 if (memEQ(posixcc, "xdigit", 6))
5059 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5063 if (namedclass == OOB_NAMEDCLASS)
5064 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5066 assert (posixcc[skip] == ':');
5067 assert (posixcc[skip+1] == ']');
5068 } else if (!SIZE_ONLY) {
5069 /* [[=foo=]] and [[.foo.]] are still future. */
5071 /* adjust RExC_parse so the warning shows after
5073 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5075 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5078 /* Maternal grandfather:
5079 * "[:" ending in ":" but not in ":]" */
5089 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5092 if (POSIXCC(UCHARAT(RExC_parse))) {
5093 const char *s = RExC_parse;
5094 const char c = *s++;
5098 if (*s && c == *s && s[1] == ']') {
5099 if (ckWARN(WARN_REGEXP))
5101 "POSIX syntax [%c %c] belongs inside character classes",
5104 /* [[=foo=]] and [[.foo.]] are still future. */
5105 if (POSIXCC_NOTYET(c)) {
5106 /* adjust RExC_parse so the error shows after
5108 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5110 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5118 parse a class specification and produce either an ANYOF node that
5119 matches the pattern. If the pattern matches a single char only and
5120 that char is < 256 then we produce an EXACT node instead.
5123 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5127 register UV nextvalue;
5128 register IV prevvalue = OOB_UNICODE;
5129 register IV range = 0;
5130 register regnode *ret;
5133 char *rangebegin = NULL;
5134 bool need_class = 0;
5137 bool optimize_invert = TRUE;
5138 AV* unicode_alternate = NULL;
5140 UV literal_endpoint = 0;
5142 UV stored = 0; /* number of chars stored in the class */
5144 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5145 case we need to change the emitted regop to an EXACT. */
5146 const char * orig_parse = RExC_parse;
5147 GET_RE_DEBUG_FLAGS_DECL;
5148 DEBUG_PARSE("clas");
5150 /* Assume we are going to generate an ANYOF node. */
5151 ret = reganode(pRExC_state, ANYOF, 0);
5154 ANYOF_FLAGS(ret) = 0;
5156 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5160 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5164 RExC_size += ANYOF_SKIP;
5165 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5168 RExC_emit += ANYOF_SKIP;
5170 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5172 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5173 ANYOF_BITMAP_ZERO(ret);
5174 listsv = newSVpvs("# comment\n");
5177 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5179 if (!SIZE_ONLY && POSIXCC(nextvalue))
5180 checkposixcc(pRExC_state);
5182 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5183 if (UCHARAT(RExC_parse) == ']')
5186 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5190 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5193 rangebegin = RExC_parse;
5195 value = utf8n_to_uvchr((U8*)RExC_parse,
5196 RExC_end - RExC_parse,
5197 &numlen, UTF8_ALLOW_DEFAULT);
5198 RExC_parse += numlen;
5201 value = UCHARAT(RExC_parse++);
5203 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5204 if (value == '[' && POSIXCC(nextvalue))
5205 namedclass = regpposixcc(pRExC_state, value);
5206 else if (value == '\\') {
5208 value = utf8n_to_uvchr((U8*)RExC_parse,
5209 RExC_end - RExC_parse,
5210 &numlen, UTF8_ALLOW_DEFAULT);
5211 RExC_parse += numlen;
5214 value = UCHARAT(RExC_parse++);
5215 /* Some compilers cannot handle switching on 64-bit integer
5216 * values, therefore value cannot be an UV. Yes, this will
5217 * be a problem later if we want switch on Unicode.
5218 * A similar issue a little bit later when switching on
5219 * namedclass. --jhi */
5220 switch ((I32)value) {
5221 case 'w': namedclass = ANYOF_ALNUM; break;
5222 case 'W': namedclass = ANYOF_NALNUM; break;
5223 case 's': namedclass = ANYOF_SPACE; break;
5224 case 'S': namedclass = ANYOF_NSPACE; break;
5225 case 'd': namedclass = ANYOF_DIGIT; break;
5226 case 'D': namedclass = ANYOF_NDIGIT; break;
5231 if (RExC_parse >= RExC_end)
5232 vFAIL2("Empty \\%c{}", (U8)value);
5233 if (*RExC_parse == '{') {
5234 const U8 c = (U8)value;
5235 e = strchr(RExC_parse++, '}');
5237 vFAIL2("Missing right brace on \\%c{}", c);
5238 while (isSPACE(UCHARAT(RExC_parse)))
5240 if (e == RExC_parse)
5241 vFAIL2("Empty \\%c{}", c);
5243 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5251 if (UCHARAT(RExC_parse) == '^') {
5254 value = value == 'p' ? 'P' : 'p'; /* toggle */
5255 while (isSPACE(UCHARAT(RExC_parse))) {
5260 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5261 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5264 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5265 namedclass = ANYOF_MAX; /* no official name, but it's named */
5268 case 'n': value = '\n'; break;
5269 case 'r': value = '\r'; break;
5270 case 't': value = '\t'; break;
5271 case 'f': value = '\f'; break;
5272 case 'b': value = '\b'; break;
5273 case 'e': value = ASCII_TO_NATIVE('\033');break;
5274 case 'a': value = ASCII_TO_NATIVE('\007');break;
5276 if (*RExC_parse == '{') {
5277 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5278 | PERL_SCAN_DISALLOW_PREFIX;
5279 char * const e = strchr(RExC_parse++, '}');
5281 vFAIL("Missing right brace on \\x{}");
5283 numlen = e - RExC_parse;
5284 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5288 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5290 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5291 RExC_parse += numlen;
5295 value = UCHARAT(RExC_parse++);
5296 value = toCTRL(value);
5298 case '0': case '1': case '2': case '3': case '4':
5299 case '5': case '6': case '7': case '8': case '9':
5303 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5304 RExC_parse += numlen;
5308 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5310 "Unrecognized escape \\%c in character class passed through",
5314 } /* end of \blah */
5320 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5322 if (!SIZE_ONLY && !need_class)
5323 ANYOF_CLASS_ZERO(ret);
5327 /* a bad range like a-\d, a-[:digit:] ? */
5330 if (ckWARN(WARN_REGEXP)) {
5332 RExC_parse >= rangebegin ?
5333 RExC_parse - rangebegin : 0;
5335 "False [] range \"%*.*s\"",
5338 if (prevvalue < 256) {
5339 ANYOF_BITMAP_SET(ret, prevvalue);
5340 ANYOF_BITMAP_SET(ret, '-');
5343 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5344 Perl_sv_catpvf(aTHX_ listsv,
5345 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5349 range = 0; /* this was not a true range */
5353 const char *what = NULL;
5356 if (namedclass > OOB_NAMEDCLASS)
5357 optimize_invert = FALSE;
5358 /* Possible truncation here but in some 64-bit environments
5359 * the compiler gets heartburn about switch on 64-bit values.
5360 * A similar issue a little earlier when switching on value.
5362 switch ((I32)namedclass) {
5365 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5367 for (value = 0; value < 256; value++)
5369 ANYOF_BITMAP_SET(ret, value);
5376 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5378 for (value = 0; value < 256; value++)
5379 if (!isALNUM(value))
5380 ANYOF_BITMAP_SET(ret, value);
5387 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5389 for (value = 0; value < 256; value++)
5390 if (isALNUMC(value))
5391 ANYOF_BITMAP_SET(ret, value);
5398 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5400 for (value = 0; value < 256; value++)
5401 if (!isALNUMC(value))
5402 ANYOF_BITMAP_SET(ret, value);
5409 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5411 for (value = 0; value < 256; value++)
5413 ANYOF_BITMAP_SET(ret, value);
5420 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5422 for (value = 0; value < 256; value++)
5423 if (!isALPHA(value))
5424 ANYOF_BITMAP_SET(ret, value);
5431 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5434 for (value = 0; value < 128; value++)
5435 ANYOF_BITMAP_SET(ret, value);
5437 for (value = 0; value < 256; value++) {
5439 ANYOF_BITMAP_SET(ret, value);
5448 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5451 for (value = 128; value < 256; value++)
5452 ANYOF_BITMAP_SET(ret, value);
5454 for (value = 0; value < 256; value++) {
5455 if (!isASCII(value))
5456 ANYOF_BITMAP_SET(ret, value);
5465 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5467 for (value = 0; value < 256; value++)
5469 ANYOF_BITMAP_SET(ret, value);
5476 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5478 for (value = 0; value < 256; value++)
5479 if (!isBLANK(value))
5480 ANYOF_BITMAP_SET(ret, value);
5487 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5489 for (value = 0; value < 256; value++)
5491 ANYOF_BITMAP_SET(ret, value);
5498 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5500 for (value = 0; value < 256; value++)
5501 if (!isCNTRL(value))
5502 ANYOF_BITMAP_SET(ret, value);
5509 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5511 /* consecutive digits assumed */
5512 for (value = '0'; value <= '9'; value++)
5513 ANYOF_BITMAP_SET(ret, value);
5520 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5522 /* consecutive digits assumed */
5523 for (value = 0; value < '0'; value++)
5524 ANYOF_BITMAP_SET(ret, value);
5525 for (value = '9' + 1; value < 256; value++)
5526 ANYOF_BITMAP_SET(ret, value);
5533 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5535 for (value = 0; value < 256; value++)
5537 ANYOF_BITMAP_SET(ret, value);
5544 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5546 for (value = 0; value < 256; value++)
5547 if (!isGRAPH(value))
5548 ANYOF_BITMAP_SET(ret, value);
5555 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5557 for (value = 0; value < 256; value++)
5559 ANYOF_BITMAP_SET(ret, value);
5566 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5568 for (value = 0; value < 256; value++)
5569 if (!isLOWER(value))
5570 ANYOF_BITMAP_SET(ret, value);
5577 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5579 for (value = 0; value < 256; value++)
5581 ANYOF_BITMAP_SET(ret, value);
5588 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5590 for (value = 0; value < 256; value++)
5591 if (!isPRINT(value))
5592 ANYOF_BITMAP_SET(ret, value);
5599 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5601 for (value = 0; value < 256; value++)
5602 if (isPSXSPC(value))
5603 ANYOF_BITMAP_SET(ret, value);
5610 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5612 for (value = 0; value < 256; value++)
5613 if (!isPSXSPC(value))
5614 ANYOF_BITMAP_SET(ret, value);
5621 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5623 for (value = 0; value < 256; value++)
5625 ANYOF_BITMAP_SET(ret, value);
5632 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5634 for (value = 0; value < 256; value++)
5635 if (!isPUNCT(value))
5636 ANYOF_BITMAP_SET(ret, value);
5643 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5645 for (value = 0; value < 256; value++)
5647 ANYOF_BITMAP_SET(ret, value);
5654 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5656 for (value = 0; value < 256; value++)
5657 if (!isSPACE(value))
5658 ANYOF_BITMAP_SET(ret, value);
5665 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5667 for (value = 0; value < 256; value++)
5669 ANYOF_BITMAP_SET(ret, value);
5676 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5678 for (value = 0; value < 256; value++)
5679 if (!isUPPER(value))
5680 ANYOF_BITMAP_SET(ret, value);
5687 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5689 for (value = 0; value < 256; value++)
5690 if (isXDIGIT(value))
5691 ANYOF_BITMAP_SET(ret, value);
5698 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5700 for (value = 0; value < 256; value++)
5701 if (!isXDIGIT(value))
5702 ANYOF_BITMAP_SET(ret, value);
5708 /* this is to handle \p and \P */
5711 vFAIL("Invalid [::] class");
5715 /* Strings such as "+utf8::isWord\n" */
5716 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5719 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5722 } /* end of namedclass \blah */
5725 if (prevvalue > (IV)value) /* b-a */ {
5726 const int w = RExC_parse - rangebegin;
5727 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5728 range = 0; /* not a valid range */
5732 prevvalue = value; /* save the beginning of the range */
5733 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5734 RExC_parse[1] != ']') {
5737 /* a bad range like \w-, [:word:]- ? */
5738 if (namedclass > OOB_NAMEDCLASS) {
5739 if (ckWARN(WARN_REGEXP)) {
5741 RExC_parse >= rangebegin ?
5742 RExC_parse - rangebegin : 0;
5744 "False [] range \"%*.*s\"",
5748 ANYOF_BITMAP_SET(ret, '-');
5750 range = 1; /* yeah, it's a range! */
5751 continue; /* but do it the next time */
5755 /* now is the next time */
5756 /*stored += (value - prevvalue + 1);*/
5758 if (prevvalue < 256) {
5759 const IV ceilvalue = value < 256 ? value : 255;
5762 /* In EBCDIC [\x89-\x91] should include
5763 * the \x8e but [i-j] should not. */
5764 if (literal_endpoint == 2 &&
5765 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5766 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5768 if (isLOWER(prevvalue)) {
5769 for (i = prevvalue; i <= ceilvalue; i++)
5771 ANYOF_BITMAP_SET(ret, i);
5773 for (i = prevvalue; i <= ceilvalue; i++)
5775 ANYOF_BITMAP_SET(ret, i);
5780 for (i = prevvalue; i <= ceilvalue; i++) {
5781 if (!ANYOF_BITMAP_TEST(ret,i)) {
5783 ANYOF_BITMAP_SET(ret, i);
5787 if (value > 255 || UTF) {
5788 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5789 const UV natvalue = NATIVE_TO_UNI(value);
5790 stored+=2; /* can't optimize this class */
5791 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5792 if (prevnatvalue < natvalue) { /* what about > ? */
5793 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5794 prevnatvalue, natvalue);
5796 else if (prevnatvalue == natvalue) {
5797 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5799 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5801 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5803 /* If folding and foldable and a single
5804 * character, insert also the folded version
5805 * to the charclass. */
5807 if (foldlen == (STRLEN)UNISKIP(f))
5808 Perl_sv_catpvf(aTHX_ listsv,
5811 /* Any multicharacter foldings
5812 * require the following transform:
5813 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5814 * where E folds into "pq" and F folds
5815 * into "rst", all other characters
5816 * fold to single characters. We save
5817 * away these multicharacter foldings,
5818 * to be later saved as part of the
5819 * additional "s" data. */
5822 if (!unicode_alternate)
5823 unicode_alternate = newAV();
5824 sv = newSVpvn((char*)foldbuf, foldlen);
5826 av_push(unicode_alternate, sv);
5830 /* If folding and the value is one of the Greek
5831 * sigmas insert a few more sigmas to make the
5832 * folding rules of the sigmas to work right.
5833 * Note that not all the possible combinations
5834 * are handled here: some of them are handled
5835 * by the standard folding rules, and some of
5836 * them (literal or EXACTF cases) are handled
5837 * during runtime in regexec.c:S_find_byclass(). */
5838 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5839 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5840 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5841 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5842 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5844 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5845 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5846 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5851 literal_endpoint = 0;
5855 range = 0; /* this range (if it was one) is done now */
5859 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5861 RExC_size += ANYOF_CLASS_ADD_SKIP;
5863 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5869 /****** !SIZE_ONLY AFTER HERE *********/
5871 if( stored == 1 && value < 256
5872 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5874 /* optimize single char class to an EXACT node
5875 but *only* when its not a UTF/high char */
5876 const char * cur_parse= RExC_parse;
5877 RExC_emit = (regnode *)orig_emit;
5878 RExC_parse = (char *)orig_parse;
5879 ret = reg_node(pRExC_state,
5880 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5881 RExC_parse = (char *)cur_parse;
5882 *STRING(ret)= (char)value;
5884 RExC_emit += STR_SZ(1);
5887 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5888 if ( /* If the only flag is folding (plus possibly inversion). */
5889 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5891 for (value = 0; value < 256; ++value) {
5892 if (ANYOF_BITMAP_TEST(ret, value)) {
5893 UV fold = PL_fold[value];
5896 ANYOF_BITMAP_SET(ret, fold);
5899 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5902 /* optimize inverted simple patterns (e.g. [^a-z]) */
5903 if (optimize_invert &&
5904 /* If the only flag is inversion. */
5905 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5906 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5907 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5908 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5911 AV * const av = newAV();
5913 /* The 0th element stores the character class description
5914 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5915 * to initialize the appropriate swash (which gets stored in
5916 * the 1st element), and also useful for dumping the regnode.
5917 * The 2nd element stores the multicharacter foldings,
5918 * used later (regexec.c:S_reginclass()). */
5919 av_store(av, 0, listsv);
5920 av_store(av, 1, NULL);
5921 av_store(av, 2, (SV*)unicode_alternate);
5922 rv = newRV_noinc((SV*)av);
5923 n = add_data(pRExC_state, 1, "s");
5924 RExC_rx->data->data[n] = (void*)rv;
5931 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5933 char* const retval = RExC_parse++;
5936 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5937 RExC_parse[2] == '#') {
5938 while (*RExC_parse != ')') {
5939 if (RExC_parse == RExC_end)
5940 FAIL("Sequence (?#... not terminated");
5946 if (RExC_flags & PMf_EXTENDED) {
5947 if (isSPACE(*RExC_parse)) {
5951 else if (*RExC_parse == '#') {
5952 while (RExC_parse < RExC_end)
5953 if (*RExC_parse++ == '\n') break;
5962 - reg_node - emit a node
5964 STATIC regnode * /* Location. */
5965 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5968 register regnode *ptr;
5969 regnode * const ret = RExC_emit;
5970 GET_RE_DEBUG_FLAGS_DECL;
5973 SIZE_ALIGN(RExC_size);
5977 NODE_ALIGN_FILL(ret);
5979 FILL_ADVANCE_NODE(ptr, op);
5980 if (RExC_offsets) { /* MJD */
5981 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
5982 "reg_node", __LINE__,
5984 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
5985 ? "Overwriting end of array!\n" : "OK",
5986 (UV)(RExC_emit - RExC_emit_start),
5987 (UV)(RExC_parse - RExC_start),
5988 (UV)RExC_offsets[0]));
5989 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5998 - reganode - emit a node with an argument
6000 STATIC regnode * /* Location. */
6001 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6004 register regnode *ptr;
6005 regnode * const ret = RExC_emit;
6006 GET_RE_DEBUG_FLAGS_DECL;
6009 SIZE_ALIGN(RExC_size);
6014 NODE_ALIGN_FILL(ret);
6016 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6017 if (RExC_offsets) { /* MJD */
6018 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6022 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6023 "Overwriting end of array!\n" : "OK",
6024 (UV)(RExC_emit - RExC_emit_start),
6025 (UV)(RExC_parse - RExC_start),
6026 (UV)RExC_offsets[0]));
6027 Set_Cur_Node_Offset;
6036 - reguni - emit (if appropriate) a Unicode character
6039 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6042 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6046 - reginsert - insert an operator in front of already-emitted operand
6048 * Means relocating the operand.
6051 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6054 register regnode *src;
6055 register regnode *dst;
6056 register regnode *place;
6057 const int offset = regarglen[(U8)op];
6058 GET_RE_DEBUG_FLAGS_DECL;
6059 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6062 RExC_size += NODE_STEP_REGNODE + offset;
6067 RExC_emit += NODE_STEP_REGNODE + offset;
6069 while (src > opnd) {
6070 StructCopy(--src, --dst, regnode);
6071 if (RExC_offsets) { /* MJD 20010112 */
6072 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6076 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6077 ? "Overwriting end of array!\n" : "OK",
6078 (UV)(src - RExC_emit_start),
6079 (UV)(dst - RExC_emit_start),
6080 (UV)RExC_offsets[0]));
6081 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6082 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6087 place = opnd; /* Op node, where operand used to be. */
6088 if (RExC_offsets) { /* MJD */
6089 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6093 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6094 ? "Overwriting end of array!\n" : "OK",
6095 (UV)(place - RExC_emit_start),
6096 (UV)(RExC_parse - RExC_start),
6098 Set_Node_Offset(place, RExC_parse);
6099 Set_Node_Length(place, 1);
6101 src = NEXTOPER(place);
6102 FILL_ADVANCE_NODE(place, op);
6103 Zero(src, offset, regnode);
6107 - regtail - set the next-pointer at the end of a node chain of p to val.
6108 - SEE ALSO: regtail_study
6110 /* TODO: All three parms should be const */
6112 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6115 register regnode *scan;
6116 GET_RE_DEBUG_FLAGS_DECL;
6121 /* Find last node. */
6124 regnode * const temp = regnext(scan);
6126 SV * const mysv=sv_newmortal();
6127 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6128 regprop(RExC_rx, mysv, scan);
6129 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6130 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6137 if (reg_off_by_arg[OP(scan)]) {
6138 ARG_SET(scan, val - scan);
6141 NEXT_OFF(scan) = val - scan;
6147 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6148 - Look for optimizable sequences at the same time.
6149 - currently only looks for EXACT chains.
6151 This is expermental code. The idea is to use this routine to perform
6152 in place optimizations on branches and groups as they are constructed,
6153 with the long term intention of removing optimization from study_chunk so
6154 that it is purely analytical.
6156 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6157 to control which is which.
6160 /* TODO: All four parms should be const */
6163 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6166 register regnode *scan;
6168 #ifdef EXPERIMENTAL_INPLACESCAN
6172 GET_RE_DEBUG_FLAGS_DECL;
6178 /* Find last node. */
6182 regnode * const temp = regnext(scan);
6183 #ifdef EXPERIMENTAL_INPLACESCAN
6184 if (PL_regkind[OP(scan)] == EXACT)
6185 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6193 if( exact == PSEUDO )
6195 else if ( exact != OP(scan) )
6204 SV * const mysv=sv_newmortal();
6205 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6206 regprop(RExC_rx, mysv, scan);
6207 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6208 SvPV_nolen_const(mysv),
6210 REG_NODE_NUM(scan));
6217 SV * const mysv_val=sv_newmortal();
6218 DEBUG_PARSE_MSG("");
6219 regprop(RExC_rx, mysv_val, val);
6220 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6221 SvPV_nolen_const(mysv_val),
6226 if (reg_off_by_arg[OP(scan)]) {
6227 ARG_SET(scan, val - scan);
6230 NEXT_OFF(scan) = val - scan;
6238 - regcurly - a little FSA that accepts {\d+,?\d*}
6241 S_regcurly(register const char *s)
6260 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6263 Perl_regdump(pTHX_ const regexp *r)
6267 SV * const sv = sv_newmortal();
6269 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6271 /* Header fields of interest. */
6272 if (r->anchored_substr)
6273 PerlIO_printf(Perl_debug_log,
6274 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
6276 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
6277 SvPVX_const(r->anchored_substr),
6279 SvTAIL(r->anchored_substr) ? "$" : "",
6280 (IV)r->anchored_offset);
6281 else if (r->anchored_utf8)
6282 PerlIO_printf(Perl_debug_log,
6283 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
6285 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
6286 SvPVX_const(r->anchored_utf8),
6288 SvTAIL(r->anchored_utf8) ? "$" : "",
6289 (IV)r->anchored_offset);
6290 if (r->float_substr)
6291 PerlIO_printf(Perl_debug_log,
6292 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6294 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6295 SvPVX_const(r->float_substr),
6297 SvTAIL(r->float_substr) ? "$" : "",
6298 (IV)r->float_min_offset, (UV)r->float_max_offset);
6299 else if (r->float_utf8)
6300 PerlIO_printf(Perl_debug_log,
6301 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6303 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6304 SvPVX_const(r->float_utf8),
6306 SvTAIL(r->float_utf8) ? "$" : "",
6307 (IV)r->float_min_offset, (UV)r->float_max_offset);
6308 if (r->check_substr || r->check_utf8)
6309 PerlIO_printf(Perl_debug_log,
6310 r->check_substr == r->float_substr
6311 && r->check_utf8 == r->float_utf8
6312 ? "(checking floating" : "(checking anchored");
6313 if (r->reganch & ROPT_NOSCAN)
6314 PerlIO_printf(Perl_debug_log, " noscan");
6315 if (r->reganch & ROPT_CHECK_ALL)
6316 PerlIO_printf(Perl_debug_log, " isall");
6317 if (r->check_substr || r->check_utf8)
6318 PerlIO_printf(Perl_debug_log, ") ");
6320 if (r->regstclass) {
6321 regprop(r, sv, r->regstclass);
6322 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6324 if (r->reganch & ROPT_ANCH) {
6325 PerlIO_printf(Perl_debug_log, "anchored");
6326 if (r->reganch & ROPT_ANCH_BOL)
6327 PerlIO_printf(Perl_debug_log, "(BOL)");
6328 if (r->reganch & ROPT_ANCH_MBOL)
6329 PerlIO_printf(Perl_debug_log, "(MBOL)");
6330 if (r->reganch & ROPT_ANCH_SBOL)
6331 PerlIO_printf(Perl_debug_log, "(SBOL)");
6332 if (r->reganch & ROPT_ANCH_GPOS)
6333 PerlIO_printf(Perl_debug_log, "(GPOS)");
6334 PerlIO_putc(Perl_debug_log, ' ');
6336 if (r->reganch & ROPT_GPOS_SEEN)
6337 PerlIO_printf(Perl_debug_log, "GPOS ");
6338 if (r->reganch & ROPT_SKIP)
6339 PerlIO_printf(Perl_debug_log, "plus ");
6340 if (r->reganch & ROPT_IMPLICIT)
6341 PerlIO_printf(Perl_debug_log, "implicit ");
6342 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6343 if (r->reganch & ROPT_EVAL_SEEN)
6344 PerlIO_printf(Perl_debug_log, "with eval ");
6345 PerlIO_printf(Perl_debug_log, "\n");
6347 const U32 len = r->offsets[0];
6348 GET_RE_DEBUG_FLAGS_DECL;
6351 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
6352 for (i = 1; i <= len; i++) {
6353 if (!(SvIV(re_debug_flags) & RE_DEBUG_OLD_OFFSETS)) {
6354 if (r->offsets[i*2-1] || r->offsets[i*2])
6355 PerlIO_printf(Perl_debug_log, "%"UVuf":",i);
6359 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
6360 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
6362 PerlIO_printf(Perl_debug_log, "\n");
6366 PERL_UNUSED_CONTEXT;
6368 #endif /* DEBUGGING */
6372 - regprop - printable representation of opcode
6375 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6381 sv_setpvn(sv, "", 0);
6382 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6383 /* It would be nice to FAIL() here, but this may be called from
6384 regexec.c, and it would be hard to supply pRExC_state. */
6385 Perl_croak(aTHX_ "Corrupted regexp opcode");
6386 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6388 k = PL_regkind[OP(o)];
6391 SV * const dsv = sv_2mortal(newSVpvs(""));
6392 /* Using is_utf8_string() is a crude hack but it may
6393 * be the best for now since we have no flag "this EXACTish
6394 * node was UTF-8" --jhi */
6395 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6396 const char * const s = do_utf8 ?
6397 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6398 UNI_DISPLAY_REGEX) :
6400 const int len = do_utf8 ?
6403 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6407 } else if (k == TRIE) {
6408 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6409 /* print the details of the trie in dumpuntil instead, as
6410 * prog->data isn't available here */
6411 } else if (k == CURLY) {
6412 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6413 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6414 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6416 else if (k == WHILEM && o->flags) /* Ordinal/of */
6417 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6418 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6419 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6420 else if (k == LOGICAL)
6421 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6422 else if (k == ANYOF) {
6423 int i, rangestart = -1;
6424 const U8 flags = ANYOF_FLAGS(o);
6426 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6427 static const char * const anyofs[] = {
6460 if (flags & ANYOF_LOCALE)
6461 sv_catpvs(sv, "{loc}");
6462 if (flags & ANYOF_FOLD)
6463 sv_catpvs(sv, "{i}");
6464 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6465 if (flags & ANYOF_INVERT)
6467 for (i = 0; i <= 256; i++) {
6468 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6469 if (rangestart == -1)
6471 } else if (rangestart != -1) {
6472 if (i <= rangestart + 3)
6473 for (; rangestart < i; rangestart++)
6474 put_byte(sv, rangestart);
6476 put_byte(sv, rangestart);
6478 put_byte(sv, i - 1);
6484 if (o->flags & ANYOF_CLASS)
6485 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6486 if (ANYOF_CLASS_TEST(o,i))
6487 sv_catpv(sv, anyofs[i]);
6489 if (flags & ANYOF_UNICODE)
6490 sv_catpvs(sv, "{unicode}");
6491 else if (flags & ANYOF_UNICODE_ALL)
6492 sv_catpvs(sv, "{unicode_all}");
6496 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6500 U8 s[UTF8_MAXBYTES_CASE+1];
6502 for (i = 0; i <= 256; i++) { /* just the first 256 */
6503 uvchr_to_utf8(s, i);
6505 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6506 if (rangestart == -1)
6508 } else if (rangestart != -1) {
6509 if (i <= rangestart + 3)
6510 for (; rangestart < i; rangestart++) {
6511 const U8 * const e = uvchr_to_utf8(s,rangestart);
6513 for(p = s; p < e; p++)
6517 const U8 *e = uvchr_to_utf8(s,rangestart);
6519 for (p = s; p < e; p++)
6522 e = uvchr_to_utf8(s, i-1);
6523 for (p = s; p < e; p++)
6530 sv_catpvs(sv, "..."); /* et cetera */
6534 char *s = savesvpv(lv);
6535 char * const origs = s;
6537 while (*s && *s != '\n')
6541 const char * const t = ++s;
6559 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6561 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6562 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6564 PERL_UNUSED_CONTEXT;
6565 PERL_UNUSED_ARG(sv);
6567 #endif /* DEBUGGING */
6571 Perl_re_intuit_string(pTHX_ regexp *prog)
6572 { /* Assume that RE_INTUIT is set */
6574 GET_RE_DEBUG_FLAGS_DECL;
6575 PERL_UNUSED_CONTEXT;
6579 const char * const s = SvPV_nolen_const(prog->check_substr
6580 ? prog->check_substr : prog->check_utf8);
6582 if (!PL_colorset) reginitcolors();
6583 PerlIO_printf(Perl_debug_log,
6584 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6586 prog->check_substr ? "" : "utf8 ",
6587 PL_colors[5],PL_colors[0],
6590 (strlen(s) > 60 ? "..." : ""));
6593 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6597 Perl_pregfree(pTHX_ struct regexp *r)
6601 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6603 GET_RE_DEBUG_FLAGS_DECL;
6605 if (!r || (--r->refcnt > 0))
6607 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6608 const char * const s = (r->reganch & ROPT_UTF8)
6609 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6610 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6611 const int len = SvCUR(dsv);
6614 PerlIO_printf(Perl_debug_log,
6615 "%sFreeing REx:%s %s%*.*s%s%s\n",
6616 PL_colors[4],PL_colors[5],PL_colors[0],
6619 len > 60 ? "..." : "");
6622 /* gcov results gave these as non-null 100% of the time, so there's no
6623 optimisation in checking them before calling Safefree */
6624 Safefree(r->precomp);
6625 Safefree(r->offsets); /* 20010421 MJD */
6626 RX_MATCH_COPY_FREE(r);
6627 #ifdef PERL_OLD_COPY_ON_WRITE
6629 SvREFCNT_dec(r->saved_copy);
6632 if (r->anchored_substr)
6633 SvREFCNT_dec(r->anchored_substr);
6634 if (r->anchored_utf8)
6635 SvREFCNT_dec(r->anchored_utf8);
6636 if (r->float_substr)
6637 SvREFCNT_dec(r->float_substr);
6639 SvREFCNT_dec(r->float_utf8);
6640 Safefree(r->substrs);
6643 int n = r->data->count;
6644 PAD* new_comppad = NULL;
6649 /* If you add a ->what type here, update the comment in regcomp.h */
6650 switch (r->data->what[n]) {
6652 SvREFCNT_dec((SV*)r->data->data[n]);
6655 Safefree(r->data->data[n]);
6658 new_comppad = (AV*)r->data->data[n];
6661 if (new_comppad == NULL)
6662 Perl_croak(aTHX_ "panic: pregfree comppad");
6663 PAD_SAVE_LOCAL(old_comppad,
6664 /* Watch out for global destruction's random ordering. */
6665 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6668 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6671 op_free((OP_4tree*)r->data->data[n]);
6673 PAD_RESTORE_LOCAL(old_comppad);
6674 SvREFCNT_dec((SV*)new_comppad);
6682 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6684 refcount = --aho->refcount;
6687 Safefree(aho->states);
6688 Safefree(aho->fail);
6689 aho->trie=NULL; /* not necessary to free this as it is
6690 handled by the 't' case */
6691 Safefree(r->data->data[n]); /* do this last!!!! */
6698 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6700 refcount = --trie->refcount;
6703 Safefree(trie->charmap);
6704 if (trie->widecharmap)
6705 SvREFCNT_dec((SV*)trie->widecharmap);
6706 Safefree(trie->states);
6707 Safefree(trie->trans);
6709 Safefree(trie->bitmap);
6711 Safefree(trie->wordlen);
6714 SvREFCNT_dec((SV*)trie->words);
6715 if (trie->revcharmap)
6716 SvREFCNT_dec((SV*)trie->revcharmap);
6718 Safefree(r->data->data[n]); /* do this last!!!! */
6723 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6726 Safefree(r->data->what);
6729 Safefree(r->startp);
6734 #ifndef PERL_IN_XSUB_RE
6736 - regnext - dig the "next" pointer out of a node
6739 Perl_regnext(pTHX_ register regnode *p)
6742 register I32 offset;
6744 if (p == &PL_regdummy)
6747 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6756 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6759 STRLEN l1 = strlen(pat1);
6760 STRLEN l2 = strlen(pat2);
6763 const char *message;
6769 Copy(pat1, buf, l1 , char);
6770 Copy(pat2, buf + l1, l2 , char);
6771 buf[l1 + l2] = '\n';
6772 buf[l1 + l2 + 1] = '\0';
6774 /* ANSI variant takes additional second argument */
6775 va_start(args, pat2);
6779 msv = vmess(buf, &args);
6781 message = SvPV_const(msv,l1);
6784 Copy(message, buf, l1 , char);
6785 buf[l1-1] = '\0'; /* Overwrite \n */
6786 Perl_croak(aTHX_ "%s", buf);
6789 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6791 #ifndef PERL_IN_XSUB_RE
6793 Perl_save_re_context(pTHX)
6797 struct re_save_state *state;
6799 SAVEVPTR(PL_curcop);
6800 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6802 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6803 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6804 SSPUSHINT(SAVEt_RE_STATE);
6806 Copy(&PL_reg_state, state, 1, struct re_save_state);
6808 PL_reg_start_tmp = 0;
6809 PL_reg_start_tmpl = 0;
6810 PL_reg_oldsaved = NULL;
6811 PL_reg_oldsavedlen = 0;
6813 PL_reg_leftiter = 0;
6814 PL_reg_poscache = NULL;
6815 PL_reg_poscache_size = 0;
6816 #ifdef PERL_OLD_COPY_ON_WRITE
6820 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6822 const REGEXP * const rx = PM_GETRE(PL_curpm);
6825 for (i = 1; i <= rx->nparens; i++) {
6826 char digits[TYPE_CHARS(long)];
6827 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6828 GV *const *const gvp
6829 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6832 GV * const gv = *gvp;
6833 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6843 clear_re(pTHX_ void *r)
6846 ReREFCNT_dec((regexp *)r);
6852 S_put_byte(pTHX_ SV *sv, int c)
6854 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6855 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6856 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6857 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6859 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6862 #define CLEAR_OPTSTART \
6863 if (optstart) STMT_START { \
6864 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6868 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6870 STATIC const regnode *
6871 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6872 const regnode *last, SV* sv, I32 l)
6875 register U8 op = EXACT; /* Arbitrary non-END op. */
6876 register const regnode *next;
6877 const regnode *optstart= NULL;
6878 GET_RE_DEBUG_FLAGS_DECL;
6880 while (op != END && (!last || node < last)) {
6881 /* While that wasn't END last time... */
6887 next = regnext((regnode *)node);
6890 if (OP(node) == OPTIMIZED) {
6891 if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_OPTIMISE))
6898 regprop(r, sv, node);
6899 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6900 (int)(2*l + 1), "", SvPVX_const(sv));
6902 if (OP(node) != OPTIMIZED) {
6903 if (next == NULL) /* Next ptr. */
6904 PerlIO_printf(Perl_debug_log, "(0)");
6906 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6907 (void)PerlIO_putc(Perl_debug_log, '\n');
6911 if (PL_regkind[(U8)op] == BRANCHJ) {
6912 register const regnode *nnode = (OP(next) == LONGJMP
6913 ? regnext((regnode *)next)
6915 if (last && nnode > last)
6917 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6919 else if (PL_regkind[(U8)op] == BRANCH) {
6920 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6922 else if ( PL_regkind[(U8)op] == TRIE ) {
6923 const I32 n = ARG(node);
6924 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6925 const I32 arry_len = av_len(trie->words)+1;
6927 PerlIO_printf(Perl_debug_log,
6928 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
6932 TRIE_WORDCOUNT(trie),
6933 (int)TRIE_CHARCOUNT(trie),
6934 trie->uniquecharcount,
6935 (IV)TRIE_LASTSTATE(trie)-1,
6936 trie->minlen, trie->maxlen
6941 sv_setpvn(sv, "", 0);
6942 for (i = 0; i <= 256; i++) {
6943 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6944 if (rangestart == -1)
6946 } else if (rangestart != -1) {
6947 if (i <= rangestart + 3)
6948 for (; rangestart < i; rangestart++)
6949 put_byte(sv, rangestart);
6951 put_byte(sv, rangestart);
6953 put_byte(sv, i - 1);
6958 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
6960 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
6962 for (word_idx=0; word_idx < arry_len; word_idx++) {
6963 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6965 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6968 SvPV_nolen_const(*elem_ptr),
6974 node = NEXTOPER(node);
6975 node += regarglen[(U8)op];
6978 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6979 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6980 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6982 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6983 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6986 else if ( op == PLUS || op == STAR) {
6987 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6989 else if (op == ANYOF) {
6990 /* arglen 1 + class block */
6991 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6992 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6993 node = NEXTOPER(node);
6995 else if (PL_regkind[(U8)op] == EXACT) {
6996 /* Literal string, where present. */
6997 node += NODE_SZ_STR(node) - 1;
6998 node = NEXTOPER(node);
7001 node = NEXTOPER(node);
7002 node += regarglen[(U8)op];
7004 if (op == CURLYX || op == OPEN)
7006 else if (op == WHILEM)
7013 #endif /* DEBUGGING */
7017 * c-indentation-style: bsd
7019 * indent-tabs-mode: t
7022 * ex: set ts=8 sts=4 sw=4 noet: