5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
760 #define TRIE_READ_CHAR STMT_START { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
785 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
801 #define TRIE_LIST_NEW(state) STMT_START { \
802 Newxz( trie->states[ state ].trans.list, \
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
808 #define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
812 trie->wordlen[ curword ] = wordlen; \
813 trie->states[ state ].wordnum = ++curword; \
815 /* store the word for dumping */ \
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
820 tmp = newSVpvn( "", 0 ); \
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
825 NOOP; /* It's a dupe. So ignore it. */ \
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
835 These routines dump out a trie in a somewhat readable format.
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
852 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
855 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
909 PerlIO_printf( Perl_debug_log, "]");
912 PerlIO_printf( Perl_debug_log, "\n" );
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
923 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
932 for( state=1 ; state < next_alloc ; state ++ ) {
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
964 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
968 GET_RE_DEBUG_FLAGS_DECL;
971 print out the table precompression so that we can do a visual check
972 that they are identical.
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1013 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1023 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1025 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
1051 U32 base = trie->states[ 1 ].trans.base;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1055 GET_RE_DEBUG_FLAGS_DECL;
1057 ARG_SET( stclass, data_slot );
1058 Newxz( aho, 1, reg_ac_data );
1059 RExC_rx->data->data[ data_slot ] = (void*)aho;
1061 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1062 (trie->laststate+1)*sizeof(reg_trie_state));
1063 Newxz( q, numstates, U32);
1064 Newxz( aho->fail, numstates, U32 );
1067 fail[ 0 ] = fail[ 1 ] = 1;
1069 for ( charid = 0; charid < ucharcount ; charid++ ) {
1070 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1072 q[ q_write ] = newstate;
1073 /* set to point at the root */
1074 fail[ q[ q_write++ ] ]=1;
1077 while ( q_read < q_write) {
1078 const U32 cur = q[ q_read++ % numstates ];
1079 base = trie->states[ cur ].trans.base;
1081 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1082 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1084 U32 fail_state = cur;
1087 fail_state = fail[ fail_state ];
1088 fail_base = aho->states[ fail_state ].trans.base;
1089 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1091 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1092 fail[ ch_state ] = fail_state;
1093 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1095 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1097 q[ q_write++ % numstates] = ch_state;
1102 DEBUG_TRIE_COMPILE_MORE_r({
1103 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1104 for( q_read=2; q_read<numstates; q_read++ ) {
1105 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1107 PerlIO_printf(Perl_debug_log, "\n");
1110 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1116 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1119 /* first pass, loop through and scan words */
1120 reg_trie_data *trie;
1122 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1127 /* we just use folder as a flag in utf8 */
1128 const U8 * const folder = ( flags == EXACTF
1130 : ( flags == EXACTFL
1136 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1137 SV *re_trie_maxbuff;
1139 /* these are only used during construction but are useful during
1140 * debugging so we store them in the struct when debugging.
1141 * Wordcount is actually superfluous in debugging as we have
1142 * (AV*)trie->words to use for it, but that's not available when
1143 * not debugging... We could make the macro use the AV during
1144 * debugging though...
1146 U16 trie_wordcount=0;
1147 STRLEN trie_charcount=0;
1148 /*U32 trie_laststate=0;*/
1149 AV *trie_revcharmap;
1151 GET_RE_DEBUG_FLAGS_DECL;
1153 Newxz( trie, 1, reg_trie_data );
1155 trie->startstate = 1;
1156 RExC_rx->data->data[ data_slot ] = (void*)trie;
1157 Newxz( trie->charmap, 256, U16 );
1158 if (!(UTF && folder))
1159 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1161 trie->words = newAV();
1163 TRIE_REVCHARMAP(trie) = newAV();
1165 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1166 if (!SvIOK(re_trie_maxbuff)) {
1167 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1170 PerlIO_printf( Perl_debug_log,
1171 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1172 (int)depth * 2 + 2, "",
1173 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1174 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1176 /* -- First loop and Setup --
1178 We first traverse the branches and scan each word to determine if it
1179 contains widechars, and how many unique chars there are, this is
1180 important as we have to build a table with at least as many columns as we
1183 We use an array of integers to represent the character codes 0..255
1184 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1185 native representation of the character value as the key and IV's for the
1188 *TODO* If we keep track of how many times each character is used we can
1189 remap the columns so that the table compression later on is more
1190 efficient in terms of memory by ensuring most common value is in the
1191 middle and the least common are on the outside. IMO this would be better
1192 than a most to least common mapping as theres a decent chance the most
1193 common letter will share a node with the least common, meaning the node
1194 will not be compressable. With a middle is most common approach the worst
1195 case is when we have the least common nodes twice.
1199 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1200 regnode * const noper = NEXTOPER( cur );
1201 const U8 *uc = (U8*)STRING( noper );
1202 const U8 * const e = uc + STR_LEN( noper );
1204 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1205 const U8 *scan = (U8*)NULL;
1206 U32 wordlen = 0; /* required init */
1209 TRIE_WORDCOUNT(trie)++;
1210 if (OP(noper) == NOTHING) {
1215 TRIE_BITMAP_SET(trie,*uc);
1216 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1218 for ( ; uc < e ; uc += len ) {
1219 TRIE_CHARCOUNT(trie)++;
1223 if ( !trie->charmap[ uvc ] ) {
1224 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1226 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1231 if ( !trie->widecharmap )
1232 trie->widecharmap = newHV();
1234 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1237 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1239 if ( !SvTRUE( *svpp ) ) {
1240 sv_setiv( *svpp, ++trie->uniquecharcount );
1245 if( cur == first ) {
1248 } else if (chars < trie->minlen) {
1250 } else if (chars > trie->maxlen) {
1254 } /* end first pass */
1255 DEBUG_TRIE_COMPILE_r(
1256 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1257 (int)depth * 2 + 2,"",
1258 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1259 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1260 (int)trie->minlen, (int)trie->maxlen )
1262 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1265 We now know what we are dealing with in terms of unique chars and
1266 string sizes so we can calculate how much memory a naive
1267 representation using a flat table will take. If it's over a reasonable
1268 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1269 conservative but potentially much slower representation using an array
1272 At the end we convert both representations into the same compressed
1273 form that will be used in regexec.c for matching with. The latter
1274 is a form that cannot be used to construct with but has memory
1275 properties similar to the list form and access properties similar
1276 to the table form making it both suitable for fast searches and
1277 small enough that its feasable to store for the duration of a program.
1279 See the comment in the code where the compressed table is produced
1280 inplace from the flat tabe representation for an explanation of how
1281 the compression works.
1286 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1288 Second Pass -- Array Of Lists Representation
1290 Each state will be represented by a list of charid:state records
1291 (reg_trie_trans_le) the first such element holds the CUR and LEN
1292 points of the allocated array. (See defines above).
1294 We build the initial structure using the lists, and then convert
1295 it into the compressed table form which allows faster lookups
1296 (but cant be modified once converted).
1299 STRLEN transcount = 1;
1301 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1305 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1307 regnode * const noper = NEXTOPER( cur );
1308 U8 *uc = (U8*)STRING( noper );
1309 const U8 * const e = uc + STR_LEN( noper );
1310 U32 state = 1; /* required init */
1311 U16 charid = 0; /* sanity init */
1312 U8 *scan = (U8*)NULL; /* sanity init */
1313 STRLEN foldlen = 0; /* required init */
1314 U32 wordlen = 0; /* required init */
1315 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1317 if (OP(noper) != NOTHING) {
1318 for ( ; uc < e ; uc += len ) {
1323 charid = trie->charmap[ uvc ];
1325 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1329 charid=(U16)SvIV( *svpp );
1338 if ( !trie->states[ state ].trans.list ) {
1339 TRIE_LIST_NEW( state );
1341 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1342 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1343 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1348 newstate = next_alloc++;
1349 TRIE_LIST_PUSH( state, charid, newstate );
1354 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1356 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1359 TRIE_HANDLE_WORD(state);
1361 } /* end second pass */
1363 TRIE_LASTSTATE(trie) = next_alloc;
1364 Renew( trie->states, next_alloc, reg_trie_state );
1366 /* and now dump it out before we compress it */
1367 DEBUG_TRIE_COMPILE_MORE_r(
1368 dump_trie_interim_list(trie,next_alloc,depth+1)
1371 Newxz( trie->trans, transcount ,reg_trie_trans );
1378 for( state=1 ; state < next_alloc ; state ++ ) {
1382 DEBUG_TRIE_COMPILE_MORE_r(
1383 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1387 if (trie->states[state].trans.list) {
1388 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1392 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1393 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1394 if ( forid < minid ) {
1396 } else if ( forid > maxid ) {
1400 if ( transcount < tp + maxid - minid + 1) {
1402 Renew( trie->trans, transcount, reg_trie_trans );
1403 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1405 base = trie->uniquecharcount + tp - minid;
1406 if ( maxid == minid ) {
1408 for ( ; zp < tp ; zp++ ) {
1409 if ( ! trie->trans[ zp ].next ) {
1410 base = trie->uniquecharcount + zp - minid;
1411 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1412 trie->trans[ zp ].check = state;
1418 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ tp ].check = state;
1424 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1425 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1426 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1427 trie->trans[ tid ].check = state;
1429 tp += ( maxid - minid + 1 );
1431 Safefree(trie->states[ state ].trans.list);
1434 DEBUG_TRIE_COMPILE_MORE_r(
1435 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1438 trie->states[ state ].trans.base=base;
1440 trie->lasttrans = tp + 1;
1444 Second Pass -- Flat Table Representation.
1446 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1447 We know that we will need Charcount+1 trans at most to store the data
1448 (one row per char at worst case) So we preallocate both structures
1449 assuming worst case.
1451 We then construct the trie using only the .next slots of the entry
1454 We use the .check field of the first entry of the node temporarily to
1455 make compression both faster and easier by keeping track of how many non
1456 zero fields are in the node.
1458 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1461 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1462 number representing the first entry of the node, and state as a
1463 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1464 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1465 are 2 entrys per node. eg:
1473 The table is internally in the right hand, idx form. However as we also
1474 have to deal with the states array which is indexed by nodenum we have to
1475 use TRIE_NODENUM() to convert.
1480 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1482 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1483 next_alloc = trie->uniquecharcount + 1;
1486 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1488 regnode * const noper = NEXTOPER( cur );
1489 const U8 *uc = (U8*)STRING( noper );
1490 const U8 * const e = uc + STR_LEN( noper );
1492 U32 state = 1; /* required init */
1494 U16 charid = 0; /* sanity init */
1495 U32 accept_state = 0; /* sanity init */
1496 U8 *scan = (U8*)NULL; /* sanity init */
1498 STRLEN foldlen = 0; /* required init */
1499 U32 wordlen = 0; /* required init */
1500 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1502 if ( OP(noper) != NOTHING ) {
1503 for ( ; uc < e ; uc += len ) {
1508 charid = trie->charmap[ uvc ];
1510 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1511 charid = svpp ? (U16)SvIV(*svpp) : 0;
1515 if ( !trie->trans[ state + charid ].next ) {
1516 trie->trans[ state + charid ].next = next_alloc;
1517 trie->trans[ state ].check++;
1518 next_alloc += trie->uniquecharcount;
1520 state = trie->trans[ state + charid ].next;
1522 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1524 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1527 accept_state = TRIE_NODENUM( state );
1528 TRIE_HANDLE_WORD(accept_state);
1530 } /* end second pass */
1532 /* and now dump it out before we compress it */
1533 DEBUG_TRIE_COMPILE_MORE_r(
1534 dump_trie_interim_table(trie,next_alloc,depth+1)
1539 * Inplace compress the table.*
1541 For sparse data sets the table constructed by the trie algorithm will
1542 be mostly 0/FAIL transitions or to put it another way mostly empty.
1543 (Note that leaf nodes will not contain any transitions.)
1545 This algorithm compresses the tables by eliminating most such
1546 transitions, at the cost of a modest bit of extra work during lookup:
1548 - Each states[] entry contains a .base field which indicates the
1549 index in the state[] array wheres its transition data is stored.
1551 - If .base is 0 there are no valid transitions from that node.
1553 - If .base is nonzero then charid is added to it to find an entry in
1556 -If trans[states[state].base+charid].check!=state then the
1557 transition is taken to be a 0/Fail transition. Thus if there are fail
1558 transitions at the front of the node then the .base offset will point
1559 somewhere inside the previous nodes data (or maybe even into a node
1560 even earlier), but the .check field determines if the transition is
1563 The following process inplace converts the table to the compressed
1564 table: We first do not compress the root node 1,and mark its all its
1565 .check pointers as 1 and set its .base pointer as 1 as well. This
1566 allows to do a DFA construction from the compressed table later, and
1567 ensures that any .base pointers we calculate later are greater than
1570 - We set 'pos' to indicate the first entry of the second node.
1572 - We then iterate over the columns of the node, finding the first and
1573 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1574 and set the .check pointers accordingly, and advance pos
1575 appropriately and repreat for the next node. Note that when we copy
1576 the next pointers we have to convert them from the original
1577 NODEIDX form to NODENUM form as the former is not valid post
1580 - If a node has no transitions used we mark its base as 0 and do not
1581 advance the pos pointer.
1583 - If a node only has one transition we use a second pointer into the
1584 structure to fill in allocated fail transitions from other states.
1585 This pointer is independent of the main pointer and scans forward
1586 looking for null transitions that are allocated to a state. When it
1587 finds one it writes the single transition into the "hole". If the
1588 pointer doesnt find one the single transition is appeneded as normal.
1590 - Once compressed we can Renew/realloc the structures to release the
1593 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1594 specifically Fig 3.47 and the associated pseudocode.
1598 const U32 laststate = TRIE_NODENUM( next_alloc );
1601 TRIE_LASTSTATE(trie) = laststate;
1603 for ( state = 1 ; state < laststate ; state++ ) {
1605 const U32 stateidx = TRIE_NODEIDX( state );
1606 const U32 o_used = trie->trans[ stateidx ].check;
1607 U32 used = trie->trans[ stateidx ].check;
1608 trie->trans[ stateidx ].check = 0;
1610 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1611 if ( flag || trie->trans[ stateidx + charid ].next ) {
1612 if ( trie->trans[ stateidx + charid ].next ) {
1614 for ( ; zp < pos ; zp++ ) {
1615 if ( ! trie->trans[ zp ].next ) {
1619 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1620 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1621 trie->trans[ zp ].check = state;
1622 if ( ++zp > pos ) pos = zp;
1629 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1631 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1632 trie->trans[ pos ].check = state;
1637 trie->lasttrans = pos + 1;
1638 Renew( trie->states, laststate + 1, reg_trie_state);
1639 DEBUG_TRIE_COMPILE_MORE_r(
1640 PerlIO_printf( Perl_debug_log,
1641 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1642 (int)depth * 2 + 2,"",
1643 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1646 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1649 } /* end table compress */
1651 /* resize the trans array to remove unused space */
1652 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1654 /* and now dump out the compressed format */
1655 DEBUG_TRIE_COMPILE_r(
1656 dump_trie(trie,depth+1)
1659 { /* Modify the program and insert the new TRIE node*/
1661 U8 nodetype =(U8)(flags & 0xFF);
1668 This means we convert either the first branch or the first Exact,
1669 depending on whether the thing following (in 'last') is a branch
1670 or not and whther first is the startbranch (ie is it a sub part of
1671 the alternation or is it the whole thing.)
1672 Assuming its a sub part we conver the EXACT otherwise we convert
1673 the whole branch sequence, including the first.
1675 /* Find the node we are going to overwrite */
1676 if ( first == startbranch && OP( last ) != BRANCH ) {
1677 /* whole branch chain */
1680 const regnode *nop = NEXTOPER( convert );
1681 mjd_offset= Node_Offset((nop));
1682 mjd_nodelen= Node_Length((nop));
1685 /* branch sub-chain */
1686 convert = NEXTOPER( first );
1687 NEXT_OFF( first ) = (U16)(last - first);
1689 mjd_offset= Node_Offset((convert));
1690 mjd_nodelen= Node_Length((convert));
1694 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1695 (int)depth * 2 + 2, "",
1696 mjd_offset,mjd_nodelen)
1699 /* But first we check to see if there is a common prefix we can
1700 split out as an EXACT and put in front of the TRIE node. */
1701 trie->startstate= 1;
1702 if ( trie->bitmap && !trie->widecharmap ) {
1705 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1706 (int)depth * 2 + 2, "",
1707 TRIE_LASTSTATE(trie))
1709 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1713 const U32 base = trie->states[ state ].trans.base;
1715 if ( trie->states[state].wordnum )
1718 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1719 if ( ( base + ofs >= trie->uniquecharcount ) &&
1720 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1721 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1723 if ( ++count > 1 ) {
1724 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1725 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1726 if ( state == 1 ) break;
1728 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1730 PerlIO_printf(Perl_debug_log,
1731 "%*sNew Start State=%"UVuf" Class: [",
1732 (int)depth * 2 + 2, "",
1735 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1736 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1738 TRIE_BITMAP_SET(trie,*ch);
1740 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1742 PerlIO_printf(Perl_debug_log, (char*)ch)
1746 TRIE_BITMAP_SET(trie,*ch);
1748 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1755 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1756 const char *ch = SvPV_nolen_const( *tmp );
1758 PerlIO_printf( Perl_debug_log,
1759 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1760 (int)depth * 2 + 2, "",
1764 OP( convert ) = nodetype;
1765 str=STRING(convert);
1773 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1778 regnode *n = convert+NODE_SZ_STR(convert);
1779 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1780 trie->startstate = state;
1781 trie->minlen -= (state - 1);
1782 trie->maxlen -= (state - 1);
1784 regnode *fix = convert;
1786 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1787 while( ++fix < n ) {
1788 Set_Node_Offset_Length(fix, 0, 0);
1794 NEXT_OFF(convert) = (U16)(tail - convert);
1798 if ( trie->maxlen ) {
1799 OP( convert ) = TRIE;
1800 NEXT_OFF( convert ) = (U16)(tail - convert);
1801 ARG_SET( convert, data_slot );
1803 /* store the type in the flags */
1804 convert->flags = nodetype;
1805 /* XXX We really should free up the resource in trie now, as we wont use them */
1807 /* needed for dumping*/
1809 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1810 regnode *opt = convert;
1811 while (++opt<optimize) {
1812 Set_Node_Offset_Length(opt,0,0);
1814 /* We now need to mark all of the space originally used by the
1815 branches as optimized away. This keeps the dumpuntil from
1816 throwing a wobbly as it doesnt use regnext() to traverse the
1818 We also "fix" the offsets
1820 while( optimize < last ) {
1821 mjd_nodelen += Node_Length((optimize));
1822 OP( optimize ) = OPTIMIZED;
1823 Set_Node_Offset_Length(optimize,0,0);
1826 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1828 } /* end node insert */
1830 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1836 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1837 * These need to be revisited when a newer toolchain becomes available.
1839 #if defined(__sparc64__) && defined(__GNUC__)
1840 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1841 # undef SPARC64_GCC_WORKAROUND
1842 # define SPARC64_GCC_WORKAROUND 1
1846 #define DEBUG_PEEP(str,scan,depth) \
1847 DEBUG_OPTIMISE_r({ \
1848 SV * const mysv=sv_newmortal(); \
1849 regnode *Next = regnext(scan); \
1850 regprop(RExC_rx, mysv, scan); \
1851 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1852 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1853 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1856 #define JOIN_EXACT(scan,min,flags) \
1857 if (PL_regkind[OP(scan)] == EXACT) \
1858 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1861 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1862 /* Merge several consecutive EXACTish nodes into one. */
1863 regnode *n = regnext(scan);
1865 regnode *next = scan + NODE_SZ_STR(scan);
1869 regnode *stop = scan;
1871 GET_RE_DEBUG_FLAGS_DECL;
1872 DEBUG_PEEP("join",scan,depth);
1874 /* Skip NOTHING, merge EXACT*. */
1876 ( PL_regkind[OP(n)] == NOTHING ||
1877 (stringok && (OP(n) == OP(scan))))
1879 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1881 if (OP(n) == TAIL || n > next)
1883 if (PL_regkind[OP(n)] == NOTHING) {
1885 DEBUG_PEEP("skip:",n,depth);
1886 NEXT_OFF(scan) += NEXT_OFF(n);
1887 next = n + NODE_STEP_REGNODE;
1894 else if (stringok) {
1895 const int oldl = STR_LEN(scan);
1896 regnode * const nnext = regnext(n);
1898 DEBUG_PEEP("merg",n,depth);
1901 if (oldl + STR_LEN(n) > U8_MAX)
1903 NEXT_OFF(scan) += NEXT_OFF(n);
1904 STR_LEN(scan) += STR_LEN(n);
1905 next = n + NODE_SZ_STR(n);
1906 /* Now we can overwrite *n : */
1907 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1915 #ifdef EXPERIMENTAL_INPLACESCAN
1916 if (flags && !NEXT_OFF(n)) {
1917 DEBUG_PEEP("atch",val,depth);
1918 if (reg_off_by_arg[OP(n)]) {
1919 ARG_SET(n, val - n);
1922 NEXT_OFF(n) = val - n;
1929 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1931 Two problematic code points in Unicode casefolding of EXACT nodes:
1933 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1934 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1940 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1941 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1943 This means that in case-insensitive matching (or "loose matching",
1944 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1945 length of the above casefolded versions) can match a target string
1946 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1947 This would rather mess up the minimum length computation.
1949 What we'll do is to look for the tail four bytes, and then peek
1950 at the preceding two bytes to see whether we need to decrease
1951 the minimum length by four (six minus two).
1953 Thanks to the design of UTF-8, there cannot be false matches:
1954 A sequence of valid UTF-8 bytes cannot be a subsequence of
1955 another valid sequence of UTF-8 bytes.
1958 char * const s0 = STRING(scan), *s, *t;
1959 char * const s1 = s0 + STR_LEN(scan) - 1;
1960 char * const s2 = s1 - 4;
1961 const char t0[] = "\xcc\x88\xcc\x81";
1962 const char * const t1 = t0 + 3;
1965 s < s2 && (t = ninstr(s, s1, t0, t1));
1967 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1968 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1975 n = scan + NODE_SZ_STR(scan);
1977 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1984 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1988 /* REx optimizer. Converts nodes into quickier variants "in place".
1989 Finds fixed substrings. */
1991 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1992 to the position after last scanned or to NULL. */
1997 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1998 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1999 /* scanp: Start here (read-write). */
2000 /* deltap: Write maxlen-minlen here. */
2001 /* last: Stop before this one. */
2004 I32 min = 0, pars = 0, code;
2005 regnode *scan = *scanp, *next;
2007 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2008 int is_inf_internal = 0; /* The studied chunk is infinite */
2009 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2010 scan_data_t data_fake;
2011 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2012 SV *re_trie_maxbuff = NULL;
2014 GET_RE_DEBUG_FLAGS_DECL;
2016 while (scan && OP(scan) != END && scan < last) {
2017 /* Peephole optimizer: */
2018 DEBUG_PEEP("Peep",scan,depth);
2020 JOIN_EXACT(scan,&min,0);
2022 /* Follow the next-chain of the current node and optimize
2023 away all the NOTHINGs from it. */
2024 if (OP(scan) != CURLYX) {
2025 const int max = (reg_off_by_arg[OP(scan)]
2027 /* I32 may be smaller than U16 on CRAYs! */
2028 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2029 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2033 /* Skip NOTHING and LONGJMP. */
2034 while ((n = regnext(n))
2035 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2036 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2037 && off + noff < max)
2039 if (reg_off_by_arg[OP(scan)])
2042 NEXT_OFF(scan) = off;
2047 /* The principal pseudo-switch. Cannot be a switch, since we
2048 look into several different things. */
2049 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2050 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2051 next = regnext(scan);
2053 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2055 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2056 I32 max1 = 0, min1 = I32_MAX, num = 0;
2057 struct regnode_charclass_class accum;
2058 regnode * const startbranch=scan;
2060 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2061 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2062 if (flags & SCF_DO_STCLASS)
2063 cl_init_zero(pRExC_state, &accum);
2065 while (OP(scan) == code) {
2066 I32 deltanext, minnext, f = 0, fake;
2067 struct regnode_charclass_class this_class;
2070 data_fake.flags = 0;
2072 data_fake.whilem_c = data->whilem_c;
2073 data_fake.last_closep = data->last_closep;
2076 data_fake.last_closep = &fake;
2077 next = regnext(scan);
2078 scan = NEXTOPER(scan);
2080 scan = NEXTOPER(scan);
2081 if (flags & SCF_DO_STCLASS) {
2082 cl_init(pRExC_state, &this_class);
2083 data_fake.start_class = &this_class;
2084 f = SCF_DO_STCLASS_AND;
2086 if (flags & SCF_WHILEM_VISITED_POS)
2087 f |= SCF_WHILEM_VISITED_POS;
2089 /* we suppose the run is continuous, last=next...*/
2090 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2091 next, &data_fake, f,depth+1);
2094 if (max1 < minnext + deltanext)
2095 max1 = minnext + deltanext;
2096 if (deltanext == I32_MAX)
2097 is_inf = is_inf_internal = 1;
2099 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2102 if (data_fake.flags & SF_HAS_EVAL)
2103 data->flags |= SF_HAS_EVAL;
2104 data->whilem_c = data_fake.whilem_c;
2106 if (flags & SCF_DO_STCLASS)
2107 cl_or(pRExC_state, &accum, &this_class);
2108 if (code == SUSPEND)
2111 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2113 if (flags & SCF_DO_SUBSTR) {
2114 data->pos_min += min1;
2115 data->pos_delta += max1 - min1;
2116 if (max1 != min1 || is_inf)
2117 data->longest = &(data->longest_float);
2120 delta += max1 - min1;
2121 if (flags & SCF_DO_STCLASS_OR) {
2122 cl_or(pRExC_state, data->start_class, &accum);
2124 cl_and(data->start_class, &and_with);
2125 flags &= ~SCF_DO_STCLASS;
2128 else if (flags & SCF_DO_STCLASS_AND) {
2130 cl_and(data->start_class, &accum);
2131 flags &= ~SCF_DO_STCLASS;
2134 /* Switch to OR mode: cache the old value of
2135 * data->start_class */
2136 StructCopy(data->start_class, &and_with,
2137 struct regnode_charclass_class);
2138 flags &= ~SCF_DO_STCLASS_AND;
2139 StructCopy(&accum, data->start_class,
2140 struct regnode_charclass_class);
2141 flags |= SCF_DO_STCLASS_OR;
2142 data->start_class->flags |= ANYOF_EOS;
2148 Assuming this was/is a branch we are dealing with: 'scan' now
2149 points at the item that follows the branch sequence, whatever
2150 it is. We now start at the beginning of the sequence and look
2156 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2158 If we can find such a subseqence we need to turn the first
2159 element into a trie and then add the subsequent branch exact
2160 strings to the trie.
2164 1. patterns where the whole set of branch can be converted to a trie,
2166 2. patterns where only a subset of the alternations can be
2167 converted to a trie.
2169 In case 1 we can replace the whole set with a single regop
2170 for the trie. In case 2 we need to keep the start and end
2173 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2174 becomes BRANCH TRIE; BRANCH X;
2176 Hypthetically when we know the regex isnt anchored we can
2177 turn a case 1 into a DFA and let it rip... Every time it finds a match
2178 it would just call its tail, no WHILEM/CURLY needed.
2181 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2183 if (!re_trie_maxbuff) {
2184 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2185 if (!SvIOK(re_trie_maxbuff))
2186 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2188 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2190 regnode *first = (regnode *)NULL;
2191 regnode *last = (regnode *)NULL;
2192 regnode *tail = scan;
2197 SV * const mysv = sv_newmortal(); /* for dumping */
2199 /* var tail is used because there may be a TAIL
2200 regop in the way. Ie, the exacts will point to the
2201 thing following the TAIL, but the last branch will
2202 point at the TAIL. So we advance tail. If we
2203 have nested (?:) we may have to move through several
2207 while ( OP( tail ) == TAIL ) {
2208 /* this is the TAIL generated by (?:) */
2209 tail = regnext( tail );
2214 regprop(RExC_rx, mysv, tail );
2215 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2216 (int)depth * 2 + 2, "",
2217 "Looking for TRIE'able sequences. Tail node is: ",
2218 SvPV_nolen_const( mysv )
2224 step through the branches, cur represents each
2225 branch, noper is the first thing to be matched
2226 as part of that branch and noper_next is the
2227 regnext() of that node. if noper is an EXACT
2228 and noper_next is the same as scan (our current
2229 position in the regex) then the EXACT branch is
2230 a possible optimization target. Once we have
2231 two or more consequetive such branches we can
2232 create a trie of the EXACT's contents and stich
2233 it in place. If the sequence represents all of
2234 the branches we eliminate the whole thing and
2235 replace it with a single TRIE. If it is a
2236 subsequence then we need to stitch it in. This
2237 means the first branch has to remain, and needs
2238 to be repointed at the item on the branch chain
2239 following the last branch optimized. This could
2240 be either a BRANCH, in which case the
2241 subsequence is internal, or it could be the
2242 item following the branch sequence in which
2243 case the subsequence is at the end.
2247 /* dont use tail as the end marker for this traverse */
2248 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2249 regnode * const noper = NEXTOPER( cur );
2250 regnode * const noper_next = regnext( noper );
2253 regprop(RExC_rx, mysv, cur);
2254 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2255 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2257 regprop(RExC_rx, mysv, noper);
2258 PerlIO_printf( Perl_debug_log, " -> %s",
2259 SvPV_nolen_const(mysv));
2262 regprop(RExC_rx, mysv, noper_next );
2263 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2264 SvPV_nolen_const(mysv));
2266 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2267 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2269 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2270 : PL_regkind[ OP( noper ) ] == EXACT )
2271 || OP(noper) == NOTHING )
2272 && noper_next == tail && count<U16_MAX)
2275 if ( !first || optype == NOTHING ) {
2276 if (!first) first = cur;
2277 optype = OP( noper );
2283 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2285 if ( PL_regkind[ OP( noper ) ] == EXACT
2286 && noper_next == tail )
2290 optype = OP( noper );
2300 regprop(RExC_rx, mysv, cur);
2301 PerlIO_printf( Perl_debug_log,
2302 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2303 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2307 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2308 #ifdef TRIE_STUDY_OPT
2309 if ( made && startbranch == first ) {
2310 if ( OP(first)!=TRIE )
2311 flags |= SCF_EXACT_TRIE;
2313 regnode *chk=*scanp;
2314 while ( OP( chk ) == OPEN )
2315 chk = regnext( chk );
2317 flags |= SCF_EXACT_TRIE;
2326 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2327 scan = NEXTOPER(NEXTOPER(scan));
2328 } else /* single branch is optimized. */
2329 scan = NEXTOPER(scan);
2332 else if (OP(scan) == EXACT) {
2333 I32 l = STR_LEN(scan);
2336 const U8 * const s = (U8*)STRING(scan);
2337 l = utf8_length(s, s + l);
2338 uc = utf8_to_uvchr(s, NULL);
2340 uc = *((U8*)STRING(scan));
2343 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2344 /* The code below prefers earlier match for fixed
2345 offset, later match for variable offset. */
2346 if (data->last_end == -1) { /* Update the start info. */
2347 data->last_start_min = data->pos_min;
2348 data->last_start_max = is_inf
2349 ? I32_MAX : data->pos_min + data->pos_delta;
2351 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2353 SvUTF8_on(data->last_found);
2355 SV * const sv = data->last_found;
2356 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2357 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2358 if (mg && mg->mg_len >= 0)
2359 mg->mg_len += utf8_length((U8*)STRING(scan),
2360 (U8*)STRING(scan)+STR_LEN(scan));
2362 data->last_end = data->pos_min + l;
2363 data->pos_min += l; /* As in the first entry. */
2364 data->flags &= ~SF_BEFORE_EOL;
2366 if (flags & SCF_DO_STCLASS_AND) {
2367 /* Check whether it is compatible with what we know already! */
2371 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2372 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2373 && (!(data->start_class->flags & ANYOF_FOLD)
2374 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2377 ANYOF_CLASS_ZERO(data->start_class);
2378 ANYOF_BITMAP_ZERO(data->start_class);
2380 ANYOF_BITMAP_SET(data->start_class, uc);
2381 data->start_class->flags &= ~ANYOF_EOS;
2383 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2385 else if (flags & SCF_DO_STCLASS_OR) {
2386 /* false positive possible if the class is case-folded */
2388 ANYOF_BITMAP_SET(data->start_class, uc);
2390 data->start_class->flags |= ANYOF_UNICODE_ALL;
2391 data->start_class->flags &= ~ANYOF_EOS;
2392 cl_and(data->start_class, &and_with);
2394 flags &= ~SCF_DO_STCLASS;
2396 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2397 I32 l = STR_LEN(scan);
2398 UV uc = *((U8*)STRING(scan));
2400 /* Search for fixed substrings supports EXACT only. */
2401 if (flags & SCF_DO_SUBSTR) {
2403 scan_commit(pRExC_state, data);
2406 const U8 * const s = (U8 *)STRING(scan);
2407 l = utf8_length(s, s + l);
2408 uc = utf8_to_uvchr(s, NULL);
2411 if (flags & SCF_DO_SUBSTR)
2413 if (flags & SCF_DO_STCLASS_AND) {
2414 /* Check whether it is compatible with what we know already! */
2418 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2419 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2420 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2422 ANYOF_CLASS_ZERO(data->start_class);
2423 ANYOF_BITMAP_ZERO(data->start_class);
2425 ANYOF_BITMAP_SET(data->start_class, uc);
2426 data->start_class->flags &= ~ANYOF_EOS;
2427 data->start_class->flags |= ANYOF_FOLD;
2428 if (OP(scan) == EXACTFL)
2429 data->start_class->flags |= ANYOF_LOCALE;
2432 else if (flags & SCF_DO_STCLASS_OR) {
2433 if (data->start_class->flags & ANYOF_FOLD) {
2434 /* false positive possible if the class is case-folded.
2435 Assume that the locale settings are the same... */
2437 ANYOF_BITMAP_SET(data->start_class, uc);
2438 data->start_class->flags &= ~ANYOF_EOS;
2440 cl_and(data->start_class, &and_with);
2442 flags &= ~SCF_DO_STCLASS;
2444 #ifdef TRIE_STUDY_OPT
2445 else if (OP(scan) == TRIE) {
2446 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2447 min += trie->minlen;
2448 delta += (trie->maxlen - trie->minlen);
2449 flags &= ~SCF_DO_STCLASS; /* xxx */
2450 if (flags & SCF_DO_SUBSTR) {
2451 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2452 data->pos_min += trie->minlen;
2453 data->pos_delta += (trie->maxlen - trie->minlen);
2454 if (trie->maxlen != trie->minlen)
2455 data->longest = &(data->longest_float);
2459 else if (strchr((const char*)PL_varies,OP(scan))) {
2460 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2461 I32 f = flags, pos_before = 0;
2462 regnode * const oscan = scan;
2463 struct regnode_charclass_class this_class;
2464 struct regnode_charclass_class *oclass = NULL;
2465 I32 next_is_eval = 0;
2467 switch (PL_regkind[OP(scan)]) {
2468 case WHILEM: /* End of (?:...)* . */
2469 scan = NEXTOPER(scan);
2472 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2473 next = NEXTOPER(scan);
2474 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2476 maxcount = REG_INFTY;
2477 next = regnext(scan);
2478 scan = NEXTOPER(scan);
2482 if (flags & SCF_DO_SUBSTR)
2487 if (flags & SCF_DO_STCLASS) {
2489 maxcount = REG_INFTY;
2490 next = regnext(scan);
2491 scan = NEXTOPER(scan);
2494 is_inf = is_inf_internal = 1;
2495 scan = regnext(scan);
2496 if (flags & SCF_DO_SUBSTR) {
2497 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2498 data->longest = &(data->longest_float);
2500 goto optimize_curly_tail;
2502 mincount = ARG1(scan);
2503 maxcount = ARG2(scan);
2504 next = regnext(scan);
2505 if (OP(scan) == CURLYX) {
2506 I32 lp = (data ? *(data->last_closep) : 0);
2507 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2509 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2510 next_is_eval = (OP(scan) == EVAL);
2512 if (flags & SCF_DO_SUBSTR) {
2513 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2514 pos_before = data->pos_min;
2518 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2520 data->flags |= SF_IS_INF;
2522 if (flags & SCF_DO_STCLASS) {
2523 cl_init(pRExC_state, &this_class);
2524 oclass = data->start_class;
2525 data->start_class = &this_class;
2526 f |= SCF_DO_STCLASS_AND;
2527 f &= ~SCF_DO_STCLASS_OR;
2529 /* These are the cases when once a subexpression
2530 fails at a particular position, it cannot succeed
2531 even after backtracking at the enclosing scope.
2533 XXXX what if minimal match and we are at the
2534 initial run of {n,m}? */
2535 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2536 f &= ~SCF_WHILEM_VISITED_POS;
2538 /* This will finish on WHILEM, setting scan, or on NULL: */
2539 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2541 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2543 if (flags & SCF_DO_STCLASS)
2544 data->start_class = oclass;
2545 if (mincount == 0 || minnext == 0) {
2546 if (flags & SCF_DO_STCLASS_OR) {
2547 cl_or(pRExC_state, data->start_class, &this_class);
2549 else if (flags & SCF_DO_STCLASS_AND) {
2550 /* Switch to OR mode: cache the old value of
2551 * data->start_class */
2552 StructCopy(data->start_class, &and_with,
2553 struct regnode_charclass_class);
2554 flags &= ~SCF_DO_STCLASS_AND;
2555 StructCopy(&this_class, data->start_class,
2556 struct regnode_charclass_class);
2557 flags |= SCF_DO_STCLASS_OR;
2558 data->start_class->flags |= ANYOF_EOS;
2560 } else { /* Non-zero len */
2561 if (flags & SCF_DO_STCLASS_OR) {
2562 cl_or(pRExC_state, data->start_class, &this_class);
2563 cl_and(data->start_class, &and_with);
2565 else if (flags & SCF_DO_STCLASS_AND)
2566 cl_and(data->start_class, &this_class);
2567 flags &= ~SCF_DO_STCLASS;
2569 if (!scan) /* It was not CURLYX, but CURLY. */
2571 if ( /* ? quantifier ok, except for (?{ ... }) */
2572 (next_is_eval || !(mincount == 0 && maxcount == 1))
2573 && (minnext == 0) && (deltanext == 0)
2574 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2575 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2576 && ckWARN(WARN_REGEXP))
2579 "Quantifier unexpected on zero-length expression");
2582 min += minnext * mincount;
2583 is_inf_internal |= ((maxcount == REG_INFTY
2584 && (minnext + deltanext) > 0)
2585 || deltanext == I32_MAX);
2586 is_inf |= is_inf_internal;
2587 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2589 /* Try powerful optimization CURLYX => CURLYN. */
2590 if ( OP(oscan) == CURLYX && data
2591 && data->flags & SF_IN_PAR
2592 && !(data->flags & SF_HAS_EVAL)
2593 && !deltanext && minnext == 1 ) {
2594 /* Try to optimize to CURLYN. */
2595 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2596 regnode * const nxt1 = nxt;
2603 if (!strchr((const char*)PL_simple,OP(nxt))
2604 && !(PL_regkind[OP(nxt)] == EXACT
2605 && STR_LEN(nxt) == 1))
2611 if (OP(nxt) != CLOSE)
2613 /* Now we know that nxt2 is the only contents: */
2614 oscan->flags = (U8)ARG(nxt);
2616 OP(nxt1) = NOTHING; /* was OPEN. */
2618 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2619 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2620 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2621 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2622 OP(nxt + 1) = OPTIMIZED; /* was count. */
2623 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2628 /* Try optimization CURLYX => CURLYM. */
2629 if ( OP(oscan) == CURLYX && data
2630 && !(data->flags & SF_HAS_PAR)
2631 && !(data->flags & SF_HAS_EVAL)
2632 && !deltanext /* atom is fixed width */
2633 && minnext != 0 /* CURLYM can't handle zero width */
2635 /* XXXX How to optimize if data == 0? */
2636 /* Optimize to a simpler form. */
2637 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2641 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2642 && (OP(nxt2) != WHILEM))
2644 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2645 /* Need to optimize away parenths. */
2646 if (data->flags & SF_IN_PAR) {
2647 /* Set the parenth number. */
2648 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2650 if (OP(nxt) != CLOSE)
2651 FAIL("Panic opt close");
2652 oscan->flags = (U8)ARG(nxt);
2653 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2654 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2656 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2657 OP(nxt + 1) = OPTIMIZED; /* was count. */
2658 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2659 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2662 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2663 regnode *nnxt = regnext(nxt1);
2666 if (reg_off_by_arg[OP(nxt1)])
2667 ARG_SET(nxt1, nxt2 - nxt1);
2668 else if (nxt2 - nxt1 < U16_MAX)
2669 NEXT_OFF(nxt1) = nxt2 - nxt1;
2671 OP(nxt) = NOTHING; /* Cannot beautify */
2676 /* Optimize again: */
2677 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2683 else if ((OP(oscan) == CURLYX)
2684 && (flags & SCF_WHILEM_VISITED_POS)
2685 /* See the comment on a similar expression above.
2686 However, this time it not a subexpression
2687 we care about, but the expression itself. */
2688 && (maxcount == REG_INFTY)
2689 && data && ++data->whilem_c < 16) {
2690 /* This stays as CURLYX, we can put the count/of pair. */
2691 /* Find WHILEM (as in regexec.c) */
2692 regnode *nxt = oscan + NEXT_OFF(oscan);
2694 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2696 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2697 | (RExC_whilem_seen << 4)); /* On WHILEM */
2699 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2701 if (flags & SCF_DO_SUBSTR) {
2702 SV *last_str = NULL;
2703 int counted = mincount != 0;
2705 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2706 #if defined(SPARC64_GCC_WORKAROUND)
2709 const char *s = NULL;
2712 if (pos_before >= data->last_start_min)
2715 b = data->last_start_min;
2718 s = SvPV_const(data->last_found, l);
2719 old = b - data->last_start_min;
2722 I32 b = pos_before >= data->last_start_min
2723 ? pos_before : data->last_start_min;
2725 const char * const s = SvPV_const(data->last_found, l);
2726 I32 old = b - data->last_start_min;
2730 old = utf8_hop((U8*)s, old) - (U8*)s;
2733 /* Get the added string: */
2734 last_str = newSVpvn(s + old, l);
2736 SvUTF8_on(last_str);
2737 if (deltanext == 0 && pos_before == b) {
2738 /* What was added is a constant string */
2740 SvGROW(last_str, (mincount * l) + 1);
2741 repeatcpy(SvPVX(last_str) + l,
2742 SvPVX_const(last_str), l, mincount - 1);
2743 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2744 /* Add additional parts. */
2745 SvCUR_set(data->last_found,
2746 SvCUR(data->last_found) - l);
2747 sv_catsv(data->last_found, last_str);
2749 SV * sv = data->last_found;
2751 SvUTF8(sv) && SvMAGICAL(sv) ?
2752 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2753 if (mg && mg->mg_len >= 0)
2754 mg->mg_len += CHR_SVLEN(last_str);
2756 data->last_end += l * (mincount - 1);
2759 /* start offset must point into the last copy */
2760 data->last_start_min += minnext * (mincount - 1);
2761 data->last_start_max += is_inf ? I32_MAX
2762 : (maxcount - 1) * (minnext + data->pos_delta);
2765 /* It is counted once already... */
2766 data->pos_min += minnext * (mincount - counted);
2767 data->pos_delta += - counted * deltanext +
2768 (minnext + deltanext) * maxcount - minnext * mincount;
2769 if (mincount != maxcount) {
2770 /* Cannot extend fixed substrings found inside
2772 scan_commit(pRExC_state,data);
2773 if (mincount && last_str) {
2774 SV * const sv = data->last_found;
2775 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2776 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2780 sv_setsv(sv, last_str);
2781 data->last_end = data->pos_min;
2782 data->last_start_min =
2783 data->pos_min - CHR_SVLEN(last_str);
2784 data->last_start_max = is_inf
2786 : data->pos_min + data->pos_delta
2787 - CHR_SVLEN(last_str);
2789 data->longest = &(data->longest_float);
2791 SvREFCNT_dec(last_str);
2793 if (data && (fl & SF_HAS_EVAL))
2794 data->flags |= SF_HAS_EVAL;
2795 optimize_curly_tail:
2796 if (OP(oscan) != CURLYX) {
2797 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2799 NEXT_OFF(oscan) += NEXT_OFF(next);
2802 default: /* REF and CLUMP only? */
2803 if (flags & SCF_DO_SUBSTR) {
2804 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2805 data->longest = &(data->longest_float);
2807 is_inf = is_inf_internal = 1;
2808 if (flags & SCF_DO_STCLASS_OR)
2809 cl_anything(pRExC_state, data->start_class);
2810 flags &= ~SCF_DO_STCLASS;
2814 else if (strchr((const char*)PL_simple,OP(scan))) {
2817 if (flags & SCF_DO_SUBSTR) {
2818 scan_commit(pRExC_state,data);
2822 if (flags & SCF_DO_STCLASS) {
2823 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2825 /* Some of the logic below assumes that switching
2826 locale on will only add false positives. */
2827 switch (PL_regkind[OP(scan)]) {
2831 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2832 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2833 cl_anything(pRExC_state, data->start_class);
2836 if (OP(scan) == SANY)
2838 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2839 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2840 || (data->start_class->flags & ANYOF_CLASS));
2841 cl_anything(pRExC_state, data->start_class);
2843 if (flags & SCF_DO_STCLASS_AND || !value)
2844 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2847 if (flags & SCF_DO_STCLASS_AND)
2848 cl_and(data->start_class,
2849 (struct regnode_charclass_class*)scan);
2851 cl_or(pRExC_state, data->start_class,
2852 (struct regnode_charclass_class*)scan);
2855 if (flags & SCF_DO_STCLASS_AND) {
2856 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2857 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2858 for (value = 0; value < 256; value++)
2859 if (!isALNUM(value))
2860 ANYOF_BITMAP_CLEAR(data->start_class, value);
2864 if (data->start_class->flags & ANYOF_LOCALE)
2865 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2867 for (value = 0; value < 256; value++)
2869 ANYOF_BITMAP_SET(data->start_class, value);
2874 if (flags & SCF_DO_STCLASS_AND) {
2875 if (data->start_class->flags & ANYOF_LOCALE)
2876 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2879 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2880 data->start_class->flags |= ANYOF_LOCALE;
2884 if (flags & SCF_DO_STCLASS_AND) {
2885 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2886 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2887 for (value = 0; value < 256; value++)
2889 ANYOF_BITMAP_CLEAR(data->start_class, value);
2893 if (data->start_class->flags & ANYOF_LOCALE)
2894 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2896 for (value = 0; value < 256; value++)
2897 if (!isALNUM(value))
2898 ANYOF_BITMAP_SET(data->start_class, value);
2903 if (flags & SCF_DO_STCLASS_AND) {
2904 if (data->start_class->flags & ANYOF_LOCALE)
2905 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2908 data->start_class->flags |= ANYOF_LOCALE;
2909 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2913 if (flags & SCF_DO_STCLASS_AND) {
2914 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2915 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2916 for (value = 0; value < 256; value++)
2917 if (!isSPACE(value))
2918 ANYOF_BITMAP_CLEAR(data->start_class, value);
2922 if (data->start_class->flags & ANYOF_LOCALE)
2923 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2925 for (value = 0; value < 256; value++)
2927 ANYOF_BITMAP_SET(data->start_class, value);
2932 if (flags & SCF_DO_STCLASS_AND) {
2933 if (data->start_class->flags & ANYOF_LOCALE)
2934 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2937 data->start_class->flags |= ANYOF_LOCALE;
2938 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2942 if (flags & SCF_DO_STCLASS_AND) {
2943 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2944 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2945 for (value = 0; value < 256; value++)
2947 ANYOF_BITMAP_CLEAR(data->start_class, value);
2951 if (data->start_class->flags & ANYOF_LOCALE)
2952 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2954 for (value = 0; value < 256; value++)
2955 if (!isSPACE(value))
2956 ANYOF_BITMAP_SET(data->start_class, value);
2961 if (flags & SCF_DO_STCLASS_AND) {
2962 if (data->start_class->flags & ANYOF_LOCALE) {
2963 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2964 for (value = 0; value < 256; value++)
2965 if (!isSPACE(value))
2966 ANYOF_BITMAP_CLEAR(data->start_class, value);
2970 data->start_class->flags |= ANYOF_LOCALE;
2971 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2975 if (flags & SCF_DO_STCLASS_AND) {
2976 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2977 for (value = 0; value < 256; value++)
2978 if (!isDIGIT(value))
2979 ANYOF_BITMAP_CLEAR(data->start_class, value);
2982 if (data->start_class->flags & ANYOF_LOCALE)
2983 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2985 for (value = 0; value < 256; value++)
2987 ANYOF_BITMAP_SET(data->start_class, value);
2992 if (flags & SCF_DO_STCLASS_AND) {
2993 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2994 for (value = 0; value < 256; value++)
2996 ANYOF_BITMAP_CLEAR(data->start_class, value);
2999 if (data->start_class->flags & ANYOF_LOCALE)
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3002 for (value = 0; value < 256; value++)
3003 if (!isDIGIT(value))
3004 ANYOF_BITMAP_SET(data->start_class, value);
3009 if (flags & SCF_DO_STCLASS_OR)
3010 cl_and(data->start_class, &and_with);
3011 flags &= ~SCF_DO_STCLASS;
3014 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3015 data->flags |= (OP(scan) == MEOL
3019 else if ( PL_regkind[OP(scan)] == BRANCHJ
3020 /* Lookbehind, or need to calculate parens/evals/stclass: */
3021 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3022 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3023 /* Lookahead/lookbehind */
3024 I32 deltanext, minnext, fake = 0;
3026 struct regnode_charclass_class intrnl;
3029 data_fake.flags = 0;
3031 data_fake.whilem_c = data->whilem_c;
3032 data_fake.last_closep = data->last_closep;
3035 data_fake.last_closep = &fake;
3036 if ( flags & SCF_DO_STCLASS && !scan->flags
3037 && OP(scan) == IFMATCH ) { /* Lookahead */
3038 cl_init(pRExC_state, &intrnl);
3039 data_fake.start_class = &intrnl;
3040 f |= SCF_DO_STCLASS_AND;
3042 if (flags & SCF_WHILEM_VISITED_POS)
3043 f |= SCF_WHILEM_VISITED_POS;
3044 next = regnext(scan);
3045 nscan = NEXTOPER(NEXTOPER(scan));
3046 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3049 vFAIL("Variable length lookbehind not implemented");
3051 else if (minnext > U8_MAX) {
3052 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3054 scan->flags = (U8)minnext;
3057 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3059 if (data_fake.flags & SF_HAS_EVAL)
3060 data->flags |= SF_HAS_EVAL;
3061 data->whilem_c = data_fake.whilem_c;
3063 if (f & SCF_DO_STCLASS_AND) {
3064 const int was = (data->start_class->flags & ANYOF_EOS);
3066 cl_and(data->start_class, &intrnl);
3068 data->start_class->flags |= ANYOF_EOS;
3071 else if (OP(scan) == OPEN) {
3074 else if (OP(scan) == CLOSE) {
3075 if ((I32)ARG(scan) == is_par) {
3076 next = regnext(scan);
3078 if ( next && (OP(next) != WHILEM) && next < last)
3079 is_par = 0; /* Disable optimization */
3082 *(data->last_closep) = ARG(scan);
3084 else if (OP(scan) == EVAL) {
3086 data->flags |= SF_HAS_EVAL;
3088 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3089 if (flags & SCF_DO_SUBSTR) {
3090 scan_commit(pRExC_state,data);
3091 data->longest = &(data->longest_float);
3093 is_inf = is_inf_internal = 1;
3094 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3095 cl_anything(pRExC_state, data->start_class);
3096 flags &= ~SCF_DO_STCLASS;
3098 /* Else: zero-length, ignore. */
3099 scan = regnext(scan);
3104 *deltap = is_inf_internal ? I32_MAX : delta;
3105 if (flags & SCF_DO_SUBSTR && is_inf)
3106 data->pos_delta = I32_MAX - data->pos_min;
3107 if (is_par > U8_MAX)
3109 if (is_par && pars==1 && data) {
3110 data->flags |= SF_IN_PAR;
3111 data->flags &= ~SF_HAS_PAR;
3113 else if (pars && data) {
3114 data->flags |= SF_HAS_PAR;
3115 data->flags &= ~SF_IN_PAR;
3117 if (flags & SCF_DO_STCLASS_OR)
3118 cl_and(data->start_class, &and_with);
3119 if (flags & SCF_EXACT_TRIE)
3120 data->flags |= SCF_EXACT_TRIE;
3125 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3127 if (RExC_rx->data) {
3128 Renewc(RExC_rx->data,
3129 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3130 char, struct reg_data);
3131 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3132 RExC_rx->data->count += n;
3135 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3136 char, struct reg_data);
3137 Newx(RExC_rx->data->what, n, U8);
3138 RExC_rx->data->count = n;
3140 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3141 return RExC_rx->data->count - n;
3144 #ifndef PERL_IN_XSUB_RE
3146 Perl_reginitcolors(pTHX)
3149 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3151 char *t = savepv(s);
3155 t = strchr(t, '\t');
3161 PL_colors[i] = t = (char *)"";
3166 PL_colors[i++] = (char *)"";
3174 - pregcomp - compile a regular expression into internal code
3176 * We can't allocate space until we know how big the compiled form will be,
3177 * but we can't compile it (and thus know how big it is) until we've got a
3178 * place to put the code. So we cheat: we compile it twice, once with code
3179 * generation turned off and size counting turned on, and once "for real".
3180 * This also means that we don't allocate space until we are sure that the
3181 * thing really will compile successfully, and we never have to move the
3182 * code and thus invalidate pointers into it. (Note that it has to be in
3183 * one piece because free() must be able to free it all.) [NB: not true in perl]
3185 * Beware that the optimization-preparation code in here knows about some
3186 * of the structure of the compiled regexp. [I'll say.]
3189 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3200 RExC_state_t RExC_state;
3201 RExC_state_t * const pRExC_state = &RExC_state;
3202 #ifdef TRIE_STUDY_OPT
3204 RExC_state_t copyRExC_state;
3207 GET_RE_DEBUG_FLAGS_DECL;
3210 FAIL("NULL regexp argument");
3212 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3215 DEBUG_r(if (!PL_colorset) reginitcolors());
3217 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3218 PL_colors[4],PL_colors[5],PL_colors[0],
3219 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3221 RExC_flags = pm->op_pmflags;
3225 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3226 RExC_seen_evals = 0;
3229 /* First pass: determine size, legality. */
3236 RExC_emit = &PL_regdummy;
3237 RExC_whilem_seen = 0;
3238 #if 0 /* REGC() is (currently) a NOP at the first pass.
3239 * Clever compilers notice this and complain. --jhi */
3240 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3242 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3243 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3244 RExC_precomp = NULL;
3247 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3248 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3249 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3252 RExC_lastparse=NULL;
3256 /* Small enough for pointer-storage convention?
3257 If extralen==0, this means that we will not need long jumps. */
3258 if (RExC_size >= 0x10000L && RExC_extralen)
3259 RExC_size += RExC_extralen;
3262 if (RExC_whilem_seen > 15)
3263 RExC_whilem_seen = 15;
3265 /* Allocate space and initialize. */
3266 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3269 FAIL("Regexp out of space");
3272 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3273 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3276 r->prelen = xend - exp;
3277 r->precomp = savepvn(RExC_precomp, r->prelen);
3279 #ifdef PERL_OLD_COPY_ON_WRITE
3280 r->saved_copy = NULL;
3282 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3283 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3284 r->lastparen = 0; /* mg.c reads this. */
3286 r->substrs = 0; /* Useful during FAIL. */
3287 r->startp = 0; /* Useful during FAIL. */
3288 r->endp = 0; /* Useful during FAIL. */
3290 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3292 r->offsets[0] = RExC_size;
3294 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3295 "%s %"UVuf" bytes for offset annotations.\n",
3296 r->offsets ? "Got" : "Couldn't get",
3297 (UV)((2*RExC_size+1) * sizeof(U32))));
3301 /* Second pass: emit code. */
3302 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3307 RExC_emit_start = r->program;
3308 RExC_emit = r->program;
3309 /* Store the count of eval-groups for security checks: */
3310 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3311 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3313 if (reg(pRExC_state, 0, &flags,1) == NULL)
3315 /* XXXX To minimize changes to RE engine we always allocate
3316 3-units-long substrs field. */
3317 Newx(r->substrs, 1, struct reg_substr_data);
3320 Zero(r->substrs, 1, struct reg_substr_data);
3321 StructCopy(&zero_scan_data, &data, scan_data_t);
3323 #ifdef TRIE_STUDY_OPT
3325 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3326 RExC_state=copyRExC_state;
3327 if (data.longest_fixed)
3328 SvREFCNT_dec(data.longest_fixed);
3329 if (data.longest_float)
3330 SvREFCNT_dec(data.longest_float);
3331 if (data.last_found)
3332 SvREFCNT_dec(data.last_found);
3334 copyRExC_state=RExC_state;
3337 /* Dig out information for optimizations. */
3338 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3339 pm->op_pmflags = RExC_flags;
3341 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3342 r->regstclass = NULL;
3343 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3344 r->reganch |= ROPT_NAUGHTY;
3345 scan = r->program + 1; /* First BRANCH. */
3347 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3348 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3350 STRLEN longest_float_length, longest_fixed_length;
3351 struct regnode_charclass_class ch_class; /* pointed to by data */
3353 I32 last_close = 0; /* pointed to by data */
3356 /* Skip introductions and multiplicators >= 1. */
3357 while ((OP(first) == OPEN && (sawopen = 1)) ||
3358 /* An OR of *one* alternative - should not happen now. */
3359 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3360 /* for now we can't handle lookbehind IFMATCH*/
3361 (OP(first) == IFMATCH && !first->flags) ||
3362 (OP(first) == PLUS) ||
3363 (OP(first) == MINMOD) ||
3364 /* An {n,m} with n>0 */
3365 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3367 DEBUG_PEEP("first:",first,0);
3368 if (OP(first) == PLUS)
3371 first += regarglen[OP(first)];
3372 if (OP(first) == IFMATCH) {
3373 first = NEXTOPER(first);
3374 first += EXTRA_STEP_2ARGS;
3375 } else /*xxx possible optimisation for /(?=)/*/
3376 first = NEXTOPER(first);
3379 /* Starting-point info. */
3381 /* Ignore EXACT as we deal with it later. */
3382 if (PL_regkind[OP(first)] == EXACT) {
3383 if (OP(first) == EXACT)
3384 NOOP; /* Empty, get anchored substr later. */
3385 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3386 r->regstclass = first;
3389 else if (OP(first) == TRIE &&
3390 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3392 /* this can happen only on restudy */
3393 struct regnode_1 *trie_op;
3394 Newxz(trie_op,1,struct regnode_1);
3395 StructCopy(first,trie_op,struct regnode_1);
3396 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3397 r->regstclass = (regnode *)trie_op;
3400 else if (strchr((const char*)PL_simple,OP(first)))
3401 r->regstclass = first;
3402 else if (PL_regkind[OP(first)] == BOUND ||
3403 PL_regkind[OP(first)] == NBOUND)
3404 r->regstclass = first;
3405 else if (PL_regkind[OP(first)] == BOL) {
3406 r->reganch |= (OP(first) == MBOL
3408 : (OP(first) == SBOL
3411 first = NEXTOPER(first);
3414 else if (OP(first) == GPOS) {
3415 r->reganch |= ROPT_ANCH_GPOS;
3416 first = NEXTOPER(first);
3419 else if (!sawopen && (OP(first) == STAR &&
3420 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3421 !(r->reganch & ROPT_ANCH) )
3423 /* turn .* into ^.* with an implied $*=1 */
3425 (OP(NEXTOPER(first)) == REG_ANY)
3428 r->reganch |= type | ROPT_IMPLICIT;
3429 first = NEXTOPER(first);
3432 if (sawplus && (!sawopen || !RExC_sawback)
3433 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3434 /* x+ must match at the 1st pos of run of x's */
3435 r->reganch |= ROPT_SKIP;
3437 /* Scan is after the zeroth branch, first is atomic matcher. */
3438 #ifdef TRIE_STUDY_OPT
3441 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3442 (IV)(first - scan + 1))
3446 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3447 (IV)(first - scan + 1))
3453 * If there's something expensive in the r.e., find the
3454 * longest literal string that must appear and make it the
3455 * regmust. Resolve ties in favor of later strings, since
3456 * the regstart check works with the beginning of the r.e.
3457 * and avoiding duplication strengthens checking. Not a
3458 * strong reason, but sufficient in the absence of others.
3459 * [Now we resolve ties in favor of the earlier string if
3460 * it happens that c_offset_min has been invalidated, since the
3461 * earlier string may buy us something the later one won't.]
3465 data.longest_fixed = newSVpvs("");
3466 data.longest_float = newSVpvs("");
3467 data.last_found = newSVpvs("");
3468 data.longest = &(data.longest_fixed);
3470 if (!r->regstclass) {
3471 cl_init(pRExC_state, &ch_class);
3472 data.start_class = &ch_class;
3473 stclass_flag = SCF_DO_STCLASS_AND;
3474 } else /* XXXX Check for BOUND? */
3476 data.last_closep = &last_close;
3478 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3479 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3481 #ifdef TRIE_STUDY_OPT
3482 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3487 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3488 && data.last_start_min == 0 && data.last_end > 0
3489 && !RExC_seen_zerolen
3490 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3491 r->reganch |= ROPT_CHECK_ALL;
3492 scan_commit(pRExC_state, &data);
3493 SvREFCNT_dec(data.last_found);
3495 longest_float_length = CHR_SVLEN(data.longest_float);
3496 if (longest_float_length
3497 || (data.flags & SF_FL_BEFORE_EOL
3498 && (!(data.flags & SF_FL_BEFORE_MEOL)
3499 || (RExC_flags & PMf_MULTILINE)))) {
3502 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3503 && data.offset_fixed == data.offset_float_min
3504 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3505 goto remove_float; /* As in (a)+. */
3507 if (SvUTF8(data.longest_float)) {
3508 r->float_utf8 = data.longest_float;
3509 r->float_substr = NULL;
3511 r->float_substr = data.longest_float;
3512 r->float_utf8 = NULL;
3514 r->float_min_offset = data.offset_float_min;
3515 r->float_max_offset = data.offset_float_max;
3516 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3517 && (!(data.flags & SF_FL_BEFORE_MEOL)
3518 || (RExC_flags & PMf_MULTILINE)));
3519 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3523 r->float_substr = r->float_utf8 = NULL;
3524 SvREFCNT_dec(data.longest_float);
3525 longest_float_length = 0;
3528 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3529 if (longest_fixed_length
3530 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3531 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3532 || (RExC_flags & PMf_MULTILINE)))) {
3535 if (SvUTF8(data.longest_fixed)) {
3536 r->anchored_utf8 = data.longest_fixed;
3537 r->anchored_substr = NULL;
3539 r->anchored_substr = data.longest_fixed;
3540 r->anchored_utf8 = NULL;
3542 r->anchored_offset = data.offset_fixed;
3543 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3544 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3545 || (RExC_flags & PMf_MULTILINE)));
3546 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3549 r->anchored_substr = r->anchored_utf8 = NULL;
3550 SvREFCNT_dec(data.longest_fixed);
3551 longest_fixed_length = 0;
3554 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3555 r->regstclass = NULL;
3556 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3558 && !(data.start_class->flags & ANYOF_EOS)
3559 && !cl_is_anything(data.start_class))
3561 const I32 n = add_data(pRExC_state, 1, "f");
3563 Newx(RExC_rx->data->data[n], 1,
3564 struct regnode_charclass_class);
3565 StructCopy(data.start_class,
3566 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3567 struct regnode_charclass_class);
3568 r->regstclass = (regnode*)RExC_rx->data->data[n];
3569 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3570 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3571 regprop(r, sv, (regnode*)data.start_class);
3572 PerlIO_printf(Perl_debug_log,
3573 "synthetic stclass \"%s\".\n",
3574 SvPVX_const(sv));});
3577 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3578 if (longest_fixed_length > longest_float_length) {
3579 r->check_substr = r->anchored_substr;
3580 r->check_utf8 = r->anchored_utf8;
3581 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3582 if (r->reganch & ROPT_ANCH_SINGLE)
3583 r->reganch |= ROPT_NOSCAN;
3586 r->check_substr = r->float_substr;
3587 r->check_utf8 = r->float_utf8;
3588 r->check_offset_min = data.offset_float_min;
3589 r->check_offset_max = data.offset_float_max;
3591 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3592 This should be changed ASAP! */
3593 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3594 r->reganch |= RE_USE_INTUIT;
3595 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3596 r->reganch |= RE_INTUIT_TAIL;
3600 /* Several toplevels. Best we can is to set minlen. */
3602 struct regnode_charclass_class ch_class;
3605 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3607 scan = r->program + 1;
3608 cl_init(pRExC_state, &ch_class);
3609 data.start_class = &ch_class;
3610 data.last_closep = &last_close;
3612 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3613 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3615 #ifdef TRIE_STUDY_OPT
3616 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3621 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3622 = r->float_substr = r->float_utf8 = NULL;
3623 if (!(data.start_class->flags & ANYOF_EOS)
3624 && !cl_is_anything(data.start_class))
3626 const I32 n = add_data(pRExC_state, 1, "f");
3628 Newx(RExC_rx->data->data[n], 1,
3629 struct regnode_charclass_class);
3630 StructCopy(data.start_class,
3631 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3632 struct regnode_charclass_class);
3633 r->regstclass = (regnode*)RExC_rx->data->data[n];
3634 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3635 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3636 regprop(r, sv, (regnode*)data.start_class);
3637 PerlIO_printf(Perl_debug_log,
3638 "synthetic stclass \"%s\".\n",
3639 SvPVX_const(sv));});
3644 if (RExC_seen & REG_SEEN_GPOS)
3645 r->reganch |= ROPT_GPOS_SEEN;
3646 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3647 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3648 if (RExC_seen & REG_SEEN_EVAL)
3649 r->reganch |= ROPT_EVAL_SEEN;
3650 if (RExC_seen & REG_SEEN_CANY)
3651 r->reganch |= ROPT_CANY_SEEN;
3652 Newxz(r->startp, RExC_npar, I32);
3653 Newxz(r->endp, RExC_npar, I32);
3655 DEBUG_r( RX_DEBUG_on(r) );
3657 PerlIO_printf(Perl_debug_log,"Final program:\n");
3664 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3665 int rem=(int)(RExC_end - RExC_parse); \
3674 if (RExC_lastparse!=RExC_parse) \
3675 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3678 iscut ? "..." : "<" \
3681 PerlIO_printf(Perl_debug_log,"%16s",""); \
3686 num=REG_NODE_NUM(RExC_emit); \
3687 if (RExC_lastnum!=num) \
3688 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3690 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3691 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3692 (int)((depth*2)), "", \
3696 RExC_lastparse=RExC_parse; \
3701 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3702 DEBUG_PARSE_MSG((funcname)); \
3703 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3706 - reg - regular expression, i.e. main body or parenthesized thing
3708 * Caller must absorb opening parenthesis.
3710 * Combining parenthesis handling with the base level of regular expression
3711 * is a trifle forced, but the need to tie the tails of the branches to what
3712 * follows makes it hard to avoid.
3714 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3716 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3718 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3722 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3723 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3726 register regnode *ret; /* Will be the head of the group. */
3727 register regnode *br;
3728 register regnode *lastbr;
3729 register regnode *ender = NULL;
3730 register I32 parno = 0;
3732 const I32 oregflags = RExC_flags;
3733 bool have_branch = 0;
3736 /* for (?g), (?gc), and (?o) warnings; warning
3737 about (?c) will warn about (?g) -- japhy */
3739 #define WASTED_O 0x01
3740 #define WASTED_G 0x02
3741 #define WASTED_C 0x04
3742 #define WASTED_GC (0x02|0x04)
3743 I32 wastedflags = 0x00;
3745 char * parse_start = RExC_parse; /* MJD */
3746 char * const oregcomp_parse = RExC_parse;
3748 GET_RE_DEBUG_FLAGS_DECL;
3749 DEBUG_PARSE("reg ");
3752 *flagp = 0; /* Tentatively. */
3755 /* Make an OPEN node, if parenthesized. */
3757 if (*RExC_parse == '?') { /* (?...) */
3758 U32 posflags = 0, negflags = 0;
3759 U32 *flagsp = &posflags;
3760 bool is_logical = 0;
3761 const char * const seqstart = RExC_parse;
3764 paren = *RExC_parse++;
3765 ret = NULL; /* For look-ahead/behind. */
3767 case '<': /* (?<...) */
3768 RExC_seen |= REG_SEEN_LOOKBEHIND;
3769 if (*RExC_parse == '!')
3771 if (*RExC_parse != '=' && *RExC_parse != '!')
3774 case '=': /* (?=...) */
3775 case '!': /* (?!...) */
3776 RExC_seen_zerolen++;
3777 case ':': /* (?:...) */
3778 case '>': /* (?>...) */
3780 case '$': /* (?$...) */
3781 case '@': /* (?@...) */
3782 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3784 case '#': /* (?#...) */
3785 while (*RExC_parse && *RExC_parse != ')')
3787 if (*RExC_parse != ')')
3788 FAIL("Sequence (?#... not terminated");
3789 nextchar(pRExC_state);
3792 case 'p': /* (?p...) */
3793 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3794 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3796 case '?': /* (??...) */
3798 if (*RExC_parse != '{')
3800 paren = *RExC_parse++;
3802 case '{': /* (?{...}) */
3804 I32 count = 1, n = 0;
3806 char *s = RExC_parse;
3808 RExC_seen_zerolen++;
3809 RExC_seen |= REG_SEEN_EVAL;
3810 while (count && (c = *RExC_parse)) {
3821 if (*RExC_parse != ')') {
3823 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3827 OP_4tree *sop, *rop;
3828 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3831 Perl_save_re_context(aTHX);
3832 rop = sv_compile_2op(sv, &sop, "re", &pad);
3833 sop->op_private |= OPpREFCOUNTED;
3834 /* re_dup will OpREFCNT_inc */
3835 OpREFCNT_set(sop, 1);
3838 n = add_data(pRExC_state, 3, "nop");
3839 RExC_rx->data->data[n] = (void*)rop;
3840 RExC_rx->data->data[n+1] = (void*)sop;
3841 RExC_rx->data->data[n+2] = (void*)pad;
3844 else { /* First pass */
3845 if (PL_reginterp_cnt < ++RExC_seen_evals
3847 /* No compiled RE interpolated, has runtime
3848 components ===> unsafe. */
3849 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3850 if (PL_tainting && PL_tainted)
3851 FAIL("Eval-group in insecure regular expression");
3852 #if PERL_VERSION > 8
3853 if (IN_PERL_COMPILETIME)
3858 nextchar(pRExC_state);
3860 ret = reg_node(pRExC_state, LOGICAL);
3863 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3864 /* deal with the length of this later - MJD */
3867 ret = reganode(pRExC_state, EVAL, n);
3868 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3869 Set_Node_Offset(ret, parse_start);
3872 case '(': /* (?(?{...})...) and (?(?=...)...) */
3874 if (RExC_parse[0] == '?') { /* (?(?...)) */
3875 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3876 || RExC_parse[1] == '<'
3877 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3880 ret = reg_node(pRExC_state, LOGICAL);
3883 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3887 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3890 parno = atoi(RExC_parse++);
3892 while (isDIGIT(*RExC_parse))
3894 ret = reganode(pRExC_state, GROUPP, parno);
3896 if ((c = *nextchar(pRExC_state)) != ')')
3897 vFAIL("Switch condition not recognized");
3899 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3900 br = regbranch(pRExC_state, &flags, 1,depth+1);
3902 br = reganode(pRExC_state, LONGJMP, 0);
3904 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3905 c = *nextchar(pRExC_state);
3909 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3910 regbranch(pRExC_state, &flags, 1,depth+1);
3911 REGTAIL(pRExC_state, ret, lastbr);
3914 c = *nextchar(pRExC_state);
3919 vFAIL("Switch (?(condition)... contains too many branches");
3920 ender = reg_node(pRExC_state, TAIL);
3921 REGTAIL(pRExC_state, br, ender);
3923 REGTAIL(pRExC_state, lastbr, ender);
3924 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3927 REGTAIL(pRExC_state, ret, ender);
3931 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3935 RExC_parse--; /* for vFAIL to print correctly */
3936 vFAIL("Sequence (? incomplete");
3940 parse_flags: /* (?i) */
3941 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3942 /* (?g), (?gc) and (?o) are useless here
3943 and must be globally applied -- japhy */
3945 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3946 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3947 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3948 if (! (wastedflags & wflagbit) ) {
3949 wastedflags |= wflagbit;
3952 "Useless (%s%c) - %suse /%c modifier",
3953 flagsp == &negflags ? "?-" : "?",
3955 flagsp == &negflags ? "don't " : "",
3961 else if (*RExC_parse == 'c') {
3962 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3963 if (! (wastedflags & WASTED_C) ) {
3964 wastedflags |= WASTED_GC;
3967 "Useless (%sc) - %suse /gc modifier",
3968 flagsp == &negflags ? "?-" : "?",
3969 flagsp == &negflags ? "don't " : ""
3974 else { pmflag(flagsp, *RExC_parse); }
3978 if (*RExC_parse == '-') {
3980 wastedflags = 0; /* reset so (?g-c) warns twice */
3984 RExC_flags |= posflags;
3985 RExC_flags &= ~negflags;
3986 if (*RExC_parse == ':') {
3992 if (*RExC_parse != ')') {
3994 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3996 nextchar(pRExC_state);
4004 ret = reganode(pRExC_state, OPEN, parno);
4005 Set_Node_Length(ret, 1); /* MJD */
4006 Set_Node_Offset(ret, RExC_parse); /* MJD */
4013 /* Pick up the branches, linking them together. */
4014 parse_start = RExC_parse; /* MJD */
4015 br = regbranch(pRExC_state, &flags, 1,depth+1);
4016 /* branch_len = (paren != 0); */
4020 if (*RExC_parse == '|') {
4021 if (!SIZE_ONLY && RExC_extralen) {
4022 reginsert(pRExC_state, BRANCHJ, br);
4025 reginsert(pRExC_state, BRANCH, br);
4026 Set_Node_Length(br, paren != 0);
4027 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4031 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4033 else if (paren == ':') {
4034 *flagp |= flags&SIMPLE;
4036 if (is_open) { /* Starts with OPEN. */
4037 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4039 else if (paren != '?') /* Not Conditional */
4041 *flagp |= flags & (SPSTART | HASWIDTH);
4043 while (*RExC_parse == '|') {
4044 if (!SIZE_ONLY && RExC_extralen) {
4045 ender = reganode(pRExC_state, LONGJMP,0);
4046 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4049 RExC_extralen += 2; /* Account for LONGJMP. */
4050 nextchar(pRExC_state);
4051 br = regbranch(pRExC_state, &flags, 0, depth+1);
4055 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4059 *flagp |= flags&SPSTART;
4062 if (have_branch || paren != ':') {
4063 /* Make a closing node, and hook it on the end. */
4066 ender = reg_node(pRExC_state, TAIL);
4069 ender = reganode(pRExC_state, CLOSE, parno);
4070 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4071 Set_Node_Length(ender,1); /* MJD */
4077 *flagp &= ~HASWIDTH;
4080 ender = reg_node(pRExC_state, SUCCEED);
4083 ender = reg_node(pRExC_state, END);
4086 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4088 if (have_branch && !SIZE_ONLY) {
4089 /* Hook the tails of the branches to the closing node. */
4090 for (br = ret; br; br = regnext(br)) {
4091 const U8 op = PL_regkind[OP(br)];
4093 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4095 else if (op == BRANCHJ) {
4096 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4104 static const char parens[] = "=!<,>";
4106 if (paren && (p = strchr(parens, paren))) {
4107 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4108 int flag = (p - parens) > 1;
4111 node = SUSPEND, flag = 0;
4112 reginsert(pRExC_state, node,ret);
4113 Set_Node_Cur_Length(ret);
4114 Set_Node_Offset(ret, parse_start + 1);
4116 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4120 /* Check for proper termination. */
4122 RExC_flags = oregflags;
4123 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4124 RExC_parse = oregcomp_parse;
4125 vFAIL("Unmatched (");
4128 else if (!paren && RExC_parse < RExC_end) {
4129 if (*RExC_parse == ')') {
4131 vFAIL("Unmatched )");
4134 FAIL("Junk on end of regexp"); /* "Can't happen". */
4142 - regbranch - one alternative of an | operator
4144 * Implements the concatenation operator.
4147 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4150 register regnode *ret;
4151 register regnode *chain = NULL;
4152 register regnode *latest;
4153 I32 flags = 0, c = 0;
4154 GET_RE_DEBUG_FLAGS_DECL;
4155 DEBUG_PARSE("brnc");
4159 if (!SIZE_ONLY && RExC_extralen)
4160 ret = reganode(pRExC_state, BRANCHJ,0);
4162 ret = reg_node(pRExC_state, BRANCH);
4163 Set_Node_Length(ret, 1);
4167 if (!first && SIZE_ONLY)
4168 RExC_extralen += 1; /* BRANCHJ */
4170 *flagp = WORST; /* Tentatively. */
4173 nextchar(pRExC_state);
4174 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4176 latest = regpiece(pRExC_state, &flags,depth+1);
4177 if (latest == NULL) {
4178 if (flags & TRYAGAIN)
4182 else if (ret == NULL)
4184 *flagp |= flags&HASWIDTH;
4185 if (chain == NULL) /* First piece. */
4186 *flagp |= flags&SPSTART;
4189 REGTAIL(pRExC_state, chain, latest);
4194 if (chain == NULL) { /* Loop ran zero times. */
4195 chain = reg_node(pRExC_state, NOTHING);
4200 *flagp |= flags&SIMPLE;
4207 - regpiece - something followed by possible [*+?]
4209 * Note that the branching code sequences used for ? and the general cases
4210 * of * and + are somewhat optimized: they use the same NOTHING node as
4211 * both the endmarker for their branch list and the body of the last branch.
4212 * It might seem that this node could be dispensed with entirely, but the
4213 * endmarker role is not redundant.
4216 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4219 register regnode *ret;
4221 register char *next;
4223 const char * const origparse = RExC_parse;
4225 I32 max = REG_INFTY;
4227 GET_RE_DEBUG_FLAGS_DECL;
4228 DEBUG_PARSE("piec");
4230 ret = regatom(pRExC_state, &flags,depth+1);
4232 if (flags & TRYAGAIN)
4239 if (op == '{' && regcurly(RExC_parse)) {
4240 const char *maxpos = NULL;
4241 parse_start = RExC_parse; /* MJD */
4242 next = RExC_parse + 1;
4243 while (isDIGIT(*next) || *next == ',') {
4252 if (*next == '}') { /* got one */
4256 min = atoi(RExC_parse);
4260 maxpos = RExC_parse;
4262 if (!max && *maxpos != '0')
4263 max = REG_INFTY; /* meaning "infinity" */
4264 else if (max >= REG_INFTY)
4265 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4267 nextchar(pRExC_state);
4270 if ((flags&SIMPLE)) {
4271 RExC_naughty += 2 + RExC_naughty / 2;
4272 reginsert(pRExC_state, CURLY, ret);
4273 Set_Node_Offset(ret, parse_start+1); /* MJD */
4274 Set_Node_Cur_Length(ret);
4277 regnode * const w = reg_node(pRExC_state, WHILEM);
4280 REGTAIL(pRExC_state, ret, w);
4281 if (!SIZE_ONLY && RExC_extralen) {
4282 reginsert(pRExC_state, LONGJMP,ret);
4283 reginsert(pRExC_state, NOTHING,ret);
4284 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4286 reginsert(pRExC_state, CURLYX,ret);
4288 Set_Node_Offset(ret, parse_start+1);
4289 Set_Node_Length(ret,
4290 op == '{' ? (RExC_parse - parse_start) : 1);
4292 if (!SIZE_ONLY && RExC_extralen)
4293 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4294 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4296 RExC_whilem_seen++, RExC_extralen += 3;
4297 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4305 if (max && max < min)
4306 vFAIL("Can't do {n,m} with n > m");
4308 ARG1_SET(ret, (U16)min);
4309 ARG2_SET(ret, (U16)max);
4321 #if 0 /* Now runtime fix should be reliable. */
4323 /* if this is reinstated, don't forget to put this back into perldiag:
4325 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4327 (F) The part of the regexp subject to either the * or + quantifier
4328 could match an empty string. The {#} shows in the regular
4329 expression about where the problem was discovered.
4333 if (!(flags&HASWIDTH) && op != '?')
4334 vFAIL("Regexp *+ operand could be empty");
4337 parse_start = RExC_parse;
4338 nextchar(pRExC_state);
4340 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4342 if (op == '*' && (flags&SIMPLE)) {
4343 reginsert(pRExC_state, STAR, ret);
4347 else if (op == '*') {
4351 else if (op == '+' && (flags&SIMPLE)) {
4352 reginsert(pRExC_state, PLUS, ret);
4356 else if (op == '+') {
4360 else if (op == '?') {
4365 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4367 "%.*s matches null string many times",
4368 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4372 if (*RExC_parse == '?') {
4373 nextchar(pRExC_state);
4374 reginsert(pRExC_state, MINMOD, ret);
4375 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4377 if (ISMULT2(RExC_parse)) {
4379 vFAIL("Nested quantifiers");
4386 - regatom - the lowest level
4388 * Optimization: gobbles an entire sequence of ordinary characters so that
4389 * it can turn them into a single node, which is smaller to store and
4390 * faster to run. Backslashed characters are exceptions, each becoming a
4391 * separate node; the code is simpler that way and it's not worth fixing.
4393 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4394 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4397 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4400 register regnode *ret = NULL;
4402 char *parse_start = RExC_parse;
4403 GET_RE_DEBUG_FLAGS_DECL;
4404 DEBUG_PARSE("atom");
4405 *flagp = WORST; /* Tentatively. */
4408 switch (*RExC_parse) {
4410 RExC_seen_zerolen++;
4411 nextchar(pRExC_state);
4412 if (RExC_flags & PMf_MULTILINE)
4413 ret = reg_node(pRExC_state, MBOL);
4414 else if (RExC_flags & PMf_SINGLELINE)
4415 ret = reg_node(pRExC_state, SBOL);
4417 ret = reg_node(pRExC_state, BOL);
4418 Set_Node_Length(ret, 1); /* MJD */
4421 nextchar(pRExC_state);
4423 RExC_seen_zerolen++;
4424 if (RExC_flags & PMf_MULTILINE)
4425 ret = reg_node(pRExC_state, MEOL);
4426 else if (RExC_flags & PMf_SINGLELINE)
4427 ret = reg_node(pRExC_state, SEOL);
4429 ret = reg_node(pRExC_state, EOL);
4430 Set_Node_Length(ret, 1); /* MJD */
4433 nextchar(pRExC_state);
4434 if (RExC_flags & PMf_SINGLELINE)
4435 ret = reg_node(pRExC_state, SANY);
4437 ret = reg_node(pRExC_state, REG_ANY);
4438 *flagp |= HASWIDTH|SIMPLE;
4440 Set_Node_Length(ret, 1); /* MJD */
4444 char * const oregcomp_parse = ++RExC_parse;
4445 ret = regclass(pRExC_state,depth+1);
4446 if (*RExC_parse != ']') {
4447 RExC_parse = oregcomp_parse;
4448 vFAIL("Unmatched [");
4450 nextchar(pRExC_state);
4451 *flagp |= HASWIDTH|SIMPLE;
4452 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4456 nextchar(pRExC_state);
4457 ret = reg(pRExC_state, 1, &flags,depth+1);
4459 if (flags & TRYAGAIN) {
4460 if (RExC_parse == RExC_end) {
4461 /* Make parent create an empty node if needed. */
4469 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4473 if (flags & TRYAGAIN) {
4477 vFAIL("Internal urp");
4478 /* Supposed to be caught earlier. */
4481 if (!regcurly(RExC_parse)) {
4490 vFAIL("Quantifier follows nothing");
4493 switch (*++RExC_parse) {
4495 RExC_seen_zerolen++;
4496 ret = reg_node(pRExC_state, SBOL);
4498 nextchar(pRExC_state);
4499 Set_Node_Length(ret, 2); /* MJD */
4502 ret = reg_node(pRExC_state, GPOS);
4503 RExC_seen |= REG_SEEN_GPOS;
4505 nextchar(pRExC_state);
4506 Set_Node_Length(ret, 2); /* MJD */
4509 ret = reg_node(pRExC_state, SEOL);
4511 RExC_seen_zerolen++; /* Do not optimize RE away */
4512 nextchar(pRExC_state);
4515 ret = reg_node(pRExC_state, EOS);
4517 RExC_seen_zerolen++; /* Do not optimize RE away */
4518 nextchar(pRExC_state);
4519 Set_Node_Length(ret, 2); /* MJD */
4522 ret = reg_node(pRExC_state, CANY);
4523 RExC_seen |= REG_SEEN_CANY;
4524 *flagp |= HASWIDTH|SIMPLE;
4525 nextchar(pRExC_state);
4526 Set_Node_Length(ret, 2); /* MJD */
4529 ret = reg_node(pRExC_state, CLUMP);
4531 nextchar(pRExC_state);
4532 Set_Node_Length(ret, 2); /* MJD */
4535 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4536 *flagp |= HASWIDTH|SIMPLE;
4537 nextchar(pRExC_state);
4538 Set_Node_Length(ret, 2); /* MJD */
4541 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4542 *flagp |= HASWIDTH|SIMPLE;
4543 nextchar(pRExC_state);
4544 Set_Node_Length(ret, 2); /* MJD */
4547 RExC_seen_zerolen++;
4548 RExC_seen |= REG_SEEN_LOOKBEHIND;
4549 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4551 nextchar(pRExC_state);
4552 Set_Node_Length(ret, 2); /* MJD */
4555 RExC_seen_zerolen++;
4556 RExC_seen |= REG_SEEN_LOOKBEHIND;
4557 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4559 nextchar(pRExC_state);
4560 Set_Node_Length(ret, 2); /* MJD */
4563 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4564 *flagp |= HASWIDTH|SIMPLE;
4565 nextchar(pRExC_state);
4566 Set_Node_Length(ret, 2); /* MJD */
4569 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4570 *flagp |= HASWIDTH|SIMPLE;
4571 nextchar(pRExC_state);
4572 Set_Node_Length(ret, 2); /* MJD */
4575 ret = reg_node(pRExC_state, DIGIT);
4576 *flagp |= HASWIDTH|SIMPLE;
4577 nextchar(pRExC_state);
4578 Set_Node_Length(ret, 2); /* MJD */
4581 ret = reg_node(pRExC_state, NDIGIT);
4582 *flagp |= HASWIDTH|SIMPLE;
4583 nextchar(pRExC_state);
4584 Set_Node_Length(ret, 2); /* MJD */
4589 char* const oldregxend = RExC_end;
4590 char* parse_start = RExC_parse - 2;
4592 if (RExC_parse[1] == '{') {
4593 /* a lovely hack--pretend we saw [\pX] instead */
4594 RExC_end = strchr(RExC_parse, '}');
4596 const U8 c = (U8)*RExC_parse;
4598 RExC_end = oldregxend;
4599 vFAIL2("Missing right brace on \\%c{}", c);
4604 RExC_end = RExC_parse + 2;
4605 if (RExC_end > oldregxend)
4606 RExC_end = oldregxend;
4610 ret = regclass(pRExC_state,depth+1);
4612 RExC_end = oldregxend;
4615 Set_Node_Offset(ret, parse_start + 2);
4616 Set_Node_Cur_Length(ret);
4617 nextchar(pRExC_state);
4618 *flagp |= HASWIDTH|SIMPLE;
4631 case '1': case '2': case '3': case '4':
4632 case '5': case '6': case '7': case '8': case '9':
4634 const I32 num = atoi(RExC_parse);
4636 if (num > 9 && num >= RExC_npar)
4639 char * const parse_start = RExC_parse - 1; /* MJD */
4640 while (isDIGIT(*RExC_parse))
4643 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4644 vFAIL("Reference to nonexistent group");
4646 ret = reganode(pRExC_state,
4647 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4651 /* override incorrect value set in reganode MJD */
4652 Set_Node_Offset(ret, parse_start+1);
4653 Set_Node_Cur_Length(ret); /* MJD */
4655 nextchar(pRExC_state);
4660 if (RExC_parse >= RExC_end)
4661 FAIL("Trailing \\");
4664 /* Do not generate "unrecognized" warnings here, we fall
4665 back into the quick-grab loop below */
4672 if (RExC_flags & PMf_EXTENDED) {
4673 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4675 if (RExC_parse < RExC_end)
4681 register STRLEN len;
4686 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4688 parse_start = RExC_parse - 1;
4694 ret = reg_node(pRExC_state,
4695 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4697 for (len = 0, p = RExC_parse - 1;
4698 len < 127 && p < RExC_end;
4701 char * const oldp = p;
4703 if (RExC_flags & PMf_EXTENDED)
4704 p = regwhite(p, RExC_end);
4751 ender = ASCII_TO_NATIVE('\033');
4755 ender = ASCII_TO_NATIVE('\007');
4760 char* const e = strchr(p, '}');
4764 vFAIL("Missing right brace on \\x{}");
4767 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4768 | PERL_SCAN_DISALLOW_PREFIX;
4769 STRLEN numlen = e - p - 1;
4770 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4777 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4779 ender = grok_hex(p, &numlen, &flags, NULL);
4785 ender = UCHARAT(p++);
4786 ender = toCTRL(ender);
4788 case '0': case '1': case '2': case '3':case '4':
4789 case '5': case '6': case '7': case '8':case '9':
4791 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4794 ender = grok_oct(p, &numlen, &flags, NULL);
4804 FAIL("Trailing \\");
4807 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4808 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4809 goto normal_default;
4814 if (UTF8_IS_START(*p) && UTF) {
4816 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4817 &numlen, UTF8_ALLOW_DEFAULT);
4824 if (RExC_flags & PMf_EXTENDED)
4825 p = regwhite(p, RExC_end);
4827 /* Prime the casefolded buffer. */
4828 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4830 if (ISMULT2(p)) { /* Back off on ?+*. */
4835 /* Emit all the Unicode characters. */
4837 for (foldbuf = tmpbuf;
4839 foldlen -= numlen) {
4840 ender = utf8_to_uvchr(foldbuf, &numlen);
4842 const STRLEN unilen = reguni(pRExC_state, ender, s);
4845 /* In EBCDIC the numlen
4846 * and unilen can differ. */
4848 if (numlen >= foldlen)
4852 break; /* "Can't happen." */
4856 const STRLEN unilen = reguni(pRExC_state, ender, s);
4865 REGC((char)ender, s++);
4871 /* Emit all the Unicode characters. */
4873 for (foldbuf = tmpbuf;
4875 foldlen -= numlen) {
4876 ender = utf8_to_uvchr(foldbuf, &numlen);
4878 const STRLEN unilen = reguni(pRExC_state, ender, s);
4881 /* In EBCDIC the numlen
4882 * and unilen can differ. */
4884 if (numlen >= foldlen)
4892 const STRLEN unilen = reguni(pRExC_state, ender, s);
4901 REGC((char)ender, s++);
4905 Set_Node_Cur_Length(ret); /* MJD */
4906 nextchar(pRExC_state);
4908 /* len is STRLEN which is unsigned, need to copy to signed */
4911 vFAIL("Internal disaster");
4915 if (len == 1 && UNI_IS_INVARIANT(ender))
4919 RExC_size += STR_SZ(len);
4922 RExC_emit += STR_SZ(len);
4928 /* If the encoding pragma is in effect recode the text of
4929 * any EXACT-kind nodes. */
4930 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4931 const STRLEN oldlen = STR_LEN(ret);
4932 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4936 if (sv_utf8_downgrade(sv, TRUE)) {
4937 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4938 const STRLEN newlen = SvCUR(sv);
4943 GET_RE_DEBUG_FLAGS_DECL;
4944 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4945 (int)oldlen, STRING(ret),
4947 Copy(s, STRING(ret), newlen, char);
4948 STR_LEN(ret) += newlen - oldlen;
4949 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4951 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4959 S_regwhite(char *p, const char *e)
4964 else if (*p == '#') {
4967 } while (p < e && *p != '\n');
4975 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4976 Character classes ([:foo:]) can also be negated ([:^foo:]).
4977 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4978 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4979 but trigger failures because they are currently unimplemented. */
4981 #define POSIXCC_DONE(c) ((c) == ':')
4982 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4983 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4986 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4989 I32 namedclass = OOB_NAMEDCLASS;
4991 if (value == '[' && RExC_parse + 1 < RExC_end &&
4992 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4993 POSIXCC(UCHARAT(RExC_parse))) {
4994 const char c = UCHARAT(RExC_parse);
4995 char* const s = RExC_parse++;
4997 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4999 if (RExC_parse == RExC_end)
5000 /* Grandfather lone [:, [=, [. */
5003 const char* const t = RExC_parse++; /* skip over the c */
5006 if (UCHARAT(RExC_parse) == ']') {
5007 const char *posixcc = s + 1;
5008 RExC_parse++; /* skip over the ending ] */
5011 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5012 const I32 skip = t - posixcc;
5014 /* Initially switch on the length of the name. */
5017 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5018 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5021 /* Names all of length 5. */
5022 /* alnum alpha ascii blank cntrl digit graph lower
5023 print punct space upper */
5024 /* Offset 4 gives the best switch position. */
5025 switch (posixcc[4]) {
5027 if (memEQ(posixcc, "alph", 4)) /* alpha */
5028 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5031 if (memEQ(posixcc, "spac", 4)) /* space */
5032 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5035 if (memEQ(posixcc, "grap", 4)) /* graph */
5036 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5039 if (memEQ(posixcc, "asci", 4)) /* ascii */
5040 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5043 if (memEQ(posixcc, "blan", 4)) /* blank */
5044 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5047 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5048 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5051 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5052 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5055 if (memEQ(posixcc, "lowe", 4)) /* lower */
5056 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5057 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5058 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5061 if (memEQ(posixcc, "digi", 4)) /* digit */
5062 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5063 else if (memEQ(posixcc, "prin", 4)) /* print */
5064 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5065 else if (memEQ(posixcc, "punc", 4)) /* punct */
5066 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5071 if (memEQ(posixcc, "xdigit", 6))
5072 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5076 if (namedclass == OOB_NAMEDCLASS)
5077 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5079 assert (posixcc[skip] == ':');
5080 assert (posixcc[skip+1] == ']');
5081 } else if (!SIZE_ONLY) {
5082 /* [[=foo=]] and [[.foo.]] are still future. */
5084 /* adjust RExC_parse so the warning shows after
5086 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5088 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5091 /* Maternal grandfather:
5092 * "[:" ending in ":" but not in ":]" */
5102 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5105 if (POSIXCC(UCHARAT(RExC_parse))) {
5106 const char *s = RExC_parse;
5107 const char c = *s++;
5111 if (*s && c == *s && s[1] == ']') {
5112 if (ckWARN(WARN_REGEXP))
5114 "POSIX syntax [%c %c] belongs inside character classes",
5117 /* [[=foo=]] and [[.foo.]] are still future. */
5118 if (POSIXCC_NOTYET(c)) {
5119 /* adjust RExC_parse so the error shows after
5121 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5123 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5131 parse a class specification and produce either an ANYOF node that
5132 matches the pattern. If the pattern matches a single char only and
5133 that char is < 256 then we produce an EXACT node instead.
5136 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5140 register UV nextvalue;
5141 register IV prevvalue = OOB_UNICODE;
5142 register IV range = 0;
5143 register regnode *ret;
5146 char *rangebegin = NULL;
5147 bool need_class = 0;
5150 bool optimize_invert = TRUE;
5151 AV* unicode_alternate = NULL;
5153 UV literal_endpoint = 0;
5155 UV stored = 0; /* number of chars stored in the class */
5157 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5158 case we need to change the emitted regop to an EXACT. */
5159 const char * orig_parse = RExC_parse;
5160 GET_RE_DEBUG_FLAGS_DECL;
5161 DEBUG_PARSE("clas");
5163 /* Assume we are going to generate an ANYOF node. */
5164 ret = reganode(pRExC_state, ANYOF, 0);
5167 ANYOF_FLAGS(ret) = 0;
5169 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5173 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5177 RExC_size += ANYOF_SKIP;
5178 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5181 RExC_emit += ANYOF_SKIP;
5183 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5185 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5186 ANYOF_BITMAP_ZERO(ret);
5187 listsv = newSVpvs("# comment\n");
5190 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5192 if (!SIZE_ONLY && POSIXCC(nextvalue))
5193 checkposixcc(pRExC_state);
5195 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5196 if (UCHARAT(RExC_parse) == ']')
5199 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5203 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5206 rangebegin = RExC_parse;
5208 value = utf8n_to_uvchr((U8*)RExC_parse,
5209 RExC_end - RExC_parse,
5210 &numlen, UTF8_ALLOW_DEFAULT);
5211 RExC_parse += numlen;
5214 value = UCHARAT(RExC_parse++);
5216 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5217 if (value == '[' && POSIXCC(nextvalue))
5218 namedclass = regpposixcc(pRExC_state, value);
5219 else if (value == '\\') {
5221 value = utf8n_to_uvchr((U8*)RExC_parse,
5222 RExC_end - RExC_parse,
5223 &numlen, UTF8_ALLOW_DEFAULT);
5224 RExC_parse += numlen;
5227 value = UCHARAT(RExC_parse++);
5228 /* Some compilers cannot handle switching on 64-bit integer
5229 * values, therefore value cannot be an UV. Yes, this will
5230 * be a problem later if we want switch on Unicode.
5231 * A similar issue a little bit later when switching on
5232 * namedclass. --jhi */
5233 switch ((I32)value) {
5234 case 'w': namedclass = ANYOF_ALNUM; break;
5235 case 'W': namedclass = ANYOF_NALNUM; break;
5236 case 's': namedclass = ANYOF_SPACE; break;
5237 case 'S': namedclass = ANYOF_NSPACE; break;
5238 case 'd': namedclass = ANYOF_DIGIT; break;
5239 case 'D': namedclass = ANYOF_NDIGIT; break;
5244 if (RExC_parse >= RExC_end)
5245 vFAIL2("Empty \\%c{}", (U8)value);
5246 if (*RExC_parse == '{') {
5247 const U8 c = (U8)value;
5248 e = strchr(RExC_parse++, '}');
5250 vFAIL2("Missing right brace on \\%c{}", c);
5251 while (isSPACE(UCHARAT(RExC_parse)))
5253 if (e == RExC_parse)
5254 vFAIL2("Empty \\%c{}", c);
5256 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5264 if (UCHARAT(RExC_parse) == '^') {
5267 value = value == 'p' ? 'P' : 'p'; /* toggle */
5268 while (isSPACE(UCHARAT(RExC_parse))) {
5273 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5274 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5277 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5278 namedclass = ANYOF_MAX; /* no official name, but it's named */
5281 case 'n': value = '\n'; break;
5282 case 'r': value = '\r'; break;
5283 case 't': value = '\t'; break;
5284 case 'f': value = '\f'; break;
5285 case 'b': value = '\b'; break;
5286 case 'e': value = ASCII_TO_NATIVE('\033');break;
5287 case 'a': value = ASCII_TO_NATIVE('\007');break;
5289 if (*RExC_parse == '{') {
5290 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5291 | PERL_SCAN_DISALLOW_PREFIX;
5292 char * const e = strchr(RExC_parse++, '}');
5294 vFAIL("Missing right brace on \\x{}");
5296 numlen = e - RExC_parse;
5297 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5301 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5303 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5304 RExC_parse += numlen;
5308 value = UCHARAT(RExC_parse++);
5309 value = toCTRL(value);
5311 case '0': case '1': case '2': case '3': case '4':
5312 case '5': case '6': case '7': case '8': case '9':
5316 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5317 RExC_parse += numlen;
5321 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5323 "Unrecognized escape \\%c in character class passed through",
5327 } /* end of \blah */
5333 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5335 if (!SIZE_ONLY && !need_class)
5336 ANYOF_CLASS_ZERO(ret);
5340 /* a bad range like a-\d, a-[:digit:] ? */
5343 if (ckWARN(WARN_REGEXP)) {
5345 RExC_parse >= rangebegin ?
5346 RExC_parse - rangebegin : 0;
5348 "False [] range \"%*.*s\"",
5351 if (prevvalue < 256) {
5352 ANYOF_BITMAP_SET(ret, prevvalue);
5353 ANYOF_BITMAP_SET(ret, '-');
5356 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5357 Perl_sv_catpvf(aTHX_ listsv,
5358 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5362 range = 0; /* this was not a true range */
5366 const char *what = NULL;
5369 if (namedclass > OOB_NAMEDCLASS)
5370 optimize_invert = FALSE;
5371 /* Possible truncation here but in some 64-bit environments
5372 * the compiler gets heartburn about switch on 64-bit values.
5373 * A similar issue a little earlier when switching on value.
5375 switch ((I32)namedclass) {
5378 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5380 for (value = 0; value < 256; value++)
5382 ANYOF_BITMAP_SET(ret, value);
5389 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5391 for (value = 0; value < 256; value++)
5392 if (!isALNUM(value))
5393 ANYOF_BITMAP_SET(ret, value);
5400 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5402 for (value = 0; value < 256; value++)
5403 if (isALNUMC(value))
5404 ANYOF_BITMAP_SET(ret, value);
5411 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5413 for (value = 0; value < 256; value++)
5414 if (!isALNUMC(value))
5415 ANYOF_BITMAP_SET(ret, value);
5422 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5424 for (value = 0; value < 256; value++)
5426 ANYOF_BITMAP_SET(ret, value);
5433 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5435 for (value = 0; value < 256; value++)
5436 if (!isALPHA(value))
5437 ANYOF_BITMAP_SET(ret, value);
5444 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5447 for (value = 0; value < 128; value++)
5448 ANYOF_BITMAP_SET(ret, value);
5450 for (value = 0; value < 256; value++) {
5452 ANYOF_BITMAP_SET(ret, value);
5461 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5464 for (value = 128; value < 256; value++)
5465 ANYOF_BITMAP_SET(ret, value);
5467 for (value = 0; value < 256; value++) {
5468 if (!isASCII(value))
5469 ANYOF_BITMAP_SET(ret, value);
5478 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5480 for (value = 0; value < 256; value++)
5482 ANYOF_BITMAP_SET(ret, value);
5489 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5491 for (value = 0; value < 256; value++)
5492 if (!isBLANK(value))
5493 ANYOF_BITMAP_SET(ret, value);
5500 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5502 for (value = 0; value < 256; value++)
5504 ANYOF_BITMAP_SET(ret, value);
5511 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5513 for (value = 0; value < 256; value++)
5514 if (!isCNTRL(value))
5515 ANYOF_BITMAP_SET(ret, value);
5522 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5524 /* consecutive digits assumed */
5525 for (value = '0'; value <= '9'; value++)
5526 ANYOF_BITMAP_SET(ret, value);
5533 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5535 /* consecutive digits assumed */
5536 for (value = 0; value < '0'; value++)
5537 ANYOF_BITMAP_SET(ret, value);
5538 for (value = '9' + 1; value < 256; value++)
5539 ANYOF_BITMAP_SET(ret, value);
5546 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5548 for (value = 0; value < 256; value++)
5550 ANYOF_BITMAP_SET(ret, value);
5557 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5559 for (value = 0; value < 256; value++)
5560 if (!isGRAPH(value))
5561 ANYOF_BITMAP_SET(ret, value);
5568 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5570 for (value = 0; value < 256; value++)
5572 ANYOF_BITMAP_SET(ret, value);
5579 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5581 for (value = 0; value < 256; value++)
5582 if (!isLOWER(value))
5583 ANYOF_BITMAP_SET(ret, value);
5590 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5592 for (value = 0; value < 256; value++)
5594 ANYOF_BITMAP_SET(ret, value);
5601 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5603 for (value = 0; value < 256; value++)
5604 if (!isPRINT(value))
5605 ANYOF_BITMAP_SET(ret, value);
5612 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5614 for (value = 0; value < 256; value++)
5615 if (isPSXSPC(value))
5616 ANYOF_BITMAP_SET(ret, value);
5623 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5625 for (value = 0; value < 256; value++)
5626 if (!isPSXSPC(value))
5627 ANYOF_BITMAP_SET(ret, value);
5634 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5636 for (value = 0; value < 256; value++)
5638 ANYOF_BITMAP_SET(ret, value);
5645 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5647 for (value = 0; value < 256; value++)
5648 if (!isPUNCT(value))
5649 ANYOF_BITMAP_SET(ret, value);
5656 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5658 for (value = 0; value < 256; value++)
5660 ANYOF_BITMAP_SET(ret, value);
5667 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5669 for (value = 0; value < 256; value++)
5670 if (!isSPACE(value))
5671 ANYOF_BITMAP_SET(ret, value);
5678 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5680 for (value = 0; value < 256; value++)
5682 ANYOF_BITMAP_SET(ret, value);
5689 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5691 for (value = 0; value < 256; value++)
5692 if (!isUPPER(value))
5693 ANYOF_BITMAP_SET(ret, value);
5700 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5702 for (value = 0; value < 256; value++)
5703 if (isXDIGIT(value))
5704 ANYOF_BITMAP_SET(ret, value);
5711 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5713 for (value = 0; value < 256; value++)
5714 if (!isXDIGIT(value))
5715 ANYOF_BITMAP_SET(ret, value);
5721 /* this is to handle \p and \P */
5724 vFAIL("Invalid [::] class");
5728 /* Strings such as "+utf8::isWord\n" */
5729 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5732 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5735 } /* end of namedclass \blah */
5738 if (prevvalue > (IV)value) /* b-a */ {
5739 const int w = RExC_parse - rangebegin;
5740 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5741 range = 0; /* not a valid range */
5745 prevvalue = value; /* save the beginning of the range */
5746 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5747 RExC_parse[1] != ']') {
5750 /* a bad range like \w-, [:word:]- ? */
5751 if (namedclass > OOB_NAMEDCLASS) {
5752 if (ckWARN(WARN_REGEXP)) {
5754 RExC_parse >= rangebegin ?
5755 RExC_parse - rangebegin : 0;
5757 "False [] range \"%*.*s\"",
5761 ANYOF_BITMAP_SET(ret, '-');
5763 range = 1; /* yeah, it's a range! */
5764 continue; /* but do it the next time */
5768 /* now is the next time */
5769 /*stored += (value - prevvalue + 1);*/
5771 if (prevvalue < 256) {
5772 const IV ceilvalue = value < 256 ? value : 255;
5775 /* In EBCDIC [\x89-\x91] should include
5776 * the \x8e but [i-j] should not. */
5777 if (literal_endpoint == 2 &&
5778 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5779 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5781 if (isLOWER(prevvalue)) {
5782 for (i = prevvalue; i <= ceilvalue; i++)
5784 ANYOF_BITMAP_SET(ret, i);
5786 for (i = prevvalue; i <= ceilvalue; i++)
5788 ANYOF_BITMAP_SET(ret, i);
5793 for (i = prevvalue; i <= ceilvalue; i++) {
5794 if (!ANYOF_BITMAP_TEST(ret,i)) {
5796 ANYOF_BITMAP_SET(ret, i);
5800 if (value > 255 || UTF) {
5801 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5802 const UV natvalue = NATIVE_TO_UNI(value);
5803 stored+=2; /* can't optimize this class */
5804 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5805 if (prevnatvalue < natvalue) { /* what about > ? */
5806 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5807 prevnatvalue, natvalue);
5809 else if (prevnatvalue == natvalue) {
5810 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5812 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5814 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5816 /* If folding and foldable and a single
5817 * character, insert also the folded version
5818 * to the charclass. */
5820 if (foldlen == (STRLEN)UNISKIP(f))
5821 Perl_sv_catpvf(aTHX_ listsv,
5824 /* Any multicharacter foldings
5825 * require the following transform:
5826 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5827 * where E folds into "pq" and F folds
5828 * into "rst", all other characters
5829 * fold to single characters. We save
5830 * away these multicharacter foldings,
5831 * to be later saved as part of the
5832 * additional "s" data. */
5835 if (!unicode_alternate)
5836 unicode_alternate = newAV();
5837 sv = newSVpvn((char*)foldbuf, foldlen);
5839 av_push(unicode_alternate, sv);
5843 /* If folding and the value is one of the Greek
5844 * sigmas insert a few more sigmas to make the
5845 * folding rules of the sigmas to work right.
5846 * Note that not all the possible combinations
5847 * are handled here: some of them are handled
5848 * by the standard folding rules, and some of
5849 * them (literal or EXACTF cases) are handled
5850 * during runtime in regexec.c:S_find_byclass(). */
5851 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5852 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5853 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5854 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5855 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5857 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5858 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5859 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5864 literal_endpoint = 0;
5868 range = 0; /* this range (if it was one) is done now */
5872 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5874 RExC_size += ANYOF_CLASS_ADD_SKIP;
5876 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5882 /****** !SIZE_ONLY AFTER HERE *********/
5884 if( stored == 1 && value < 256
5885 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5887 /* optimize single char class to an EXACT node
5888 but *only* when its not a UTF/high char */
5889 const char * cur_parse= RExC_parse;
5890 RExC_emit = (regnode *)orig_emit;
5891 RExC_parse = (char *)orig_parse;
5892 ret = reg_node(pRExC_state,
5893 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5894 RExC_parse = (char *)cur_parse;
5895 *STRING(ret)= (char)value;
5897 RExC_emit += STR_SZ(1);
5900 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5901 if ( /* If the only flag is folding (plus possibly inversion). */
5902 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5904 for (value = 0; value < 256; ++value) {
5905 if (ANYOF_BITMAP_TEST(ret, value)) {
5906 UV fold = PL_fold[value];
5909 ANYOF_BITMAP_SET(ret, fold);
5912 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5915 /* optimize inverted simple patterns (e.g. [^a-z]) */
5916 if (optimize_invert &&
5917 /* If the only flag is inversion. */
5918 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5919 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5920 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5921 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5924 AV * const av = newAV();
5926 /* The 0th element stores the character class description
5927 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5928 * to initialize the appropriate swash (which gets stored in
5929 * the 1st element), and also useful for dumping the regnode.
5930 * The 2nd element stores the multicharacter foldings,
5931 * used later (regexec.c:S_reginclass()). */
5932 av_store(av, 0, listsv);
5933 av_store(av, 1, NULL);
5934 av_store(av, 2, (SV*)unicode_alternate);
5935 rv = newRV_noinc((SV*)av);
5936 n = add_data(pRExC_state, 1, "s");
5937 RExC_rx->data->data[n] = (void*)rv;
5944 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5946 char* const retval = RExC_parse++;
5949 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5950 RExC_parse[2] == '#') {
5951 while (*RExC_parse != ')') {
5952 if (RExC_parse == RExC_end)
5953 FAIL("Sequence (?#... not terminated");
5959 if (RExC_flags & PMf_EXTENDED) {
5960 if (isSPACE(*RExC_parse)) {
5964 else if (*RExC_parse == '#') {
5965 while (RExC_parse < RExC_end)
5966 if (*RExC_parse++ == '\n') break;
5975 - reg_node - emit a node
5977 STATIC regnode * /* Location. */
5978 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5981 register regnode *ptr;
5982 regnode * const ret = RExC_emit;
5983 GET_RE_DEBUG_FLAGS_DECL;
5986 SIZE_ALIGN(RExC_size);
5990 NODE_ALIGN_FILL(ret);
5992 FILL_ADVANCE_NODE(ptr, op);
5993 if (RExC_offsets) { /* MJD */
5994 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
5995 "reg_node", __LINE__,
5997 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
5998 ? "Overwriting end of array!\n" : "OK",
5999 (UV)(RExC_emit - RExC_emit_start),
6000 (UV)(RExC_parse - RExC_start),
6001 (UV)RExC_offsets[0]));
6002 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6011 - reganode - emit a node with an argument
6013 STATIC regnode * /* Location. */
6014 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6017 register regnode *ptr;
6018 regnode * const ret = RExC_emit;
6019 GET_RE_DEBUG_FLAGS_DECL;
6022 SIZE_ALIGN(RExC_size);
6027 NODE_ALIGN_FILL(ret);
6029 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6030 if (RExC_offsets) { /* MJD */
6031 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6035 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6036 "Overwriting end of array!\n" : "OK",
6037 (UV)(RExC_emit - RExC_emit_start),
6038 (UV)(RExC_parse - RExC_start),
6039 (UV)RExC_offsets[0]));
6040 Set_Cur_Node_Offset;
6049 - reguni - emit (if appropriate) a Unicode character
6052 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6055 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6059 - reginsert - insert an operator in front of already-emitted operand
6061 * Means relocating the operand.
6064 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6067 register regnode *src;
6068 register regnode *dst;
6069 register regnode *place;
6070 const int offset = regarglen[(U8)op];
6071 GET_RE_DEBUG_FLAGS_DECL;
6072 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6075 RExC_size += NODE_STEP_REGNODE + offset;
6080 RExC_emit += NODE_STEP_REGNODE + offset;
6082 while (src > opnd) {
6083 StructCopy(--src, --dst, regnode);
6084 if (RExC_offsets) { /* MJD 20010112 */
6085 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6089 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6090 ? "Overwriting end of array!\n" : "OK",
6091 (UV)(src - RExC_emit_start),
6092 (UV)(dst - RExC_emit_start),
6093 (UV)RExC_offsets[0]));
6094 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6095 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6100 place = opnd; /* Op node, where operand used to be. */
6101 if (RExC_offsets) { /* MJD */
6102 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6106 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6107 ? "Overwriting end of array!\n" : "OK",
6108 (UV)(place - RExC_emit_start),
6109 (UV)(RExC_parse - RExC_start),
6111 Set_Node_Offset(place, RExC_parse);
6112 Set_Node_Length(place, 1);
6114 src = NEXTOPER(place);
6115 FILL_ADVANCE_NODE(place, op);
6116 Zero(src, offset, regnode);
6120 - regtail - set the next-pointer at the end of a node chain of p to val.
6121 - SEE ALSO: regtail_study
6123 /* TODO: All three parms should be const */
6125 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6128 register regnode *scan;
6129 GET_RE_DEBUG_FLAGS_DECL;
6134 /* Find last node. */
6137 regnode * const temp = regnext(scan);
6139 SV * const mysv=sv_newmortal();
6140 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6141 regprop(RExC_rx, mysv, scan);
6142 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6143 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6150 if (reg_off_by_arg[OP(scan)]) {
6151 ARG_SET(scan, val - scan);
6154 NEXT_OFF(scan) = val - scan;
6160 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6161 - Look for optimizable sequences at the same time.
6162 - currently only looks for EXACT chains.
6164 This is expermental code. The idea is to use this routine to perform
6165 in place optimizations on branches and groups as they are constructed,
6166 with the long term intention of removing optimization from study_chunk so
6167 that it is purely analytical.
6169 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6170 to control which is which.
6173 /* TODO: All four parms should be const */
6176 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6179 register regnode *scan;
6181 #ifdef EXPERIMENTAL_INPLACESCAN
6185 GET_RE_DEBUG_FLAGS_DECL;
6191 /* Find last node. */
6195 regnode * const temp = regnext(scan);
6196 #ifdef EXPERIMENTAL_INPLACESCAN
6197 if (PL_regkind[OP(scan)] == EXACT)
6198 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6206 if( exact == PSEUDO )
6208 else if ( exact != OP(scan) )
6217 SV * const mysv=sv_newmortal();
6218 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6219 regprop(RExC_rx, mysv, scan);
6220 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6221 SvPV_nolen_const(mysv),
6223 REG_NODE_NUM(scan));
6230 SV * const mysv_val=sv_newmortal();
6231 DEBUG_PARSE_MSG("");
6232 regprop(RExC_rx, mysv_val, val);
6233 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6234 SvPV_nolen_const(mysv_val),
6239 if (reg_off_by_arg[OP(scan)]) {
6240 ARG_SET(scan, val - scan);
6243 NEXT_OFF(scan) = val - scan;
6251 - regcurly - a little FSA that accepts {\d+,?\d*}
6254 S_regcurly(register const char *s)
6273 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6276 Perl_regdump(pTHX_ const regexp *r)
6280 SV * const sv = sv_newmortal();
6282 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6284 /* Header fields of interest. */
6285 if (r->anchored_substr)
6286 PerlIO_printf(Perl_debug_log,
6287 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
6289 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
6290 SvPVX_const(r->anchored_substr),
6292 SvTAIL(r->anchored_substr) ? "$" : "",
6293 (IV)r->anchored_offset);
6294 else if (r->anchored_utf8)
6295 PerlIO_printf(Perl_debug_log,
6296 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
6298 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
6299 SvPVX_const(r->anchored_utf8),
6301 SvTAIL(r->anchored_utf8) ? "$" : "",
6302 (IV)r->anchored_offset);
6303 if (r->float_substr)
6304 PerlIO_printf(Perl_debug_log,
6305 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6307 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6308 SvPVX_const(r->float_substr),
6310 SvTAIL(r->float_substr) ? "$" : "",
6311 (IV)r->float_min_offset, (UV)r->float_max_offset);
6312 else if (r->float_utf8)
6313 PerlIO_printf(Perl_debug_log,
6314 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6316 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6317 SvPVX_const(r->float_utf8),
6319 SvTAIL(r->float_utf8) ? "$" : "",
6320 (IV)r->float_min_offset, (UV)r->float_max_offset);
6321 if (r->check_substr || r->check_utf8)
6322 PerlIO_printf(Perl_debug_log,
6323 r->check_substr == r->float_substr
6324 && r->check_utf8 == r->float_utf8
6325 ? "(checking floating" : "(checking anchored");
6326 if (r->reganch & ROPT_NOSCAN)
6327 PerlIO_printf(Perl_debug_log, " noscan");
6328 if (r->reganch & ROPT_CHECK_ALL)
6329 PerlIO_printf(Perl_debug_log, " isall");
6330 if (r->check_substr || r->check_utf8)
6331 PerlIO_printf(Perl_debug_log, ") ");
6333 if (r->regstclass) {
6334 regprop(r, sv, r->regstclass);
6335 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6337 if (r->reganch & ROPT_ANCH) {
6338 PerlIO_printf(Perl_debug_log, "anchored");
6339 if (r->reganch & ROPT_ANCH_BOL)
6340 PerlIO_printf(Perl_debug_log, "(BOL)");
6341 if (r->reganch & ROPT_ANCH_MBOL)
6342 PerlIO_printf(Perl_debug_log, "(MBOL)");
6343 if (r->reganch & ROPT_ANCH_SBOL)
6344 PerlIO_printf(Perl_debug_log, "(SBOL)");
6345 if (r->reganch & ROPT_ANCH_GPOS)
6346 PerlIO_printf(Perl_debug_log, "(GPOS)");
6347 PerlIO_putc(Perl_debug_log, ' ');
6349 if (r->reganch & ROPT_GPOS_SEEN)
6350 PerlIO_printf(Perl_debug_log, "GPOS ");
6351 if (r->reganch & ROPT_SKIP)
6352 PerlIO_printf(Perl_debug_log, "plus ");
6353 if (r->reganch & ROPT_IMPLICIT)
6354 PerlIO_printf(Perl_debug_log, "implicit ");
6355 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6356 if (r->reganch & ROPT_EVAL_SEEN)
6357 PerlIO_printf(Perl_debug_log, "with eval ");
6358 PerlIO_printf(Perl_debug_log, "\n");
6360 const U32 len = r->offsets[0];
6361 GET_RE_DEBUG_FLAGS_DECL;
6364 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
6365 for (i = 1; i <= len; i++) {
6366 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6367 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
6369 PerlIO_printf(Perl_debug_log, "\n");
6373 PERL_UNUSED_CONTEXT;
6375 #endif /* DEBUGGING */
6379 - regprop - printable representation of opcode
6382 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6388 sv_setpvn(sv, "", 0);
6389 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6390 /* It would be nice to FAIL() here, but this may be called from
6391 regexec.c, and it would be hard to supply pRExC_state. */
6392 Perl_croak(aTHX_ "Corrupted regexp opcode");
6393 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6395 k = PL_regkind[OP(o)];
6398 SV * const dsv = sv_2mortal(newSVpvs(""));
6399 /* Using is_utf8_string() is a crude hack but it may
6400 * be the best for now since we have no flag "this EXACTish
6401 * node was UTF-8" --jhi */
6402 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6403 const char * const s = do_utf8 ?
6404 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6405 UNI_DISPLAY_REGEX) :
6407 const int len = do_utf8 ?
6410 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6414 } else if (k == TRIE) {
6415 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6416 /* print the details of the trie in dumpuntil instead, as
6417 * prog->data isn't available here */
6418 } else if (k == CURLY) {
6419 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6420 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6421 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6423 else if (k == WHILEM && o->flags) /* Ordinal/of */
6424 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6425 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6426 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6427 else if (k == LOGICAL)
6428 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6429 else if (k == ANYOF) {
6430 int i, rangestart = -1;
6431 const U8 flags = ANYOF_FLAGS(o);
6433 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6434 static const char * const anyofs[] = {
6467 if (flags & ANYOF_LOCALE)
6468 sv_catpvs(sv, "{loc}");
6469 if (flags & ANYOF_FOLD)
6470 sv_catpvs(sv, "{i}");
6471 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6472 if (flags & ANYOF_INVERT)
6474 for (i = 0; i <= 256; i++) {
6475 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6476 if (rangestart == -1)
6478 } else if (rangestart != -1) {
6479 if (i <= rangestart + 3)
6480 for (; rangestart < i; rangestart++)
6481 put_byte(sv, rangestart);
6483 put_byte(sv, rangestart);
6485 put_byte(sv, i - 1);
6491 if (o->flags & ANYOF_CLASS)
6492 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6493 if (ANYOF_CLASS_TEST(o,i))
6494 sv_catpv(sv, anyofs[i]);
6496 if (flags & ANYOF_UNICODE)
6497 sv_catpvs(sv, "{unicode}");
6498 else if (flags & ANYOF_UNICODE_ALL)
6499 sv_catpvs(sv, "{unicode_all}");
6503 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6507 U8 s[UTF8_MAXBYTES_CASE+1];
6509 for (i = 0; i <= 256; i++) { /* just the first 256 */
6510 uvchr_to_utf8(s, i);
6512 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6513 if (rangestart == -1)
6515 } else if (rangestart != -1) {
6516 if (i <= rangestart + 3)
6517 for (; rangestart < i; rangestart++) {
6518 const U8 * const e = uvchr_to_utf8(s,rangestart);
6520 for(p = s; p < e; p++)
6524 const U8 *e = uvchr_to_utf8(s,rangestart);
6526 for (p = s; p < e; p++)
6529 e = uvchr_to_utf8(s, i-1);
6530 for (p = s; p < e; p++)
6537 sv_catpvs(sv, "..."); /* et cetera */
6541 char *s = savesvpv(lv);
6542 char * const origs = s;
6544 while (*s && *s != '\n')
6548 const char * const t = ++s;
6566 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6568 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6569 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6571 PERL_UNUSED_CONTEXT;
6572 PERL_UNUSED_ARG(sv);
6574 #endif /* DEBUGGING */
6578 Perl_re_intuit_string(pTHX_ regexp *prog)
6579 { /* Assume that RE_INTUIT is set */
6581 GET_RE_DEBUG_FLAGS_DECL;
6582 PERL_UNUSED_CONTEXT;
6586 const char * const s = SvPV_nolen_const(prog->check_substr
6587 ? prog->check_substr : prog->check_utf8);
6589 if (!PL_colorset) reginitcolors();
6590 PerlIO_printf(Perl_debug_log,
6591 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6593 prog->check_substr ? "" : "utf8 ",
6594 PL_colors[5],PL_colors[0],
6597 (strlen(s) > 60 ? "..." : ""));
6600 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6604 Perl_pregfree(pTHX_ struct regexp *r)
6608 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6610 GET_RE_DEBUG_FLAGS_DECL;
6612 if (!r || (--r->refcnt > 0))
6614 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
6615 const char * const s = (r->reganch & ROPT_UTF8)
6616 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6617 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6618 const int len = SvCUR(dsv);
6621 PerlIO_printf(Perl_debug_log,
6622 "%sFreeing REx:%s %s%*.*s%s%s\n",
6623 PL_colors[4],PL_colors[5],PL_colors[0],
6626 len > 60 ? "..." : "");
6629 /* gcov results gave these as non-null 100% of the time, so there's no
6630 optimisation in checking them before calling Safefree */
6631 Safefree(r->precomp);
6632 Safefree(r->offsets); /* 20010421 MJD */
6633 RX_MATCH_COPY_FREE(r);
6634 #ifdef PERL_OLD_COPY_ON_WRITE
6636 SvREFCNT_dec(r->saved_copy);
6639 if (r->anchored_substr)
6640 SvREFCNT_dec(r->anchored_substr);
6641 if (r->anchored_utf8)
6642 SvREFCNT_dec(r->anchored_utf8);
6643 if (r->float_substr)
6644 SvREFCNT_dec(r->float_substr);
6646 SvREFCNT_dec(r->float_utf8);
6647 Safefree(r->substrs);
6650 int n = r->data->count;
6651 PAD* new_comppad = NULL;
6656 /* If you add a ->what type here, update the comment in regcomp.h */
6657 switch (r->data->what[n]) {
6659 SvREFCNT_dec((SV*)r->data->data[n]);
6662 Safefree(r->data->data[n]);
6665 new_comppad = (AV*)r->data->data[n];
6668 if (new_comppad == NULL)
6669 Perl_croak(aTHX_ "panic: pregfree comppad");
6670 PAD_SAVE_LOCAL(old_comppad,
6671 /* Watch out for global destruction's random ordering. */
6672 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6675 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6678 op_free((OP_4tree*)r->data->data[n]);
6680 PAD_RESTORE_LOCAL(old_comppad);
6681 SvREFCNT_dec((SV*)new_comppad);
6687 { /* Aho Corasick add-on structure for a trie node.
6688 Used in stclass optimization only */
6690 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6692 refcount = --aho->refcount;
6695 Safefree(aho->states);
6696 Safefree(aho->fail);
6697 aho->trie=NULL; /* not necessary to free this as it is
6698 handled by the 't' case */
6699 Safefree(r->data->data[n]); /* do this last!!!! */
6700 Safefree(r->regstclass);
6706 /* trie structure. */
6708 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6710 refcount = --trie->refcount;
6713 Safefree(trie->charmap);
6714 if (trie->widecharmap)
6715 SvREFCNT_dec((SV*)trie->widecharmap);
6716 Safefree(trie->states);
6717 Safefree(trie->trans);
6719 Safefree(trie->bitmap);
6721 Safefree(trie->wordlen);
6725 SvREFCNT_dec((SV*)trie->words);
6726 if (trie->revcharmap)
6727 SvREFCNT_dec((SV*)trie->revcharmap);
6730 Safefree(r->data->data[n]); /* do this last!!!! */
6735 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6738 Safefree(r->data->what);
6741 Safefree(r->startp);
6746 #ifndef PERL_IN_XSUB_RE
6748 - regnext - dig the "next" pointer out of a node
6751 Perl_regnext(pTHX_ register regnode *p)
6754 register I32 offset;
6756 if (p == &PL_regdummy)
6759 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6768 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6771 STRLEN l1 = strlen(pat1);
6772 STRLEN l2 = strlen(pat2);
6775 const char *message;
6781 Copy(pat1, buf, l1 , char);
6782 Copy(pat2, buf + l1, l2 , char);
6783 buf[l1 + l2] = '\n';
6784 buf[l1 + l2 + 1] = '\0';
6786 /* ANSI variant takes additional second argument */
6787 va_start(args, pat2);
6791 msv = vmess(buf, &args);
6793 message = SvPV_const(msv,l1);
6796 Copy(message, buf, l1 , char);
6797 buf[l1-1] = '\0'; /* Overwrite \n */
6798 Perl_croak(aTHX_ "%s", buf);
6801 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6803 #ifndef PERL_IN_XSUB_RE
6805 Perl_save_re_context(pTHX)
6809 struct re_save_state *state;
6811 SAVEVPTR(PL_curcop);
6812 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6814 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6815 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6816 SSPUSHINT(SAVEt_RE_STATE);
6818 Copy(&PL_reg_state, state, 1, struct re_save_state);
6820 PL_reg_start_tmp = 0;
6821 PL_reg_start_tmpl = 0;
6822 PL_reg_oldsaved = NULL;
6823 PL_reg_oldsavedlen = 0;
6825 PL_reg_leftiter = 0;
6826 PL_reg_poscache = NULL;
6827 PL_reg_poscache_size = 0;
6828 #ifdef PERL_OLD_COPY_ON_WRITE
6832 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6834 const REGEXP * const rx = PM_GETRE(PL_curpm);
6837 for (i = 1; i <= rx->nparens; i++) {
6838 char digits[TYPE_CHARS(long)];
6839 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6840 GV *const *const gvp
6841 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6844 GV * const gv = *gvp;
6845 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6855 clear_re(pTHX_ void *r)
6858 ReREFCNT_dec((regexp *)r);
6864 S_put_byte(pTHX_ SV *sv, int c)
6866 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6867 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6868 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6869 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6871 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6874 #define CLEAR_OPTSTART \
6875 if (optstart) STMT_START { \
6876 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6880 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6882 STATIC const regnode *
6883 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6884 const regnode *last, SV* sv, I32 l)
6887 register U8 op = EXACT; /* Arbitrary non-END op. */
6888 register const regnode *next;
6889 const regnode *optstart= NULL;
6890 GET_RE_DEBUG_FLAGS_DECL;
6892 while (op != END && (!last || node < last)) {
6893 /* While that wasn't END last time... */
6899 next = regnext((regnode *)node);
6902 if (OP(node) == OPTIMIZED) {
6903 if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE_OPTIMISE))
6910 regprop(r, sv, node);
6911 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6912 (int)(2*l + 1), "", SvPVX_const(sv));
6914 if (OP(node) != OPTIMIZED) {
6915 if (next == NULL) /* Next ptr. */
6916 PerlIO_printf(Perl_debug_log, "(0)");
6918 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6919 (void)PerlIO_putc(Perl_debug_log, '\n');
6923 if (PL_regkind[(U8)op] == BRANCHJ) {
6926 register const regnode *nnode = (OP(next) == LONGJMP
6927 ? regnext((regnode *)next)
6929 if (last && nnode > last)
6931 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6934 else if (PL_regkind[(U8)op] == BRANCH) {
6936 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6938 else if ( PL_regkind[(U8)op] == TRIE ) {
6939 const I32 n = ARG(node);
6940 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6941 const I32 arry_len = av_len(trie->words)+1;
6943 PerlIO_printf(Perl_debug_log,
6944 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
6948 TRIE_WORDCOUNT(trie),
6949 (int)TRIE_CHARCOUNT(trie),
6950 trie->uniquecharcount,
6951 (IV)TRIE_LASTSTATE(trie)-1,
6958 sv_setpvn(sv, "", 0);
6959 for (i = 0; i <= 256; i++) {
6960 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6961 if (rangestart == -1)
6963 } else if (rangestart != -1) {
6964 if (i <= rangestart + 3)
6965 for (; rangestart < i; rangestart++)
6966 put_byte(sv, rangestart);
6968 put_byte(sv, rangestart);
6970 put_byte(sv, i - 1);
6975 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
6977 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
6979 for (word_idx=0; word_idx < arry_len; word_idx++) {
6980 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6982 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6985 SvPV_nolen_const(*elem_ptr),
6991 node = NEXTOPER(node);
6992 node += regarglen[(U8)op];
6995 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6996 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6997 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6999 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7001 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7004 else if ( op == PLUS || op == STAR) {
7005 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
7007 else if (op == ANYOF) {
7008 /* arglen 1 + class block */
7009 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7010 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7011 node = NEXTOPER(node);
7013 else if (PL_regkind[(U8)op] == EXACT) {
7014 /* Literal string, where present. */
7015 node += NODE_SZ_STR(node) - 1;
7016 node = NEXTOPER(node);
7019 node = NEXTOPER(node);
7020 node += regarglen[(U8)op];
7022 if (op == CURLYX || op == OPEN)
7024 else if (op == WHILEM)
7031 #endif /* DEBUGGING */
7035 * c-indentation-style: bsd
7037 * indent-tabs-mode: t
7040 * ex: set ts=8 sts=4 sw=4 noet: