5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
760 #define TRIE_READ_CHAR STMT_START { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
785 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
801 #define TRIE_LIST_NEW(state) STMT_START { \
802 Newxz( trie->states[ state ].trans.list, \
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
808 #define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
812 trie->wordlen[ curword ] = wordlen; \
813 trie->states[ state ].wordnum = ++curword; \
815 /* store the word for dumping */ \
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
820 tmp = newSVpvn( "", 0 ); \
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
825 NOOP; /* It's a dupe. So ignore it. */ \
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
835 These routines dump out a trie in a somewhat readable format.
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
852 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
855 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
909 PerlIO_printf( Perl_debug_log, "]");
912 PerlIO_printf( Perl_debug_log, "\n" );
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
923 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
932 for( state=1 ; state < next_alloc ; state ++ ) {
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
964 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
968 GET_RE_DEBUG_FLAGS_DECL;
971 print out the table precompression so that we can do a visual check
972 that they are identical.
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1013 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1023 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1025 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
1051 U32 base = trie->states[ 1 ].trans.base;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1055 GET_RE_DEBUG_FLAGS_DECL;
1057 ARG_SET( stclass, data_slot );
1058 Newxz( aho, 1, reg_ac_data );
1059 RExC_rx->data->data[ data_slot ] = (void*)aho;
1061 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1062 (trie->laststate+1)*sizeof(reg_trie_state));
1063 Newxz( q, numstates, U32);
1064 Newxz( aho->fail, numstates, U32 );
1066 fail[ 0 ] = fail[ 1 ] = 1;
1068 for ( charid = 0; charid < ucharcount ; charid++ ) {
1069 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1071 q[ q_write ] = newstate;
1072 /* set to point at the root */
1073 fail[ q[ q_write++ ] ]=1;
1076 while ( q_read < q_write) {
1077 const U32 cur = q[ q_read++ % numstates ];
1078 base = trie->states[ cur ].trans.base;
1080 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1081 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1083 U32 fail_state = cur;
1086 fail_state = fail[ fail_state ];
1087 fail_base = aho->states[ fail_state ].trans.base;
1088 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1090 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1091 fail[ ch_state ] = fail_state;
1092 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1094 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1096 q[ q_write++ % numstates] = ch_state;
1101 DEBUG_TRIE_COMPILE_MORE_r({
1102 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1103 for( q_read=2; q_read<numstates; q_read++ ) {
1104 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1106 PerlIO_printf(Perl_debug_log, "\n");
1109 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1115 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1118 /* first pass, loop through and scan words */
1119 reg_trie_data *trie;
1121 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1126 /* we just use folder as a flag in utf8 */
1127 const U8 * const folder = ( flags == EXACTF
1129 : ( flags == EXACTFL
1135 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1136 SV *re_trie_maxbuff;
1138 /* these are only used during construction but are useful during
1139 * debugging so we store them in the struct when debugging.
1140 * Wordcount is actually superfluous in debugging as we have
1141 * (AV*)trie->words to use for it, but that's not available when
1142 * not debugging... We could make the macro use the AV during
1143 * debugging though...
1145 U16 trie_wordcount=0;
1146 STRLEN trie_charcount=0;
1147 /*U32 trie_laststate=0;*/
1148 AV *trie_revcharmap;
1150 GET_RE_DEBUG_FLAGS_DECL;
1152 Newxz( trie, 1, reg_trie_data );
1154 trie->startstate = 1;
1155 RExC_rx->data->data[ data_slot ] = (void*)trie;
1156 Newxz( trie->charmap, 256, U16 );
1157 if (!(UTF && folder))
1158 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1160 trie->words = newAV();
1162 TRIE_REVCHARMAP(trie) = newAV();
1164 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1165 if (!SvIOK(re_trie_maxbuff)) {
1166 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1169 PerlIO_printf( Perl_debug_log,
1170 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1171 (int)depth * 2 + 2, "",
1172 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1173 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1175 /* -- First loop and Setup --
1177 We first traverse the branches and scan each word to determine if it
1178 contains widechars, and how many unique chars there are, this is
1179 important as we have to build a table with at least as many columns as we
1182 We use an array of integers to represent the character codes 0..255
1183 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1184 native representation of the character value as the key and IV's for the
1187 *TODO* If we keep track of how many times each character is used we can
1188 remap the columns so that the table compression later on is more
1189 efficient in terms of memory by ensuring most common value is in the
1190 middle and the least common are on the outside. IMO this would be better
1191 than a most to least common mapping as theres a decent chance the most
1192 common letter will share a node with the least common, meaning the node
1193 will not be compressable. With a middle is most common approach the worst
1194 case is when we have the least common nodes twice.
1198 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1199 regnode * const noper = NEXTOPER( cur );
1200 const U8 *uc = (U8*)STRING( noper );
1201 const U8 * const e = uc + STR_LEN( noper );
1203 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1204 const U8 *scan = (U8*)NULL;
1205 U32 wordlen = 0; /* required init */
1208 TRIE_WORDCOUNT(trie)++;
1209 if (OP(noper) == NOTHING) {
1214 TRIE_BITMAP_SET(trie,*uc);
1215 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1217 for ( ; uc < e ; uc += len ) {
1218 TRIE_CHARCOUNT(trie)++;
1222 if ( !trie->charmap[ uvc ] ) {
1223 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1225 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1230 if ( !trie->widecharmap )
1231 trie->widecharmap = newHV();
1233 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1236 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1238 if ( !SvTRUE( *svpp ) ) {
1239 sv_setiv( *svpp, ++trie->uniquecharcount );
1244 if( cur == first ) {
1247 } else if (chars < trie->minlen) {
1249 } else if (chars > trie->maxlen) {
1253 } /* end first pass */
1254 DEBUG_TRIE_COMPILE_r(
1255 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1256 (int)depth * 2 + 2,"",
1257 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1258 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1259 (int)trie->minlen, (int)trie->maxlen )
1261 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1264 We now know what we are dealing with in terms of unique chars and
1265 string sizes so we can calculate how much memory a naive
1266 representation using a flat table will take. If it's over a reasonable
1267 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1268 conservative but potentially much slower representation using an array
1271 At the end we convert both representations into the same compressed
1272 form that will be used in regexec.c for matching with. The latter
1273 is a form that cannot be used to construct with but has memory
1274 properties similar to the list form and access properties similar
1275 to the table form making it both suitable for fast searches and
1276 small enough that its feasable to store for the duration of a program.
1278 See the comment in the code where the compressed table is produced
1279 inplace from the flat tabe representation for an explanation of how
1280 the compression works.
1285 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1287 Second Pass -- Array Of Lists Representation
1289 Each state will be represented by a list of charid:state records
1290 (reg_trie_trans_le) the first such element holds the CUR and LEN
1291 points of the allocated array. (See defines above).
1293 We build the initial structure using the lists, and then convert
1294 it into the compressed table form which allows faster lookups
1295 (but cant be modified once converted).
1298 STRLEN transcount = 1;
1300 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1304 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1306 regnode * const noper = NEXTOPER( cur );
1307 U8 *uc = (U8*)STRING( noper );
1308 const U8 * const e = uc + STR_LEN( noper );
1309 U32 state = 1; /* required init */
1310 U16 charid = 0; /* sanity init */
1311 U8 *scan = (U8*)NULL; /* sanity init */
1312 STRLEN foldlen = 0; /* required init */
1313 U32 wordlen = 0; /* required init */
1314 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1316 if (OP(noper) != NOTHING) {
1317 for ( ; uc < e ; uc += len ) {
1322 charid = trie->charmap[ uvc ];
1324 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1328 charid=(U16)SvIV( *svpp );
1337 if ( !trie->states[ state ].trans.list ) {
1338 TRIE_LIST_NEW( state );
1340 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1341 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1342 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1347 newstate = next_alloc++;
1348 TRIE_LIST_PUSH( state, charid, newstate );
1353 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1355 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1358 TRIE_HANDLE_WORD(state);
1360 } /* end second pass */
1362 TRIE_LASTSTATE(trie) = next_alloc;
1363 Renew( trie->states, next_alloc, reg_trie_state );
1365 /* and now dump it out before we compress it */
1366 DEBUG_TRIE_COMPILE_MORE_r(
1367 dump_trie_interim_list(trie,next_alloc,depth+1)
1370 Newxz( trie->trans, transcount ,reg_trie_trans );
1377 for( state=1 ; state < next_alloc ; state ++ ) {
1381 DEBUG_TRIE_COMPILE_MORE_r(
1382 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1386 if (trie->states[state].trans.list) {
1387 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1391 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1392 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1393 if ( forid < minid ) {
1395 } else if ( forid > maxid ) {
1399 if ( transcount < tp + maxid - minid + 1) {
1401 Renew( trie->trans, transcount, reg_trie_trans );
1402 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1404 base = trie->uniquecharcount + tp - minid;
1405 if ( maxid == minid ) {
1407 for ( ; zp < tp ; zp++ ) {
1408 if ( ! trie->trans[ zp ].next ) {
1409 base = trie->uniquecharcount + zp - minid;
1410 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1411 trie->trans[ zp ].check = state;
1417 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1418 trie->trans[ tp ].check = state;
1423 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1424 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1425 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1426 trie->trans[ tid ].check = state;
1428 tp += ( maxid - minid + 1 );
1430 Safefree(trie->states[ state ].trans.list);
1433 DEBUG_TRIE_COMPILE_MORE_r(
1434 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1437 trie->states[ state ].trans.base=base;
1439 trie->lasttrans = tp + 1;
1443 Second Pass -- Flat Table Representation.
1445 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1446 We know that we will need Charcount+1 trans at most to store the data
1447 (one row per char at worst case) So we preallocate both structures
1448 assuming worst case.
1450 We then construct the trie using only the .next slots of the entry
1453 We use the .check field of the first entry of the node temporarily to
1454 make compression both faster and easier by keeping track of how many non
1455 zero fields are in the node.
1457 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1460 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1461 number representing the first entry of the node, and state as a
1462 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1463 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1464 are 2 entrys per node. eg:
1472 The table is internally in the right hand, idx form. However as we also
1473 have to deal with the states array which is indexed by nodenum we have to
1474 use TRIE_NODENUM() to convert.
1479 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1481 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1482 next_alloc = trie->uniquecharcount + 1;
1485 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1487 regnode * const noper = NEXTOPER( cur );
1488 const U8 *uc = (U8*)STRING( noper );
1489 const U8 * const e = uc + STR_LEN( noper );
1491 U32 state = 1; /* required init */
1493 U16 charid = 0; /* sanity init */
1494 U32 accept_state = 0; /* sanity init */
1495 U8 *scan = (U8*)NULL; /* sanity init */
1497 STRLEN foldlen = 0; /* required init */
1498 U32 wordlen = 0; /* required init */
1499 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1501 if ( OP(noper) != NOTHING ) {
1502 for ( ; uc < e ; uc += len ) {
1507 charid = trie->charmap[ uvc ];
1509 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1510 charid = svpp ? (U16)SvIV(*svpp) : 0;
1514 if ( !trie->trans[ state + charid ].next ) {
1515 trie->trans[ state + charid ].next = next_alloc;
1516 trie->trans[ state ].check++;
1517 next_alloc += trie->uniquecharcount;
1519 state = trie->trans[ state + charid ].next;
1521 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1523 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1526 accept_state = TRIE_NODENUM( state );
1527 TRIE_HANDLE_WORD(accept_state);
1529 } /* end second pass */
1531 /* and now dump it out before we compress it */
1532 DEBUG_TRIE_COMPILE_MORE_r(
1533 dump_trie_interim_table(trie,next_alloc,depth+1)
1538 * Inplace compress the table.*
1540 For sparse data sets the table constructed by the trie algorithm will
1541 be mostly 0/FAIL transitions or to put it another way mostly empty.
1542 (Note that leaf nodes will not contain any transitions.)
1544 This algorithm compresses the tables by eliminating most such
1545 transitions, at the cost of a modest bit of extra work during lookup:
1547 - Each states[] entry contains a .base field which indicates the
1548 index in the state[] array wheres its transition data is stored.
1550 - If .base is 0 there are no valid transitions from that node.
1552 - If .base is nonzero then charid is added to it to find an entry in
1555 -If trans[states[state].base+charid].check!=state then the
1556 transition is taken to be a 0/Fail transition. Thus if there are fail
1557 transitions at the front of the node then the .base offset will point
1558 somewhere inside the previous nodes data (or maybe even into a node
1559 even earlier), but the .check field determines if the transition is
1562 The following process inplace converts the table to the compressed
1563 table: We first do not compress the root node 1,and mark its all its
1564 .check pointers as 1 and set its .base pointer as 1 as well. This
1565 allows to do a DFA construction from the compressed table later, and
1566 ensures that any .base pointers we calculate later are greater than
1569 - We set 'pos' to indicate the first entry of the second node.
1571 - We then iterate over the columns of the node, finding the first and
1572 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1573 and set the .check pointers accordingly, and advance pos
1574 appropriately and repreat for the next node. Note that when we copy
1575 the next pointers we have to convert them from the original
1576 NODEIDX form to NODENUM form as the former is not valid post
1579 - If a node has no transitions used we mark its base as 0 and do not
1580 advance the pos pointer.
1582 - If a node only has one transition we use a second pointer into the
1583 structure to fill in allocated fail transitions from other states.
1584 This pointer is independent of the main pointer and scans forward
1585 looking for null transitions that are allocated to a state. When it
1586 finds one it writes the single transition into the "hole". If the
1587 pointer doesnt find one the single transition is appeneded as normal.
1589 - Once compressed we can Renew/realloc the structures to release the
1592 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1593 specifically Fig 3.47 and the associated pseudocode.
1597 const U32 laststate = TRIE_NODENUM( next_alloc );
1600 TRIE_LASTSTATE(trie) = laststate;
1602 for ( state = 1 ; state < laststate ; state++ ) {
1604 const U32 stateidx = TRIE_NODEIDX( state );
1605 const U32 o_used = trie->trans[ stateidx ].check;
1606 U32 used = trie->trans[ stateidx ].check;
1607 trie->trans[ stateidx ].check = 0;
1609 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1610 if ( flag || trie->trans[ stateidx + charid ].next ) {
1611 if ( trie->trans[ stateidx + charid ].next ) {
1613 for ( ; zp < pos ; zp++ ) {
1614 if ( ! trie->trans[ zp ].next ) {
1618 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1619 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1620 trie->trans[ zp ].check = state;
1621 if ( ++zp > pos ) pos = zp;
1628 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1630 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1631 trie->trans[ pos ].check = state;
1636 trie->lasttrans = pos + 1;
1637 Renew( trie->states, laststate + 1, reg_trie_state);
1638 DEBUG_TRIE_COMPILE_MORE_r(
1639 PerlIO_printf( Perl_debug_log,
1640 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1641 (int)depth * 2 + 2,"",
1642 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1645 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1648 } /* end table compress */
1650 /* resize the trans array to remove unused space */
1651 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1653 /* and now dump out the compressed format */
1654 DEBUG_TRIE_COMPILE_r(
1655 dump_trie(trie,depth+1)
1658 { /* Modify the program and insert the new TRIE node*/
1660 U8 nodetype =(U8)(flags & 0xFF);
1667 This means we convert either the first branch or the first Exact,
1668 depending on whether the thing following (in 'last') is a branch
1669 or not and whther first is the startbranch (ie is it a sub part of
1670 the alternation or is it the whole thing.)
1671 Assuming its a sub part we conver the EXACT otherwise we convert
1672 the whole branch sequence, including the first.
1674 /* Find the node we are going to overwrite */
1675 if ( first == startbranch && OP( last ) != BRANCH ) {
1676 /* whole branch chain */
1679 const regnode *nop = NEXTOPER( convert );
1680 mjd_offset= Node_Offset((nop));
1681 mjd_nodelen= Node_Length((nop));
1684 /* branch sub-chain */
1685 convert = NEXTOPER( first );
1686 NEXT_OFF( first ) = (U16)(last - first);
1688 mjd_offset= Node_Offset((convert));
1689 mjd_nodelen= Node_Length((convert));
1693 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1694 (int)depth * 2 + 2, "",
1695 mjd_offset,mjd_nodelen)
1698 /* But first we check to see if there is a common prefix we can
1699 split out as an EXACT and put in front of the TRIE node. */
1700 trie->startstate= 1;
1701 if ( trie->bitmap && !trie->widecharmap ) {
1704 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1705 (int)depth * 2 + 2, "",
1706 TRIE_LASTSTATE(trie))
1708 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1712 const U32 base = trie->states[ state ].trans.base;
1714 if ( trie->states[state].wordnum )
1717 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1718 if ( ( base + ofs >= trie->uniquecharcount ) &&
1719 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1720 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1722 if ( ++count > 1 ) {
1723 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1724 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1725 if ( state == 1 ) break;
1727 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1729 PerlIO_printf(Perl_debug_log,
1730 "%*sNew Start State=%"UVuf" Class: [",
1731 (int)depth * 2 + 2, "",
1734 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1735 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1737 TRIE_BITMAP_SET(trie,*ch);
1739 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1741 PerlIO_printf(Perl_debug_log, (char*)ch)
1745 TRIE_BITMAP_SET(trie,*ch);
1747 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1748 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1754 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1755 const char *ch = SvPV_nolen_const( *tmp );
1757 PerlIO_printf( Perl_debug_log,
1758 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1759 (int)depth * 2 + 2, "",
1763 OP( convert ) = nodetype;
1764 str=STRING(convert);
1772 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1777 regnode *n = convert+NODE_SZ_STR(convert);
1778 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1779 trie->startstate = state;
1780 trie->minlen -= (state - 1);
1781 trie->maxlen -= (state - 1);
1783 regnode *fix = convert;
1785 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1786 while( ++fix < n ) {
1787 Set_Node_Offset_Length(fix, 0, 0);
1793 NEXT_OFF(convert) = (U16)(tail - convert);
1797 if ( trie->maxlen ) {
1798 OP( convert ) = TRIE;
1799 NEXT_OFF( convert ) = (U16)(tail - convert);
1800 ARG_SET( convert, data_slot );
1802 /* store the type in the flags */
1803 convert->flags = nodetype;
1804 /* XXX We really should free up the resource in trie now, as we wont use them */
1806 /* needed for dumping*/
1808 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1809 regnode *opt = convert;
1810 while (++opt<optimize) {
1811 Set_Node_Offset_Length(opt,0,0);
1813 /* We now need to mark all of the space originally used by the
1814 branches as optimized away. This keeps the dumpuntil from
1815 throwing a wobbly as it doesnt use regnext() to traverse the
1817 We also "fix" the offsets
1819 while( optimize < last ) {
1820 mjd_nodelen += Node_Length((optimize));
1821 OP( optimize ) = OPTIMIZED;
1822 Set_Node_Offset_Length(optimize,0,0);
1825 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1827 } /* end node insert */
1829 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1835 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1836 * These need to be revisited when a newer toolchain becomes available.
1838 #if defined(__sparc64__) && defined(__GNUC__)
1839 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1840 # undef SPARC64_GCC_WORKAROUND
1841 # define SPARC64_GCC_WORKAROUND 1
1845 #define DEBUG_PEEP(str,scan,depth) \
1846 DEBUG_OPTIMISE_r({ \
1847 SV * const mysv=sv_newmortal(); \
1848 regnode *Next = regnext(scan); \
1849 regprop(RExC_rx, mysv, scan); \
1850 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1851 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1852 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1855 #define JOIN_EXACT(scan,min,flags) \
1856 if (PL_regkind[OP(scan)] == EXACT) \
1857 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1860 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1861 /* Merge several consecutive EXACTish nodes into one. */
1862 regnode *n = regnext(scan);
1864 regnode *next = scan + NODE_SZ_STR(scan);
1868 regnode *stop = scan;
1870 GET_RE_DEBUG_FLAGS_DECL;
1871 DEBUG_PEEP("join",scan,depth);
1873 /* Skip NOTHING, merge EXACT*. */
1875 ( PL_regkind[OP(n)] == NOTHING ||
1876 (stringok && (OP(n) == OP(scan))))
1878 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1880 if (OP(n) == TAIL || n > next)
1882 if (PL_regkind[OP(n)] == NOTHING) {
1884 DEBUG_PEEP("skip:",n,depth);
1885 NEXT_OFF(scan) += NEXT_OFF(n);
1886 next = n + NODE_STEP_REGNODE;
1893 else if (stringok) {
1894 const int oldl = STR_LEN(scan);
1895 regnode * const nnext = regnext(n);
1897 DEBUG_PEEP("merg",n,depth);
1900 if (oldl + STR_LEN(n) > U8_MAX)
1902 NEXT_OFF(scan) += NEXT_OFF(n);
1903 STR_LEN(scan) += STR_LEN(n);
1904 next = n + NODE_SZ_STR(n);
1905 /* Now we can overwrite *n : */
1906 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1914 #ifdef EXPERIMENTAL_INPLACESCAN
1915 if (flags && !NEXT_OFF(n)) {
1916 DEBUG_PEEP("atch",val,depth);
1917 if (reg_off_by_arg[OP(n)]) {
1918 ARG_SET(n, val - n);
1921 NEXT_OFF(n) = val - n;
1928 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1930 Two problematic code points in Unicode casefolding of EXACT nodes:
1932 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1933 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1939 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1940 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1942 This means that in case-insensitive matching (or "loose matching",
1943 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1944 length of the above casefolded versions) can match a target string
1945 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1946 This would rather mess up the minimum length computation.
1948 What we'll do is to look for the tail four bytes, and then peek
1949 at the preceding two bytes to see whether we need to decrease
1950 the minimum length by four (six minus two).
1952 Thanks to the design of UTF-8, there cannot be false matches:
1953 A sequence of valid UTF-8 bytes cannot be a subsequence of
1954 another valid sequence of UTF-8 bytes.
1957 char * const s0 = STRING(scan), *s, *t;
1958 char * const s1 = s0 + STR_LEN(scan) - 1;
1959 char * const s2 = s1 - 4;
1960 const char t0[] = "\xcc\x88\xcc\x81";
1961 const char * const t1 = t0 + 3;
1964 s < s2 && (t = ninstr(s, s1, t0, t1));
1966 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1967 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1974 n = scan + NODE_SZ_STR(scan);
1976 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1983 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1987 /* REx optimizer. Converts nodes into quickier variants "in place".
1988 Finds fixed substrings. */
1990 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1991 to the position after last scanned or to NULL. */
1996 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1997 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1998 /* scanp: Start here (read-write). */
1999 /* deltap: Write maxlen-minlen here. */
2000 /* last: Stop before this one. */
2003 I32 min = 0, pars = 0, code;
2004 regnode *scan = *scanp, *next;
2006 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2007 int is_inf_internal = 0; /* The studied chunk is infinite */
2008 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2009 scan_data_t data_fake;
2010 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2011 SV *re_trie_maxbuff = NULL;
2013 GET_RE_DEBUG_FLAGS_DECL;
2015 while (scan && OP(scan) != END && scan < last) {
2016 /* Peephole optimizer: */
2017 DEBUG_PEEP("Peep",scan,depth);
2019 JOIN_EXACT(scan,&min,0);
2021 /* Follow the next-chain of the current node and optimize
2022 away all the NOTHINGs from it. */
2023 if (OP(scan) != CURLYX) {
2024 const int max = (reg_off_by_arg[OP(scan)]
2026 /* I32 may be smaller than U16 on CRAYs! */
2027 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2028 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2032 /* Skip NOTHING and LONGJMP. */
2033 while ((n = regnext(n))
2034 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2035 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2036 && off + noff < max)
2038 if (reg_off_by_arg[OP(scan)])
2041 NEXT_OFF(scan) = off;
2046 /* The principal pseudo-switch. Cannot be a switch, since we
2047 look into several different things. */
2048 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2049 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2050 next = regnext(scan);
2052 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2054 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2055 I32 max1 = 0, min1 = I32_MAX, num = 0;
2056 struct regnode_charclass_class accum;
2057 regnode * const startbranch=scan;
2059 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2060 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2061 if (flags & SCF_DO_STCLASS)
2062 cl_init_zero(pRExC_state, &accum);
2064 while (OP(scan) == code) {
2065 I32 deltanext, minnext, f = 0, fake;
2066 struct regnode_charclass_class this_class;
2069 data_fake.flags = 0;
2071 data_fake.whilem_c = data->whilem_c;
2072 data_fake.last_closep = data->last_closep;
2075 data_fake.last_closep = &fake;
2076 next = regnext(scan);
2077 scan = NEXTOPER(scan);
2079 scan = NEXTOPER(scan);
2080 if (flags & SCF_DO_STCLASS) {
2081 cl_init(pRExC_state, &this_class);
2082 data_fake.start_class = &this_class;
2083 f = SCF_DO_STCLASS_AND;
2085 if (flags & SCF_WHILEM_VISITED_POS)
2086 f |= SCF_WHILEM_VISITED_POS;
2088 /* we suppose the run is continuous, last=next...*/
2089 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2090 next, &data_fake, f,depth+1);
2093 if (max1 < minnext + deltanext)
2094 max1 = minnext + deltanext;
2095 if (deltanext == I32_MAX)
2096 is_inf = is_inf_internal = 1;
2098 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2101 if (data_fake.flags & SF_HAS_EVAL)
2102 data->flags |= SF_HAS_EVAL;
2103 data->whilem_c = data_fake.whilem_c;
2105 if (flags & SCF_DO_STCLASS)
2106 cl_or(pRExC_state, &accum, &this_class);
2107 if (code == SUSPEND)
2110 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2112 if (flags & SCF_DO_SUBSTR) {
2113 data->pos_min += min1;
2114 data->pos_delta += max1 - min1;
2115 if (max1 != min1 || is_inf)
2116 data->longest = &(data->longest_float);
2119 delta += max1 - min1;
2120 if (flags & SCF_DO_STCLASS_OR) {
2121 cl_or(pRExC_state, data->start_class, &accum);
2123 cl_and(data->start_class, &and_with);
2124 flags &= ~SCF_DO_STCLASS;
2127 else if (flags & SCF_DO_STCLASS_AND) {
2129 cl_and(data->start_class, &accum);
2130 flags &= ~SCF_DO_STCLASS;
2133 /* Switch to OR mode: cache the old value of
2134 * data->start_class */
2135 StructCopy(data->start_class, &and_with,
2136 struct regnode_charclass_class);
2137 flags &= ~SCF_DO_STCLASS_AND;
2138 StructCopy(&accum, data->start_class,
2139 struct regnode_charclass_class);
2140 flags |= SCF_DO_STCLASS_OR;
2141 data->start_class->flags |= ANYOF_EOS;
2147 Assuming this was/is a branch we are dealing with: 'scan' now
2148 points at the item that follows the branch sequence, whatever
2149 it is. We now start at the beginning of the sequence and look
2155 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2157 If we can find such a subseqence we need to turn the first
2158 element into a trie and then add the subsequent branch exact
2159 strings to the trie.
2163 1. patterns where the whole set of branch can be converted to a trie,
2165 2. patterns where only a subset of the alternations can be
2166 converted to a trie.
2168 In case 1 we can replace the whole set with a single regop
2169 for the trie. In case 2 we need to keep the start and end
2172 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2173 becomes BRANCH TRIE; BRANCH X;
2175 Hypthetically when we know the regex isnt anchored we can
2176 turn a case 1 into a DFA and let it rip... Every time it finds a match
2177 it would just call its tail, no WHILEM/CURLY needed.
2180 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2182 if (!re_trie_maxbuff) {
2183 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2184 if (!SvIOK(re_trie_maxbuff))
2185 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2187 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2189 regnode *first = (regnode *)NULL;
2190 regnode *last = (regnode *)NULL;
2191 regnode *tail = scan;
2196 SV * const mysv = sv_newmortal(); /* for dumping */
2198 /* var tail is used because there may be a TAIL
2199 regop in the way. Ie, the exacts will point to the
2200 thing following the TAIL, but the last branch will
2201 point at the TAIL. So we advance tail. If we
2202 have nested (?:) we may have to move through several
2206 while ( OP( tail ) == TAIL ) {
2207 /* this is the TAIL generated by (?:) */
2208 tail = regnext( tail );
2213 regprop(RExC_rx, mysv, tail );
2214 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2215 (int)depth * 2 + 2, "",
2216 "Looking for TRIE'able sequences. Tail node is: ",
2217 SvPV_nolen_const( mysv )
2223 step through the branches, cur represents each
2224 branch, noper is the first thing to be matched
2225 as part of that branch and noper_next is the
2226 regnext() of that node. if noper is an EXACT
2227 and noper_next is the same as scan (our current
2228 position in the regex) then the EXACT branch is
2229 a possible optimization target. Once we have
2230 two or more consequetive such branches we can
2231 create a trie of the EXACT's contents and stich
2232 it in place. If the sequence represents all of
2233 the branches we eliminate the whole thing and
2234 replace it with a single TRIE. If it is a
2235 subsequence then we need to stitch it in. This
2236 means the first branch has to remain, and needs
2237 to be repointed at the item on the branch chain
2238 following the last branch optimized. This could
2239 be either a BRANCH, in which case the
2240 subsequence is internal, or it could be the
2241 item following the branch sequence in which
2242 case the subsequence is at the end.
2246 /* dont use tail as the end marker for this traverse */
2247 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2248 regnode * const noper = NEXTOPER( cur );
2249 regnode * const noper_next = regnext( noper );
2252 regprop(RExC_rx, mysv, cur);
2253 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2254 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2256 regprop(RExC_rx, mysv, noper);
2257 PerlIO_printf( Perl_debug_log, " -> %s",
2258 SvPV_nolen_const(mysv));
2261 regprop(RExC_rx, mysv, noper_next );
2262 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2263 SvPV_nolen_const(mysv));
2265 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2266 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2268 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2269 : PL_regkind[ OP( noper ) ] == EXACT )
2270 || OP(noper) == NOTHING )
2271 && noper_next == tail && count<U16_MAX)
2274 if ( !first || optype == NOTHING ) {
2275 if (!first) first = cur;
2276 optype = OP( noper );
2282 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2284 if ( PL_regkind[ OP( noper ) ] == EXACT
2285 && noper_next == tail )
2289 optype = OP( noper );
2299 regprop(RExC_rx, mysv, cur);
2300 PerlIO_printf( Perl_debug_log,
2301 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2302 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2306 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2307 #ifdef TRIE_STUDY_OPT
2308 if ( made && startbranch == first ) {
2309 if ( OP(first)!=TRIE )
2310 flags |= SCF_EXACT_TRIE;
2312 regnode *chk=*scanp;
2313 while ( OP( chk ) == OPEN )
2314 chk = regnext( chk );
2316 flags |= SCF_EXACT_TRIE;
2325 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2326 scan = NEXTOPER(NEXTOPER(scan));
2327 } else /* single branch is optimized. */
2328 scan = NEXTOPER(scan);
2331 else if (OP(scan) == EXACT) {
2332 I32 l = STR_LEN(scan);
2335 const U8 * const s = (U8*)STRING(scan);
2336 l = utf8_length(s, s + l);
2337 uc = utf8_to_uvchr(s, NULL);
2339 uc = *((U8*)STRING(scan));
2342 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2343 /* The code below prefers earlier match for fixed
2344 offset, later match for variable offset. */
2345 if (data->last_end == -1) { /* Update the start info. */
2346 data->last_start_min = data->pos_min;
2347 data->last_start_max = is_inf
2348 ? I32_MAX : data->pos_min + data->pos_delta;
2350 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2352 SvUTF8_on(data->last_found);
2354 SV * const sv = data->last_found;
2355 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2356 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2357 if (mg && mg->mg_len >= 0)
2358 mg->mg_len += utf8_length((U8*)STRING(scan),
2359 (U8*)STRING(scan)+STR_LEN(scan));
2361 data->last_end = data->pos_min + l;
2362 data->pos_min += l; /* As in the first entry. */
2363 data->flags &= ~SF_BEFORE_EOL;
2365 if (flags & SCF_DO_STCLASS_AND) {
2366 /* Check whether it is compatible with what we know already! */
2370 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2371 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2372 && (!(data->start_class->flags & ANYOF_FOLD)
2373 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2376 ANYOF_CLASS_ZERO(data->start_class);
2377 ANYOF_BITMAP_ZERO(data->start_class);
2379 ANYOF_BITMAP_SET(data->start_class, uc);
2380 data->start_class->flags &= ~ANYOF_EOS;
2382 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2384 else if (flags & SCF_DO_STCLASS_OR) {
2385 /* false positive possible if the class is case-folded */
2387 ANYOF_BITMAP_SET(data->start_class, uc);
2389 data->start_class->flags |= ANYOF_UNICODE_ALL;
2390 data->start_class->flags &= ~ANYOF_EOS;
2391 cl_and(data->start_class, &and_with);
2393 flags &= ~SCF_DO_STCLASS;
2395 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2396 I32 l = STR_LEN(scan);
2397 UV uc = *((U8*)STRING(scan));
2399 /* Search for fixed substrings supports EXACT only. */
2400 if (flags & SCF_DO_SUBSTR) {
2402 scan_commit(pRExC_state, data);
2405 const U8 * const s = (U8 *)STRING(scan);
2406 l = utf8_length(s, s + l);
2407 uc = utf8_to_uvchr(s, NULL);
2410 if (flags & SCF_DO_SUBSTR)
2412 if (flags & SCF_DO_STCLASS_AND) {
2413 /* Check whether it is compatible with what we know already! */
2417 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2418 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2419 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2421 ANYOF_CLASS_ZERO(data->start_class);
2422 ANYOF_BITMAP_ZERO(data->start_class);
2424 ANYOF_BITMAP_SET(data->start_class, uc);
2425 data->start_class->flags &= ~ANYOF_EOS;
2426 data->start_class->flags |= ANYOF_FOLD;
2427 if (OP(scan) == EXACTFL)
2428 data->start_class->flags |= ANYOF_LOCALE;
2431 else if (flags & SCF_DO_STCLASS_OR) {
2432 if (data->start_class->flags & ANYOF_FOLD) {
2433 /* false positive possible if the class is case-folded.
2434 Assume that the locale settings are the same... */
2436 ANYOF_BITMAP_SET(data->start_class, uc);
2437 data->start_class->flags &= ~ANYOF_EOS;
2439 cl_and(data->start_class, &and_with);
2441 flags &= ~SCF_DO_STCLASS;
2443 #ifdef TRIE_STUDY_OPT
2444 else if (OP(scan) == TRIE) {
2445 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2446 min += trie->minlen;
2447 delta += (trie->maxlen - trie->minlen);
2448 flags &= ~SCF_DO_STCLASS; /* xxx */
2449 if (flags & SCF_DO_SUBSTR) {
2450 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2451 data->pos_min += trie->minlen;
2452 data->pos_delta += (trie->maxlen - trie->minlen);
2453 if (trie->maxlen != trie->minlen)
2454 data->longest = &(data->longest_float);
2458 else if (strchr((const char*)PL_varies,OP(scan))) {
2459 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2460 I32 f = flags, pos_before = 0;
2461 regnode * const oscan = scan;
2462 struct regnode_charclass_class this_class;
2463 struct regnode_charclass_class *oclass = NULL;
2464 I32 next_is_eval = 0;
2466 switch (PL_regkind[OP(scan)]) {
2467 case WHILEM: /* End of (?:...)* . */
2468 scan = NEXTOPER(scan);
2471 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2472 next = NEXTOPER(scan);
2473 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2475 maxcount = REG_INFTY;
2476 next = regnext(scan);
2477 scan = NEXTOPER(scan);
2481 if (flags & SCF_DO_SUBSTR)
2486 if (flags & SCF_DO_STCLASS) {
2488 maxcount = REG_INFTY;
2489 next = regnext(scan);
2490 scan = NEXTOPER(scan);
2493 is_inf = is_inf_internal = 1;
2494 scan = regnext(scan);
2495 if (flags & SCF_DO_SUBSTR) {
2496 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2497 data->longest = &(data->longest_float);
2499 goto optimize_curly_tail;
2501 mincount = ARG1(scan);
2502 maxcount = ARG2(scan);
2503 next = regnext(scan);
2504 if (OP(scan) == CURLYX) {
2505 I32 lp = (data ? *(data->last_closep) : 0);
2506 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2508 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2509 next_is_eval = (OP(scan) == EVAL);
2511 if (flags & SCF_DO_SUBSTR) {
2512 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2513 pos_before = data->pos_min;
2517 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2519 data->flags |= SF_IS_INF;
2521 if (flags & SCF_DO_STCLASS) {
2522 cl_init(pRExC_state, &this_class);
2523 oclass = data->start_class;
2524 data->start_class = &this_class;
2525 f |= SCF_DO_STCLASS_AND;
2526 f &= ~SCF_DO_STCLASS_OR;
2528 /* These are the cases when once a subexpression
2529 fails at a particular position, it cannot succeed
2530 even after backtracking at the enclosing scope.
2532 XXXX what if minimal match and we are at the
2533 initial run of {n,m}? */
2534 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2535 f &= ~SCF_WHILEM_VISITED_POS;
2537 /* This will finish on WHILEM, setting scan, or on NULL: */
2538 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2540 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2542 if (flags & SCF_DO_STCLASS)
2543 data->start_class = oclass;
2544 if (mincount == 0 || minnext == 0) {
2545 if (flags & SCF_DO_STCLASS_OR) {
2546 cl_or(pRExC_state, data->start_class, &this_class);
2548 else if (flags & SCF_DO_STCLASS_AND) {
2549 /* Switch to OR mode: cache the old value of
2550 * data->start_class */
2551 StructCopy(data->start_class, &and_with,
2552 struct regnode_charclass_class);
2553 flags &= ~SCF_DO_STCLASS_AND;
2554 StructCopy(&this_class, data->start_class,
2555 struct regnode_charclass_class);
2556 flags |= SCF_DO_STCLASS_OR;
2557 data->start_class->flags |= ANYOF_EOS;
2559 } else { /* Non-zero len */
2560 if (flags & SCF_DO_STCLASS_OR) {
2561 cl_or(pRExC_state, data->start_class, &this_class);
2562 cl_and(data->start_class, &and_with);
2564 else if (flags & SCF_DO_STCLASS_AND)
2565 cl_and(data->start_class, &this_class);
2566 flags &= ~SCF_DO_STCLASS;
2568 if (!scan) /* It was not CURLYX, but CURLY. */
2570 if ( /* ? quantifier ok, except for (?{ ... }) */
2571 (next_is_eval || !(mincount == 0 && maxcount == 1))
2572 && (minnext == 0) && (deltanext == 0)
2573 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2574 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2575 && ckWARN(WARN_REGEXP))
2578 "Quantifier unexpected on zero-length expression");
2581 min += minnext * mincount;
2582 is_inf_internal |= ((maxcount == REG_INFTY
2583 && (minnext + deltanext) > 0)
2584 || deltanext == I32_MAX);
2585 is_inf |= is_inf_internal;
2586 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2588 /* Try powerful optimization CURLYX => CURLYN. */
2589 if ( OP(oscan) == CURLYX && data
2590 && data->flags & SF_IN_PAR
2591 && !(data->flags & SF_HAS_EVAL)
2592 && !deltanext && minnext == 1 ) {
2593 /* Try to optimize to CURLYN. */
2594 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2595 regnode * const nxt1 = nxt;
2602 if (!strchr((const char*)PL_simple,OP(nxt))
2603 && !(PL_regkind[OP(nxt)] == EXACT
2604 && STR_LEN(nxt) == 1))
2610 if (OP(nxt) != CLOSE)
2612 /* Now we know that nxt2 is the only contents: */
2613 oscan->flags = (U8)ARG(nxt);
2615 OP(nxt1) = NOTHING; /* was OPEN. */
2617 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2618 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2619 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2620 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2621 OP(nxt + 1) = OPTIMIZED; /* was count. */
2622 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2627 /* Try optimization CURLYX => CURLYM. */
2628 if ( OP(oscan) == CURLYX && data
2629 && !(data->flags & SF_HAS_PAR)
2630 && !(data->flags & SF_HAS_EVAL)
2631 && !deltanext /* atom is fixed width */
2632 && minnext != 0 /* CURLYM can't handle zero width */
2634 /* XXXX How to optimize if data == 0? */
2635 /* Optimize to a simpler form. */
2636 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2640 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2641 && (OP(nxt2) != WHILEM))
2643 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2644 /* Need to optimize away parenths. */
2645 if (data->flags & SF_IN_PAR) {
2646 /* Set the parenth number. */
2647 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2649 if (OP(nxt) != CLOSE)
2650 FAIL("Panic opt close");
2651 oscan->flags = (U8)ARG(nxt);
2652 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2653 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2655 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2656 OP(nxt + 1) = OPTIMIZED; /* was count. */
2657 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2658 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2661 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2662 regnode *nnxt = regnext(nxt1);
2665 if (reg_off_by_arg[OP(nxt1)])
2666 ARG_SET(nxt1, nxt2 - nxt1);
2667 else if (nxt2 - nxt1 < U16_MAX)
2668 NEXT_OFF(nxt1) = nxt2 - nxt1;
2670 OP(nxt) = NOTHING; /* Cannot beautify */
2675 /* Optimize again: */
2676 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2682 else if ((OP(oscan) == CURLYX)
2683 && (flags & SCF_WHILEM_VISITED_POS)
2684 /* See the comment on a similar expression above.
2685 However, this time it not a subexpression
2686 we care about, but the expression itself. */
2687 && (maxcount == REG_INFTY)
2688 && data && ++data->whilem_c < 16) {
2689 /* This stays as CURLYX, we can put the count/of pair. */
2690 /* Find WHILEM (as in regexec.c) */
2691 regnode *nxt = oscan + NEXT_OFF(oscan);
2693 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2695 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2696 | (RExC_whilem_seen << 4)); /* On WHILEM */
2698 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2700 if (flags & SCF_DO_SUBSTR) {
2701 SV *last_str = NULL;
2702 int counted = mincount != 0;
2704 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2705 #if defined(SPARC64_GCC_WORKAROUND)
2708 const char *s = NULL;
2711 if (pos_before >= data->last_start_min)
2714 b = data->last_start_min;
2717 s = SvPV_const(data->last_found, l);
2718 old = b - data->last_start_min;
2721 I32 b = pos_before >= data->last_start_min
2722 ? pos_before : data->last_start_min;
2724 const char * const s = SvPV_const(data->last_found, l);
2725 I32 old = b - data->last_start_min;
2729 old = utf8_hop((U8*)s, old) - (U8*)s;
2732 /* Get the added string: */
2733 last_str = newSVpvn(s + old, l);
2735 SvUTF8_on(last_str);
2736 if (deltanext == 0 && pos_before == b) {
2737 /* What was added is a constant string */
2739 SvGROW(last_str, (mincount * l) + 1);
2740 repeatcpy(SvPVX(last_str) + l,
2741 SvPVX_const(last_str), l, mincount - 1);
2742 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2743 /* Add additional parts. */
2744 SvCUR_set(data->last_found,
2745 SvCUR(data->last_found) - l);
2746 sv_catsv(data->last_found, last_str);
2748 SV * sv = data->last_found;
2750 SvUTF8(sv) && SvMAGICAL(sv) ?
2751 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2752 if (mg && mg->mg_len >= 0)
2753 mg->mg_len += CHR_SVLEN(last_str);
2755 data->last_end += l * (mincount - 1);
2758 /* start offset must point into the last copy */
2759 data->last_start_min += minnext * (mincount - 1);
2760 data->last_start_max += is_inf ? I32_MAX
2761 : (maxcount - 1) * (minnext + data->pos_delta);
2764 /* It is counted once already... */
2765 data->pos_min += minnext * (mincount - counted);
2766 data->pos_delta += - counted * deltanext +
2767 (minnext + deltanext) * maxcount - minnext * mincount;
2768 if (mincount != maxcount) {
2769 /* Cannot extend fixed substrings found inside
2771 scan_commit(pRExC_state,data);
2772 if (mincount && last_str) {
2773 SV * const sv = data->last_found;
2774 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2775 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2779 sv_setsv(sv, last_str);
2780 data->last_end = data->pos_min;
2781 data->last_start_min =
2782 data->pos_min - CHR_SVLEN(last_str);
2783 data->last_start_max = is_inf
2785 : data->pos_min + data->pos_delta
2786 - CHR_SVLEN(last_str);
2788 data->longest = &(data->longest_float);
2790 SvREFCNT_dec(last_str);
2792 if (data && (fl & SF_HAS_EVAL))
2793 data->flags |= SF_HAS_EVAL;
2794 optimize_curly_tail:
2795 if (OP(oscan) != CURLYX) {
2796 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2798 NEXT_OFF(oscan) += NEXT_OFF(next);
2801 default: /* REF and CLUMP only? */
2802 if (flags & SCF_DO_SUBSTR) {
2803 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2804 data->longest = &(data->longest_float);
2806 is_inf = is_inf_internal = 1;
2807 if (flags & SCF_DO_STCLASS_OR)
2808 cl_anything(pRExC_state, data->start_class);
2809 flags &= ~SCF_DO_STCLASS;
2813 else if (strchr((const char*)PL_simple,OP(scan))) {
2816 if (flags & SCF_DO_SUBSTR) {
2817 scan_commit(pRExC_state,data);
2821 if (flags & SCF_DO_STCLASS) {
2822 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2824 /* Some of the logic below assumes that switching
2825 locale on will only add false positives. */
2826 switch (PL_regkind[OP(scan)]) {
2830 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2831 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2832 cl_anything(pRExC_state, data->start_class);
2835 if (OP(scan) == SANY)
2837 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2838 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2839 || (data->start_class->flags & ANYOF_CLASS));
2840 cl_anything(pRExC_state, data->start_class);
2842 if (flags & SCF_DO_STCLASS_AND || !value)
2843 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2846 if (flags & SCF_DO_STCLASS_AND)
2847 cl_and(data->start_class,
2848 (struct regnode_charclass_class*)scan);
2850 cl_or(pRExC_state, data->start_class,
2851 (struct regnode_charclass_class*)scan);
2854 if (flags & SCF_DO_STCLASS_AND) {
2855 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2856 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2857 for (value = 0; value < 256; value++)
2858 if (!isALNUM(value))
2859 ANYOF_BITMAP_CLEAR(data->start_class, value);
2863 if (data->start_class->flags & ANYOF_LOCALE)
2864 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2866 for (value = 0; value < 256; value++)
2868 ANYOF_BITMAP_SET(data->start_class, value);
2873 if (flags & SCF_DO_STCLASS_AND) {
2874 if (data->start_class->flags & ANYOF_LOCALE)
2875 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2878 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2879 data->start_class->flags |= ANYOF_LOCALE;
2883 if (flags & SCF_DO_STCLASS_AND) {
2884 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2885 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2886 for (value = 0; value < 256; value++)
2888 ANYOF_BITMAP_CLEAR(data->start_class, value);
2892 if (data->start_class->flags & ANYOF_LOCALE)
2893 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2895 for (value = 0; value < 256; value++)
2896 if (!isALNUM(value))
2897 ANYOF_BITMAP_SET(data->start_class, value);
2902 if (flags & SCF_DO_STCLASS_AND) {
2903 if (data->start_class->flags & ANYOF_LOCALE)
2904 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2907 data->start_class->flags |= ANYOF_LOCALE;
2908 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2912 if (flags & SCF_DO_STCLASS_AND) {
2913 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2914 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2915 for (value = 0; value < 256; value++)
2916 if (!isSPACE(value))
2917 ANYOF_BITMAP_CLEAR(data->start_class, value);
2921 if (data->start_class->flags & ANYOF_LOCALE)
2922 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2924 for (value = 0; value < 256; value++)
2926 ANYOF_BITMAP_SET(data->start_class, value);
2931 if (flags & SCF_DO_STCLASS_AND) {
2932 if (data->start_class->flags & ANYOF_LOCALE)
2933 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2936 data->start_class->flags |= ANYOF_LOCALE;
2937 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2941 if (flags & SCF_DO_STCLASS_AND) {
2942 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2943 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2944 for (value = 0; value < 256; value++)
2946 ANYOF_BITMAP_CLEAR(data->start_class, value);
2950 if (data->start_class->flags & ANYOF_LOCALE)
2951 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2953 for (value = 0; value < 256; value++)
2954 if (!isSPACE(value))
2955 ANYOF_BITMAP_SET(data->start_class, value);
2960 if (flags & SCF_DO_STCLASS_AND) {
2961 if (data->start_class->flags & ANYOF_LOCALE) {
2962 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2963 for (value = 0; value < 256; value++)
2964 if (!isSPACE(value))
2965 ANYOF_BITMAP_CLEAR(data->start_class, value);
2969 data->start_class->flags |= ANYOF_LOCALE;
2970 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2974 if (flags & SCF_DO_STCLASS_AND) {
2975 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2976 for (value = 0; value < 256; value++)
2977 if (!isDIGIT(value))
2978 ANYOF_BITMAP_CLEAR(data->start_class, value);
2981 if (data->start_class->flags & ANYOF_LOCALE)
2982 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2984 for (value = 0; value < 256; value++)
2986 ANYOF_BITMAP_SET(data->start_class, value);
2991 if (flags & SCF_DO_STCLASS_AND) {
2992 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2993 for (value = 0; value < 256; value++)
2995 ANYOF_BITMAP_CLEAR(data->start_class, value);
2998 if (data->start_class->flags & ANYOF_LOCALE)
2999 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3001 for (value = 0; value < 256; value++)
3002 if (!isDIGIT(value))
3003 ANYOF_BITMAP_SET(data->start_class, value);
3008 if (flags & SCF_DO_STCLASS_OR)
3009 cl_and(data->start_class, &and_with);
3010 flags &= ~SCF_DO_STCLASS;
3013 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3014 data->flags |= (OP(scan) == MEOL
3018 else if ( PL_regkind[OP(scan)] == BRANCHJ
3019 /* Lookbehind, or need to calculate parens/evals/stclass: */
3020 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3021 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3022 /* Lookahead/lookbehind */
3023 I32 deltanext, minnext, fake = 0;
3025 struct regnode_charclass_class intrnl;
3028 data_fake.flags = 0;
3030 data_fake.whilem_c = data->whilem_c;
3031 data_fake.last_closep = data->last_closep;
3034 data_fake.last_closep = &fake;
3035 if ( flags & SCF_DO_STCLASS && !scan->flags
3036 && OP(scan) == IFMATCH ) { /* Lookahead */
3037 cl_init(pRExC_state, &intrnl);
3038 data_fake.start_class = &intrnl;
3039 f |= SCF_DO_STCLASS_AND;
3041 if (flags & SCF_WHILEM_VISITED_POS)
3042 f |= SCF_WHILEM_VISITED_POS;
3043 next = regnext(scan);
3044 nscan = NEXTOPER(NEXTOPER(scan));
3045 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3048 vFAIL("Variable length lookbehind not implemented");
3050 else if (minnext > U8_MAX) {
3051 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3053 scan->flags = (U8)minnext;
3056 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3058 if (data_fake.flags & SF_HAS_EVAL)
3059 data->flags |= SF_HAS_EVAL;
3060 data->whilem_c = data_fake.whilem_c;
3062 if (f & SCF_DO_STCLASS_AND) {
3063 const int was = (data->start_class->flags & ANYOF_EOS);
3065 cl_and(data->start_class, &intrnl);
3067 data->start_class->flags |= ANYOF_EOS;
3070 else if (OP(scan) == OPEN) {
3073 else if (OP(scan) == CLOSE) {
3074 if ((I32)ARG(scan) == is_par) {
3075 next = regnext(scan);
3077 if ( next && (OP(next) != WHILEM) && next < last)
3078 is_par = 0; /* Disable optimization */
3081 *(data->last_closep) = ARG(scan);
3083 else if (OP(scan) == EVAL) {
3085 data->flags |= SF_HAS_EVAL;
3087 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3088 if (flags & SCF_DO_SUBSTR) {
3089 scan_commit(pRExC_state,data);
3090 data->longest = &(data->longest_float);
3092 is_inf = is_inf_internal = 1;
3093 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3094 cl_anything(pRExC_state, data->start_class);
3095 flags &= ~SCF_DO_STCLASS;
3097 /* Else: zero-length, ignore. */
3098 scan = regnext(scan);
3103 *deltap = is_inf_internal ? I32_MAX : delta;
3104 if (flags & SCF_DO_SUBSTR && is_inf)
3105 data->pos_delta = I32_MAX - data->pos_min;
3106 if (is_par > U8_MAX)
3108 if (is_par && pars==1 && data) {
3109 data->flags |= SF_IN_PAR;
3110 data->flags &= ~SF_HAS_PAR;
3112 else if (pars && data) {
3113 data->flags |= SF_HAS_PAR;
3114 data->flags &= ~SF_IN_PAR;
3116 if (flags & SCF_DO_STCLASS_OR)
3117 cl_and(data->start_class, &and_with);
3118 if (flags & SCF_EXACT_TRIE)
3119 data->flags |= SCF_EXACT_TRIE;
3124 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3126 if (RExC_rx->data) {
3127 Renewc(RExC_rx->data,
3128 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3129 char, struct reg_data);
3130 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3131 RExC_rx->data->count += n;
3134 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3135 char, struct reg_data);
3136 Newx(RExC_rx->data->what, n, U8);
3137 RExC_rx->data->count = n;
3139 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3140 return RExC_rx->data->count - n;
3143 #ifndef PERL_IN_XSUB_RE
3145 Perl_reginitcolors(pTHX)
3148 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3150 char *t = savepv(s);
3154 t = strchr(t, '\t');
3160 PL_colors[i] = t = (char *)"";
3165 PL_colors[i++] = (char *)"";
3173 - pregcomp - compile a regular expression into internal code
3175 * We can't allocate space until we know how big the compiled form will be,
3176 * but we can't compile it (and thus know how big it is) until we've got a
3177 * place to put the code. So we cheat: we compile it twice, once with code
3178 * generation turned off and size counting turned on, and once "for real".
3179 * This also means that we don't allocate space until we are sure that the
3180 * thing really will compile successfully, and we never have to move the
3181 * code and thus invalidate pointers into it. (Note that it has to be in
3182 * one piece because free() must be able to free it all.) [NB: not true in perl]
3184 * Beware that the optimization-preparation code in here knows about some
3185 * of the structure of the compiled regexp. [I'll say.]
3188 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3199 RExC_state_t RExC_state;
3200 RExC_state_t * const pRExC_state = &RExC_state;
3201 #ifdef TRIE_STUDY_OPT
3203 RExC_state_t copyRExC_state;
3206 GET_RE_DEBUG_FLAGS_DECL;
3209 FAIL("NULL regexp argument");
3211 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3214 DEBUG_r(if (!PL_colorset) reginitcolors());
3216 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3217 PL_colors[4],PL_colors[5],PL_colors[0],
3218 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3220 RExC_flags = pm->op_pmflags;
3224 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3225 RExC_seen_evals = 0;
3228 /* First pass: determine size, legality. */
3235 RExC_emit = &PL_regdummy;
3236 RExC_whilem_seen = 0;
3237 #if 0 /* REGC() is (currently) a NOP at the first pass.
3238 * Clever compilers notice this and complain. --jhi */
3239 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3241 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3242 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3243 RExC_precomp = NULL;
3246 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3247 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3248 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3251 RExC_lastparse=NULL;
3255 /* Small enough for pointer-storage convention?
3256 If extralen==0, this means that we will not need long jumps. */
3257 if (RExC_size >= 0x10000L && RExC_extralen)
3258 RExC_size += RExC_extralen;
3261 if (RExC_whilem_seen > 15)
3262 RExC_whilem_seen = 15;
3264 /* Allocate space and initialize. */
3265 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3268 FAIL("Regexp out of space");
3271 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3272 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3275 r->prelen = xend - exp;
3276 r->precomp = savepvn(RExC_precomp, r->prelen);
3278 #ifdef PERL_OLD_COPY_ON_WRITE
3279 r->saved_copy = NULL;
3281 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3282 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3283 r->lastparen = 0; /* mg.c reads this. */
3285 r->substrs = 0; /* Useful during FAIL. */
3286 r->startp = 0; /* Useful during FAIL. */
3287 r->endp = 0; /* Useful during FAIL. */
3289 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3291 r->offsets[0] = RExC_size;
3293 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3294 "%s %"UVuf" bytes for offset annotations.\n",
3295 r->offsets ? "Got" : "Couldn't get",
3296 (UV)((2*RExC_size+1) * sizeof(U32))));
3300 /* Second pass: emit code. */
3301 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3306 RExC_emit_start = r->program;
3307 RExC_emit = r->program;
3308 /* Store the count of eval-groups for security checks: */
3309 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3310 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3312 if (reg(pRExC_state, 0, &flags,1) == NULL)
3314 /* XXXX To minimize changes to RE engine we always allocate
3315 3-units-long substrs field. */
3316 Newx(r->substrs, 1, struct reg_substr_data);
3319 Zero(r->substrs, 1, struct reg_substr_data);
3320 StructCopy(&zero_scan_data, &data, scan_data_t);
3322 #ifdef TRIE_STUDY_OPT
3324 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3325 RExC_state=copyRExC_state;
3326 if (data.longest_fixed)
3327 SvREFCNT_dec(data.longest_fixed);
3328 if (data.longest_float)
3329 SvREFCNT_dec(data.longest_float);
3330 if (data.last_found)
3331 SvREFCNT_dec(data.last_found);
3333 copyRExC_state=RExC_state;
3336 /* Dig out information for optimizations. */
3337 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3338 pm->op_pmflags = RExC_flags;
3340 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3341 r->regstclass = NULL;
3342 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3343 r->reganch |= ROPT_NAUGHTY;
3344 scan = r->program + 1; /* First BRANCH. */
3346 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3347 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3349 STRLEN longest_float_length, longest_fixed_length;
3350 struct regnode_charclass_class ch_class; /* pointed to by data */
3352 I32 last_close = 0; /* pointed to by data */
3355 /* Skip introductions and multiplicators >= 1. */
3356 while ((OP(first) == OPEN && (sawopen = 1)) ||
3357 /* An OR of *one* alternative - should not happen now. */
3358 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3359 /* for now we can't handle lookbehind IFMATCH*/
3360 (OP(first) == IFMATCH && !first->flags) ||
3361 (OP(first) == PLUS) ||
3362 (OP(first) == MINMOD) ||
3363 /* An {n,m} with n>0 */
3364 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3366 DEBUG_PEEP("first:",first,0);
3367 if (OP(first) == PLUS)
3370 first += regarglen[OP(first)];
3371 if (OP(first) == IFMATCH) {
3372 first = NEXTOPER(first);
3373 first += EXTRA_STEP_2ARGS;
3374 } else /*xxx possible optimisation for /(?=)/*/
3375 first = NEXTOPER(first);
3378 /* Starting-point info. */
3380 /* Ignore EXACT as we deal with it later. */
3381 if (PL_regkind[OP(first)] == EXACT) {
3382 if (OP(first) == EXACT)
3383 NOOP; /* Empty, get anchored substr later. */
3384 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3385 r->regstclass = first;
3388 else if (OP(first) == TRIE &&
3389 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3391 /* this can happen only on restudy */
3392 struct regnode_1 *trie_op;
3393 Newxz(trie_op,1,struct regnode_1);
3394 StructCopy(first,trie_op,struct regnode_1);
3395 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3396 r->regstclass = (regnode *)trie_op;
3399 else if (strchr((const char*)PL_simple,OP(first)))
3400 r->regstclass = first;
3401 else if (PL_regkind[OP(first)] == BOUND ||
3402 PL_regkind[OP(first)] == NBOUND)
3403 r->regstclass = first;
3404 else if (PL_regkind[OP(first)] == BOL) {
3405 r->reganch |= (OP(first) == MBOL
3407 : (OP(first) == SBOL
3410 first = NEXTOPER(first);
3413 else if (OP(first) == GPOS) {
3414 r->reganch |= ROPT_ANCH_GPOS;
3415 first = NEXTOPER(first);
3418 else if (!sawopen && (OP(first) == STAR &&
3419 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3420 !(r->reganch & ROPT_ANCH) )
3422 /* turn .* into ^.* with an implied $*=1 */
3424 (OP(NEXTOPER(first)) == REG_ANY)
3427 r->reganch |= type | ROPT_IMPLICIT;
3428 first = NEXTOPER(first);
3431 if (sawplus && (!sawopen || !RExC_sawback)
3432 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3433 /* x+ must match at the 1st pos of run of x's */
3434 r->reganch |= ROPT_SKIP;
3436 /* Scan is after the zeroth branch, first is atomic matcher. */
3437 #ifdef TRIE_STUDY_OPT
3440 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3441 (IV)(first - scan + 1))
3445 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3446 (IV)(first - scan + 1))
3452 * If there's something expensive in the r.e., find the
3453 * longest literal string that must appear and make it the
3454 * regmust. Resolve ties in favor of later strings, since
3455 * the regstart check works with the beginning of the r.e.
3456 * and avoiding duplication strengthens checking. Not a
3457 * strong reason, but sufficient in the absence of others.
3458 * [Now we resolve ties in favor of the earlier string if
3459 * it happens that c_offset_min has been invalidated, since the
3460 * earlier string may buy us something the later one won't.]
3464 data.longest_fixed = newSVpvs("");
3465 data.longest_float = newSVpvs("");
3466 data.last_found = newSVpvs("");
3467 data.longest = &(data.longest_fixed);
3469 if (!r->regstclass) {
3470 cl_init(pRExC_state, &ch_class);
3471 data.start_class = &ch_class;
3472 stclass_flag = SCF_DO_STCLASS_AND;
3473 } else /* XXXX Check for BOUND? */
3475 data.last_closep = &last_close;
3477 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3478 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3480 #ifdef TRIE_STUDY_OPT
3481 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3486 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3487 && data.last_start_min == 0 && data.last_end > 0
3488 && !RExC_seen_zerolen
3489 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3490 r->reganch |= ROPT_CHECK_ALL;
3491 scan_commit(pRExC_state, &data);
3492 SvREFCNT_dec(data.last_found);
3494 longest_float_length = CHR_SVLEN(data.longest_float);
3495 if (longest_float_length
3496 || (data.flags & SF_FL_BEFORE_EOL
3497 && (!(data.flags & SF_FL_BEFORE_MEOL)
3498 || (RExC_flags & PMf_MULTILINE)))) {
3501 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3502 && data.offset_fixed == data.offset_float_min
3503 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3504 goto remove_float; /* As in (a)+. */
3506 if (SvUTF8(data.longest_float)) {
3507 r->float_utf8 = data.longest_float;
3508 r->float_substr = NULL;
3510 r->float_substr = data.longest_float;
3511 r->float_utf8 = NULL;
3513 r->float_min_offset = data.offset_float_min;
3514 r->float_max_offset = data.offset_float_max;
3515 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3516 && (!(data.flags & SF_FL_BEFORE_MEOL)
3517 || (RExC_flags & PMf_MULTILINE)));
3518 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3522 r->float_substr = r->float_utf8 = NULL;
3523 SvREFCNT_dec(data.longest_float);
3524 longest_float_length = 0;
3527 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3528 if (longest_fixed_length
3529 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3530 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3531 || (RExC_flags & PMf_MULTILINE)))) {
3534 if (SvUTF8(data.longest_fixed)) {
3535 r->anchored_utf8 = data.longest_fixed;
3536 r->anchored_substr = NULL;
3538 r->anchored_substr = data.longest_fixed;
3539 r->anchored_utf8 = NULL;
3541 r->anchored_offset = data.offset_fixed;
3542 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3543 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3544 || (RExC_flags & PMf_MULTILINE)));
3545 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3548 r->anchored_substr = r->anchored_utf8 = NULL;
3549 SvREFCNT_dec(data.longest_fixed);
3550 longest_fixed_length = 0;
3553 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3554 r->regstclass = NULL;
3555 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3557 && !(data.start_class->flags & ANYOF_EOS)
3558 && !cl_is_anything(data.start_class))
3560 const I32 n = add_data(pRExC_state, 1, "f");
3562 Newx(RExC_rx->data->data[n], 1,
3563 struct regnode_charclass_class);
3564 StructCopy(data.start_class,
3565 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3566 struct regnode_charclass_class);
3567 r->regstclass = (regnode*)RExC_rx->data->data[n];
3568 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3569 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3570 regprop(r, sv, (regnode*)data.start_class);
3571 PerlIO_printf(Perl_debug_log,
3572 "synthetic stclass \"%s\".\n",
3573 SvPVX_const(sv));});
3576 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3577 if (longest_fixed_length > longest_float_length) {
3578 r->check_substr = r->anchored_substr;
3579 r->check_utf8 = r->anchored_utf8;
3580 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3581 if (r->reganch & ROPT_ANCH_SINGLE)
3582 r->reganch |= ROPT_NOSCAN;
3585 r->check_substr = r->float_substr;
3586 r->check_utf8 = r->float_utf8;
3587 r->check_offset_min = data.offset_float_min;
3588 r->check_offset_max = data.offset_float_max;
3590 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3591 This should be changed ASAP! */
3592 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3593 r->reganch |= RE_USE_INTUIT;
3594 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3595 r->reganch |= RE_INTUIT_TAIL;
3599 /* Several toplevels. Best we can is to set minlen. */
3601 struct regnode_charclass_class ch_class;
3604 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3606 scan = r->program + 1;
3607 cl_init(pRExC_state, &ch_class);
3608 data.start_class = &ch_class;
3609 data.last_closep = &last_close;
3611 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3612 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3614 #ifdef TRIE_STUDY_OPT
3615 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3620 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3621 = r->float_substr = r->float_utf8 = NULL;
3622 if (!(data.start_class->flags & ANYOF_EOS)
3623 && !cl_is_anything(data.start_class))
3625 const I32 n = add_data(pRExC_state, 1, "f");
3627 Newx(RExC_rx->data->data[n], 1,
3628 struct regnode_charclass_class);
3629 StructCopy(data.start_class,
3630 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3631 struct regnode_charclass_class);
3632 r->regstclass = (regnode*)RExC_rx->data->data[n];
3633 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3634 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3635 regprop(r, sv, (regnode*)data.start_class);
3636 PerlIO_printf(Perl_debug_log,
3637 "synthetic stclass \"%s\".\n",
3638 SvPVX_const(sv));});
3643 if (RExC_seen & REG_SEEN_GPOS)
3644 r->reganch |= ROPT_GPOS_SEEN;
3645 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3646 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3647 if (RExC_seen & REG_SEEN_EVAL)
3648 r->reganch |= ROPT_EVAL_SEEN;
3649 if (RExC_seen & REG_SEEN_CANY)
3650 r->reganch |= ROPT_CANY_SEEN;
3651 Newxz(r->startp, RExC_npar, I32);
3652 Newxz(r->endp, RExC_npar, I32);
3654 DEBUG_r( RX_DEBUG_on(r) );
3656 PerlIO_printf(Perl_debug_log,"Final program:\n");
3663 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3664 int rem=(int)(RExC_end - RExC_parse); \
3673 if (RExC_lastparse!=RExC_parse) \
3674 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3677 iscut ? "..." : "<" \
3680 PerlIO_printf(Perl_debug_log,"%16s",""); \
3685 num=REG_NODE_NUM(RExC_emit); \
3686 if (RExC_lastnum!=num) \
3687 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3689 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3690 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3691 (int)((depth*2)), "", \
3695 RExC_lastparse=RExC_parse; \
3700 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3701 DEBUG_PARSE_MSG((funcname)); \
3702 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3705 - reg - regular expression, i.e. main body or parenthesized thing
3707 * Caller must absorb opening parenthesis.
3709 * Combining parenthesis handling with the base level of regular expression
3710 * is a trifle forced, but the need to tie the tails of the branches to what
3711 * follows makes it hard to avoid.
3713 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3715 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3717 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3721 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3722 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3725 register regnode *ret; /* Will be the head of the group. */
3726 register regnode *br;
3727 register regnode *lastbr;
3728 register regnode *ender = NULL;
3729 register I32 parno = 0;
3731 const I32 oregflags = RExC_flags;
3732 bool have_branch = 0;
3735 /* for (?g), (?gc), and (?o) warnings; warning
3736 about (?c) will warn about (?g) -- japhy */
3738 #define WASTED_O 0x01
3739 #define WASTED_G 0x02
3740 #define WASTED_C 0x04
3741 #define WASTED_GC (0x02|0x04)
3742 I32 wastedflags = 0x00;
3744 char * parse_start = RExC_parse; /* MJD */
3745 char * const oregcomp_parse = RExC_parse;
3747 GET_RE_DEBUG_FLAGS_DECL;
3748 DEBUG_PARSE("reg ");
3751 *flagp = 0; /* Tentatively. */
3754 /* Make an OPEN node, if parenthesized. */
3756 if (*RExC_parse == '?') { /* (?...) */
3757 U32 posflags = 0, negflags = 0;
3758 U32 *flagsp = &posflags;
3759 bool is_logical = 0;
3760 const char * const seqstart = RExC_parse;
3763 paren = *RExC_parse++;
3764 ret = NULL; /* For look-ahead/behind. */
3766 case '<': /* (?<...) */
3767 RExC_seen |= REG_SEEN_LOOKBEHIND;
3768 if (*RExC_parse == '!')
3770 if (*RExC_parse != '=' && *RExC_parse != '!')
3773 case '=': /* (?=...) */
3774 case '!': /* (?!...) */
3775 RExC_seen_zerolen++;
3776 case ':': /* (?:...) */
3777 case '>': /* (?>...) */
3779 case '$': /* (?$...) */
3780 case '@': /* (?@...) */
3781 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3783 case '#': /* (?#...) */
3784 while (*RExC_parse && *RExC_parse != ')')
3786 if (*RExC_parse != ')')
3787 FAIL("Sequence (?#... not terminated");
3788 nextchar(pRExC_state);
3791 case 'p': /* (?p...) */
3792 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3793 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3795 case '?': /* (??...) */
3797 if (*RExC_parse != '{')
3799 paren = *RExC_parse++;
3801 case '{': /* (?{...}) */
3803 I32 count = 1, n = 0;
3805 char *s = RExC_parse;
3807 RExC_seen_zerolen++;
3808 RExC_seen |= REG_SEEN_EVAL;
3809 while (count && (c = *RExC_parse)) {
3820 if (*RExC_parse != ')') {
3822 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3826 OP_4tree *sop, *rop;
3827 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3830 Perl_save_re_context(aTHX);
3831 rop = sv_compile_2op(sv, &sop, "re", &pad);
3832 sop->op_private |= OPpREFCOUNTED;
3833 /* re_dup will OpREFCNT_inc */
3834 OpREFCNT_set(sop, 1);
3837 n = add_data(pRExC_state, 3, "nop");
3838 RExC_rx->data->data[n] = (void*)rop;
3839 RExC_rx->data->data[n+1] = (void*)sop;
3840 RExC_rx->data->data[n+2] = (void*)pad;
3843 else { /* First pass */
3844 if (PL_reginterp_cnt < ++RExC_seen_evals
3846 /* No compiled RE interpolated, has runtime
3847 components ===> unsafe. */
3848 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3849 if (PL_tainting && PL_tainted)
3850 FAIL("Eval-group in insecure regular expression");
3851 #if PERL_VERSION > 8
3852 if (IN_PERL_COMPILETIME)
3857 nextchar(pRExC_state);
3859 ret = reg_node(pRExC_state, LOGICAL);
3862 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3863 /* deal with the length of this later - MJD */
3866 ret = reganode(pRExC_state, EVAL, n);
3867 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3868 Set_Node_Offset(ret, parse_start);
3871 case '(': /* (?(?{...})...) and (?(?=...)...) */
3873 if (RExC_parse[0] == '?') { /* (?(?...)) */
3874 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3875 || RExC_parse[1] == '<'
3876 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3879 ret = reg_node(pRExC_state, LOGICAL);
3882 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3886 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3889 parno = atoi(RExC_parse++);
3891 while (isDIGIT(*RExC_parse))
3893 ret = reganode(pRExC_state, GROUPP, parno);
3895 if ((c = *nextchar(pRExC_state)) != ')')
3896 vFAIL("Switch condition not recognized");
3898 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3899 br = regbranch(pRExC_state, &flags, 1,depth+1);
3901 br = reganode(pRExC_state, LONGJMP, 0);
3903 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3904 c = *nextchar(pRExC_state);
3908 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3909 regbranch(pRExC_state, &flags, 1,depth+1);
3910 REGTAIL(pRExC_state, ret, lastbr);
3913 c = *nextchar(pRExC_state);
3918 vFAIL("Switch (?(condition)... contains too many branches");
3919 ender = reg_node(pRExC_state, TAIL);
3920 REGTAIL(pRExC_state, br, ender);
3922 REGTAIL(pRExC_state, lastbr, ender);
3923 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3926 REGTAIL(pRExC_state, ret, ender);
3930 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3934 RExC_parse--; /* for vFAIL to print correctly */
3935 vFAIL("Sequence (? incomplete");
3939 parse_flags: /* (?i) */
3940 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3941 /* (?g), (?gc) and (?o) are useless here
3942 and must be globally applied -- japhy */
3944 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3945 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3946 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3947 if (! (wastedflags & wflagbit) ) {
3948 wastedflags |= wflagbit;
3951 "Useless (%s%c) - %suse /%c modifier",
3952 flagsp == &negflags ? "?-" : "?",
3954 flagsp == &negflags ? "don't " : "",
3960 else if (*RExC_parse == 'c') {
3961 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3962 if (! (wastedflags & WASTED_C) ) {
3963 wastedflags |= WASTED_GC;
3966 "Useless (%sc) - %suse /gc modifier",
3967 flagsp == &negflags ? "?-" : "?",
3968 flagsp == &negflags ? "don't " : ""
3973 else { pmflag(flagsp, *RExC_parse); }
3977 if (*RExC_parse == '-') {
3979 wastedflags = 0; /* reset so (?g-c) warns twice */
3983 RExC_flags |= posflags;
3984 RExC_flags &= ~negflags;
3985 if (*RExC_parse == ':') {
3991 if (*RExC_parse != ')') {
3993 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3995 nextchar(pRExC_state);
4003 ret = reganode(pRExC_state, OPEN, parno);
4004 Set_Node_Length(ret, 1); /* MJD */
4005 Set_Node_Offset(ret, RExC_parse); /* MJD */
4012 /* Pick up the branches, linking them together. */
4013 parse_start = RExC_parse; /* MJD */
4014 br = regbranch(pRExC_state, &flags, 1,depth+1);
4015 /* branch_len = (paren != 0); */
4019 if (*RExC_parse == '|') {
4020 if (!SIZE_ONLY && RExC_extralen) {
4021 reginsert(pRExC_state, BRANCHJ, br);
4024 reginsert(pRExC_state, BRANCH, br);
4025 Set_Node_Length(br, paren != 0);
4026 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4030 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4032 else if (paren == ':') {
4033 *flagp |= flags&SIMPLE;
4035 if (is_open) { /* Starts with OPEN. */
4036 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4038 else if (paren != '?') /* Not Conditional */
4040 *flagp |= flags & (SPSTART | HASWIDTH);
4042 while (*RExC_parse == '|') {
4043 if (!SIZE_ONLY && RExC_extralen) {
4044 ender = reganode(pRExC_state, LONGJMP,0);
4045 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4048 RExC_extralen += 2; /* Account for LONGJMP. */
4049 nextchar(pRExC_state);
4050 br = regbranch(pRExC_state, &flags, 0, depth+1);
4054 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4058 *flagp |= flags&SPSTART;
4061 if (have_branch || paren != ':') {
4062 /* Make a closing node, and hook it on the end. */
4065 ender = reg_node(pRExC_state, TAIL);
4068 ender = reganode(pRExC_state, CLOSE, parno);
4069 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4070 Set_Node_Length(ender,1); /* MJD */
4076 *flagp &= ~HASWIDTH;
4079 ender = reg_node(pRExC_state, SUCCEED);
4082 ender = reg_node(pRExC_state, END);
4085 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4087 if (have_branch && !SIZE_ONLY) {
4088 /* Hook the tails of the branches to the closing node. */
4089 for (br = ret; br; br = regnext(br)) {
4090 const U8 op = PL_regkind[OP(br)];
4092 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4094 else if (op == BRANCHJ) {
4095 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4103 static const char parens[] = "=!<,>";
4105 if (paren && (p = strchr(parens, paren))) {
4106 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4107 int flag = (p - parens) > 1;
4110 node = SUSPEND, flag = 0;
4111 reginsert(pRExC_state, node,ret);
4112 Set_Node_Cur_Length(ret);
4113 Set_Node_Offset(ret, parse_start + 1);
4115 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4119 /* Check for proper termination. */
4121 RExC_flags = oregflags;
4122 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4123 RExC_parse = oregcomp_parse;
4124 vFAIL("Unmatched (");
4127 else if (!paren && RExC_parse < RExC_end) {
4128 if (*RExC_parse == ')') {
4130 vFAIL("Unmatched )");
4133 FAIL("Junk on end of regexp"); /* "Can't happen". */
4141 - regbranch - one alternative of an | operator
4143 * Implements the concatenation operator.
4146 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4149 register regnode *ret;
4150 register regnode *chain = NULL;
4151 register regnode *latest;
4152 I32 flags = 0, c = 0;
4153 GET_RE_DEBUG_FLAGS_DECL;
4154 DEBUG_PARSE("brnc");
4158 if (!SIZE_ONLY && RExC_extralen)
4159 ret = reganode(pRExC_state, BRANCHJ,0);
4161 ret = reg_node(pRExC_state, BRANCH);
4162 Set_Node_Length(ret, 1);
4166 if (!first && SIZE_ONLY)
4167 RExC_extralen += 1; /* BRANCHJ */
4169 *flagp = WORST; /* Tentatively. */
4172 nextchar(pRExC_state);
4173 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4175 latest = regpiece(pRExC_state, &flags,depth+1);
4176 if (latest == NULL) {
4177 if (flags & TRYAGAIN)
4181 else if (ret == NULL)
4183 *flagp |= flags&HASWIDTH;
4184 if (chain == NULL) /* First piece. */
4185 *flagp |= flags&SPSTART;
4188 REGTAIL(pRExC_state, chain, latest);
4193 if (chain == NULL) { /* Loop ran zero times. */
4194 chain = reg_node(pRExC_state, NOTHING);
4199 *flagp |= flags&SIMPLE;
4206 - regpiece - something followed by possible [*+?]
4208 * Note that the branching code sequences used for ? and the general cases
4209 * of * and + are somewhat optimized: they use the same NOTHING node as
4210 * both the endmarker for their branch list and the body of the last branch.
4211 * It might seem that this node could be dispensed with entirely, but the
4212 * endmarker role is not redundant.
4215 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4218 register regnode *ret;
4220 register char *next;
4222 const char * const origparse = RExC_parse;
4224 I32 max = REG_INFTY;
4226 GET_RE_DEBUG_FLAGS_DECL;
4227 DEBUG_PARSE("piec");
4229 ret = regatom(pRExC_state, &flags,depth+1);
4231 if (flags & TRYAGAIN)
4238 if (op == '{' && regcurly(RExC_parse)) {
4239 const char *maxpos = NULL;
4240 parse_start = RExC_parse; /* MJD */
4241 next = RExC_parse + 1;
4242 while (isDIGIT(*next) || *next == ',') {
4251 if (*next == '}') { /* got one */
4255 min = atoi(RExC_parse);
4259 maxpos = RExC_parse;
4261 if (!max && *maxpos != '0')
4262 max = REG_INFTY; /* meaning "infinity" */
4263 else if (max >= REG_INFTY)
4264 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4266 nextchar(pRExC_state);
4269 if ((flags&SIMPLE)) {
4270 RExC_naughty += 2 + RExC_naughty / 2;
4271 reginsert(pRExC_state, CURLY, ret);
4272 Set_Node_Offset(ret, parse_start+1); /* MJD */
4273 Set_Node_Cur_Length(ret);
4276 regnode * const w = reg_node(pRExC_state, WHILEM);
4279 REGTAIL(pRExC_state, ret, w);
4280 if (!SIZE_ONLY && RExC_extralen) {
4281 reginsert(pRExC_state, LONGJMP,ret);
4282 reginsert(pRExC_state, NOTHING,ret);
4283 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4285 reginsert(pRExC_state, CURLYX,ret);
4287 Set_Node_Offset(ret, parse_start+1);
4288 Set_Node_Length(ret,
4289 op == '{' ? (RExC_parse - parse_start) : 1);
4291 if (!SIZE_ONLY && RExC_extralen)
4292 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4293 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4295 RExC_whilem_seen++, RExC_extralen += 3;
4296 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4304 if (max && max < min)
4305 vFAIL("Can't do {n,m} with n > m");
4307 ARG1_SET(ret, (U16)min);
4308 ARG2_SET(ret, (U16)max);
4320 #if 0 /* Now runtime fix should be reliable. */
4322 /* if this is reinstated, don't forget to put this back into perldiag:
4324 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4326 (F) The part of the regexp subject to either the * or + quantifier
4327 could match an empty string. The {#} shows in the regular
4328 expression about where the problem was discovered.
4332 if (!(flags&HASWIDTH) && op != '?')
4333 vFAIL("Regexp *+ operand could be empty");
4336 parse_start = RExC_parse;
4337 nextchar(pRExC_state);
4339 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4341 if (op == '*' && (flags&SIMPLE)) {
4342 reginsert(pRExC_state, STAR, ret);
4346 else if (op == '*') {
4350 else if (op == '+' && (flags&SIMPLE)) {
4351 reginsert(pRExC_state, PLUS, ret);
4355 else if (op == '+') {
4359 else if (op == '?') {
4364 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4366 "%.*s matches null string many times",
4367 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4371 if (*RExC_parse == '?') {
4372 nextchar(pRExC_state);
4373 reginsert(pRExC_state, MINMOD, ret);
4374 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4376 if (ISMULT2(RExC_parse)) {
4378 vFAIL("Nested quantifiers");
4385 - regatom - the lowest level
4387 * Optimization: gobbles an entire sequence of ordinary characters so that
4388 * it can turn them into a single node, which is smaller to store and
4389 * faster to run. Backslashed characters are exceptions, each becoming a
4390 * separate node; the code is simpler that way and it's not worth fixing.
4392 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4393 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4396 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4399 register regnode *ret = NULL;
4401 char *parse_start = RExC_parse;
4402 GET_RE_DEBUG_FLAGS_DECL;
4403 DEBUG_PARSE("atom");
4404 *flagp = WORST; /* Tentatively. */
4407 switch (*RExC_parse) {
4409 RExC_seen_zerolen++;
4410 nextchar(pRExC_state);
4411 if (RExC_flags & PMf_MULTILINE)
4412 ret = reg_node(pRExC_state, MBOL);
4413 else if (RExC_flags & PMf_SINGLELINE)
4414 ret = reg_node(pRExC_state, SBOL);
4416 ret = reg_node(pRExC_state, BOL);
4417 Set_Node_Length(ret, 1); /* MJD */
4420 nextchar(pRExC_state);
4422 RExC_seen_zerolen++;
4423 if (RExC_flags & PMf_MULTILINE)
4424 ret = reg_node(pRExC_state, MEOL);
4425 else if (RExC_flags & PMf_SINGLELINE)
4426 ret = reg_node(pRExC_state, SEOL);
4428 ret = reg_node(pRExC_state, EOL);
4429 Set_Node_Length(ret, 1); /* MJD */
4432 nextchar(pRExC_state);
4433 if (RExC_flags & PMf_SINGLELINE)
4434 ret = reg_node(pRExC_state, SANY);
4436 ret = reg_node(pRExC_state, REG_ANY);
4437 *flagp |= HASWIDTH|SIMPLE;
4439 Set_Node_Length(ret, 1); /* MJD */
4443 char * const oregcomp_parse = ++RExC_parse;
4444 ret = regclass(pRExC_state,depth+1);
4445 if (*RExC_parse != ']') {
4446 RExC_parse = oregcomp_parse;
4447 vFAIL("Unmatched [");
4449 nextchar(pRExC_state);
4450 *flagp |= HASWIDTH|SIMPLE;
4451 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4455 nextchar(pRExC_state);
4456 ret = reg(pRExC_state, 1, &flags,depth+1);
4458 if (flags & TRYAGAIN) {
4459 if (RExC_parse == RExC_end) {
4460 /* Make parent create an empty node if needed. */
4468 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4472 if (flags & TRYAGAIN) {
4476 vFAIL("Internal urp");
4477 /* Supposed to be caught earlier. */
4480 if (!regcurly(RExC_parse)) {
4489 vFAIL("Quantifier follows nothing");
4492 switch (*++RExC_parse) {
4494 RExC_seen_zerolen++;
4495 ret = reg_node(pRExC_state, SBOL);
4497 nextchar(pRExC_state);
4498 Set_Node_Length(ret, 2); /* MJD */
4501 ret = reg_node(pRExC_state, GPOS);
4502 RExC_seen |= REG_SEEN_GPOS;
4504 nextchar(pRExC_state);
4505 Set_Node_Length(ret, 2); /* MJD */
4508 ret = reg_node(pRExC_state, SEOL);
4510 RExC_seen_zerolen++; /* Do not optimize RE away */
4511 nextchar(pRExC_state);
4514 ret = reg_node(pRExC_state, EOS);
4516 RExC_seen_zerolen++; /* Do not optimize RE away */
4517 nextchar(pRExC_state);
4518 Set_Node_Length(ret, 2); /* MJD */
4521 ret = reg_node(pRExC_state, CANY);
4522 RExC_seen |= REG_SEEN_CANY;
4523 *flagp |= HASWIDTH|SIMPLE;
4524 nextchar(pRExC_state);
4525 Set_Node_Length(ret, 2); /* MJD */
4528 ret = reg_node(pRExC_state, CLUMP);
4530 nextchar(pRExC_state);
4531 Set_Node_Length(ret, 2); /* MJD */
4534 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4535 *flagp |= HASWIDTH|SIMPLE;
4536 nextchar(pRExC_state);
4537 Set_Node_Length(ret, 2); /* MJD */
4540 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4541 *flagp |= HASWIDTH|SIMPLE;
4542 nextchar(pRExC_state);
4543 Set_Node_Length(ret, 2); /* MJD */
4546 RExC_seen_zerolen++;
4547 RExC_seen |= REG_SEEN_LOOKBEHIND;
4548 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4550 nextchar(pRExC_state);
4551 Set_Node_Length(ret, 2); /* MJD */
4554 RExC_seen_zerolen++;
4555 RExC_seen |= REG_SEEN_LOOKBEHIND;
4556 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4558 nextchar(pRExC_state);
4559 Set_Node_Length(ret, 2); /* MJD */
4562 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4563 *flagp |= HASWIDTH|SIMPLE;
4564 nextchar(pRExC_state);
4565 Set_Node_Length(ret, 2); /* MJD */
4568 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4569 *flagp |= HASWIDTH|SIMPLE;
4570 nextchar(pRExC_state);
4571 Set_Node_Length(ret, 2); /* MJD */
4574 ret = reg_node(pRExC_state, DIGIT);
4575 *flagp |= HASWIDTH|SIMPLE;
4576 nextchar(pRExC_state);
4577 Set_Node_Length(ret, 2); /* MJD */
4580 ret = reg_node(pRExC_state, NDIGIT);
4581 *flagp |= HASWIDTH|SIMPLE;
4582 nextchar(pRExC_state);
4583 Set_Node_Length(ret, 2); /* MJD */
4588 char* const oldregxend = RExC_end;
4589 char* parse_start = RExC_parse - 2;
4591 if (RExC_parse[1] == '{') {
4592 /* a lovely hack--pretend we saw [\pX] instead */
4593 RExC_end = strchr(RExC_parse, '}');
4595 const U8 c = (U8)*RExC_parse;
4597 RExC_end = oldregxend;
4598 vFAIL2("Missing right brace on \\%c{}", c);
4603 RExC_end = RExC_parse + 2;
4604 if (RExC_end > oldregxend)
4605 RExC_end = oldregxend;
4609 ret = regclass(pRExC_state,depth+1);
4611 RExC_end = oldregxend;
4614 Set_Node_Offset(ret, parse_start + 2);
4615 Set_Node_Cur_Length(ret);
4616 nextchar(pRExC_state);
4617 *flagp |= HASWIDTH|SIMPLE;
4630 case '1': case '2': case '3': case '4':
4631 case '5': case '6': case '7': case '8': case '9':
4633 const I32 num = atoi(RExC_parse);
4635 if (num > 9 && num >= RExC_npar)
4638 char * const parse_start = RExC_parse - 1; /* MJD */
4639 while (isDIGIT(*RExC_parse))
4642 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4643 vFAIL("Reference to nonexistent group");
4645 ret = reganode(pRExC_state,
4646 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4650 /* override incorrect value set in reganode MJD */
4651 Set_Node_Offset(ret, parse_start+1);
4652 Set_Node_Cur_Length(ret); /* MJD */
4654 nextchar(pRExC_state);
4659 if (RExC_parse >= RExC_end)
4660 FAIL("Trailing \\");
4663 /* Do not generate "unrecognized" warnings here, we fall
4664 back into the quick-grab loop below */
4671 if (RExC_flags & PMf_EXTENDED) {
4672 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4674 if (RExC_parse < RExC_end)
4680 register STRLEN len;
4685 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4687 parse_start = RExC_parse - 1;
4693 ret = reg_node(pRExC_state,
4694 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4696 for (len = 0, p = RExC_parse - 1;
4697 len < 127 && p < RExC_end;
4700 char * const oldp = p;
4702 if (RExC_flags & PMf_EXTENDED)
4703 p = regwhite(p, RExC_end);
4750 ender = ASCII_TO_NATIVE('\033');
4754 ender = ASCII_TO_NATIVE('\007');
4759 char* const e = strchr(p, '}');
4763 vFAIL("Missing right brace on \\x{}");
4766 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4767 | PERL_SCAN_DISALLOW_PREFIX;
4768 STRLEN numlen = e - p - 1;
4769 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4776 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4778 ender = grok_hex(p, &numlen, &flags, NULL);
4784 ender = UCHARAT(p++);
4785 ender = toCTRL(ender);
4787 case '0': case '1': case '2': case '3':case '4':
4788 case '5': case '6': case '7': case '8':case '9':
4790 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4793 ender = grok_oct(p, &numlen, &flags, NULL);
4803 FAIL("Trailing \\");
4806 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4807 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4808 goto normal_default;
4813 if (UTF8_IS_START(*p) && UTF) {
4815 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4816 &numlen, UTF8_ALLOW_DEFAULT);
4823 if (RExC_flags & PMf_EXTENDED)
4824 p = regwhite(p, RExC_end);
4826 /* Prime the casefolded buffer. */
4827 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4829 if (ISMULT2(p)) { /* Back off on ?+*. */
4834 /* Emit all the Unicode characters. */
4836 for (foldbuf = tmpbuf;
4838 foldlen -= numlen) {
4839 ender = utf8_to_uvchr(foldbuf, &numlen);
4841 const STRLEN unilen = reguni(pRExC_state, ender, s);
4844 /* In EBCDIC the numlen
4845 * and unilen can differ. */
4847 if (numlen >= foldlen)
4851 break; /* "Can't happen." */
4855 const STRLEN unilen = reguni(pRExC_state, ender, s);
4864 REGC((char)ender, s++);
4870 /* Emit all the Unicode characters. */
4872 for (foldbuf = tmpbuf;
4874 foldlen -= numlen) {
4875 ender = utf8_to_uvchr(foldbuf, &numlen);
4877 const STRLEN unilen = reguni(pRExC_state, ender, s);
4880 /* In EBCDIC the numlen
4881 * and unilen can differ. */
4883 if (numlen >= foldlen)
4891 const STRLEN unilen = reguni(pRExC_state, ender, s);
4900 REGC((char)ender, s++);
4904 Set_Node_Cur_Length(ret); /* MJD */
4905 nextchar(pRExC_state);
4907 /* len is STRLEN which is unsigned, need to copy to signed */
4910 vFAIL("Internal disaster");
4914 if (len == 1 && UNI_IS_INVARIANT(ender))
4918 RExC_size += STR_SZ(len);
4921 RExC_emit += STR_SZ(len);
4927 /* If the encoding pragma is in effect recode the text of
4928 * any EXACT-kind nodes. */
4929 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4930 const STRLEN oldlen = STR_LEN(ret);
4931 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4935 if (sv_utf8_downgrade(sv, TRUE)) {
4936 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4937 const STRLEN newlen = SvCUR(sv);
4942 GET_RE_DEBUG_FLAGS_DECL;
4943 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4944 (int)oldlen, STRING(ret),
4946 Copy(s, STRING(ret), newlen, char);
4947 STR_LEN(ret) += newlen - oldlen;
4948 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4950 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4958 S_regwhite(char *p, const char *e)
4963 else if (*p == '#') {
4966 } while (p < e && *p != '\n');
4974 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4975 Character classes ([:foo:]) can also be negated ([:^foo:]).
4976 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4977 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4978 but trigger failures because they are currently unimplemented. */
4980 #define POSIXCC_DONE(c) ((c) == ':')
4981 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4982 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4985 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4988 I32 namedclass = OOB_NAMEDCLASS;
4990 if (value == '[' && RExC_parse + 1 < RExC_end &&
4991 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4992 POSIXCC(UCHARAT(RExC_parse))) {
4993 const char c = UCHARAT(RExC_parse);
4994 char* const s = RExC_parse++;
4996 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4998 if (RExC_parse == RExC_end)
4999 /* Grandfather lone [:, [=, [. */
5002 const char* const t = RExC_parse++; /* skip over the c */
5005 if (UCHARAT(RExC_parse) == ']') {
5006 const char *posixcc = s + 1;
5007 RExC_parse++; /* skip over the ending ] */
5010 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5011 const I32 skip = t - posixcc;
5013 /* Initially switch on the length of the name. */
5016 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5017 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5020 /* Names all of length 5. */
5021 /* alnum alpha ascii blank cntrl digit graph lower
5022 print punct space upper */
5023 /* Offset 4 gives the best switch position. */
5024 switch (posixcc[4]) {
5026 if (memEQ(posixcc, "alph", 4)) /* alpha */
5027 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5030 if (memEQ(posixcc, "spac", 4)) /* space */
5031 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5034 if (memEQ(posixcc, "grap", 4)) /* graph */
5035 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5038 if (memEQ(posixcc, "asci", 4)) /* ascii */
5039 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5042 if (memEQ(posixcc, "blan", 4)) /* blank */
5043 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5046 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5047 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5050 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5051 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5054 if (memEQ(posixcc, "lowe", 4)) /* lower */
5055 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5056 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5057 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5060 if (memEQ(posixcc, "digi", 4)) /* digit */
5061 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5062 else if (memEQ(posixcc, "prin", 4)) /* print */
5063 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5064 else if (memEQ(posixcc, "punc", 4)) /* punct */
5065 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5070 if (memEQ(posixcc, "xdigit", 6))
5071 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5075 if (namedclass == OOB_NAMEDCLASS)
5076 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5078 assert (posixcc[skip] == ':');
5079 assert (posixcc[skip+1] == ']');
5080 } else if (!SIZE_ONLY) {
5081 /* [[=foo=]] and [[.foo.]] are still future. */
5083 /* adjust RExC_parse so the warning shows after
5085 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5087 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5090 /* Maternal grandfather:
5091 * "[:" ending in ":" but not in ":]" */
5101 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5104 if (POSIXCC(UCHARAT(RExC_parse))) {
5105 const char *s = RExC_parse;
5106 const char c = *s++;
5110 if (*s && c == *s && s[1] == ']') {
5111 if (ckWARN(WARN_REGEXP))
5113 "POSIX syntax [%c %c] belongs inside character classes",
5116 /* [[=foo=]] and [[.foo.]] are still future. */
5117 if (POSIXCC_NOTYET(c)) {
5118 /* adjust RExC_parse so the error shows after
5120 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5122 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5130 parse a class specification and produce either an ANYOF node that
5131 matches the pattern. If the pattern matches a single char only and
5132 that char is < 256 then we produce an EXACT node instead.
5135 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5139 register UV nextvalue;
5140 register IV prevvalue = OOB_UNICODE;
5141 register IV range = 0;
5142 register regnode *ret;
5145 char *rangebegin = NULL;
5146 bool need_class = 0;
5149 bool optimize_invert = TRUE;
5150 AV* unicode_alternate = NULL;
5152 UV literal_endpoint = 0;
5154 UV stored = 0; /* number of chars stored in the class */
5156 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5157 case we need to change the emitted regop to an EXACT. */
5158 const char * orig_parse = RExC_parse;
5159 GET_RE_DEBUG_FLAGS_DECL;
5160 DEBUG_PARSE("clas");
5162 /* Assume we are going to generate an ANYOF node. */
5163 ret = reganode(pRExC_state, ANYOF, 0);
5166 ANYOF_FLAGS(ret) = 0;
5168 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5172 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5176 RExC_size += ANYOF_SKIP;
5177 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5180 RExC_emit += ANYOF_SKIP;
5182 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5184 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5185 ANYOF_BITMAP_ZERO(ret);
5186 listsv = newSVpvs("# comment\n");
5189 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5191 if (!SIZE_ONLY && POSIXCC(nextvalue))
5192 checkposixcc(pRExC_state);
5194 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5195 if (UCHARAT(RExC_parse) == ']')
5198 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5202 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5205 rangebegin = RExC_parse;
5207 value = utf8n_to_uvchr((U8*)RExC_parse,
5208 RExC_end - RExC_parse,
5209 &numlen, UTF8_ALLOW_DEFAULT);
5210 RExC_parse += numlen;
5213 value = UCHARAT(RExC_parse++);
5215 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5216 if (value == '[' && POSIXCC(nextvalue))
5217 namedclass = regpposixcc(pRExC_state, value);
5218 else if (value == '\\') {
5220 value = utf8n_to_uvchr((U8*)RExC_parse,
5221 RExC_end - RExC_parse,
5222 &numlen, UTF8_ALLOW_DEFAULT);
5223 RExC_parse += numlen;
5226 value = UCHARAT(RExC_parse++);
5227 /* Some compilers cannot handle switching on 64-bit integer
5228 * values, therefore value cannot be an UV. Yes, this will
5229 * be a problem later if we want switch on Unicode.
5230 * A similar issue a little bit later when switching on
5231 * namedclass. --jhi */
5232 switch ((I32)value) {
5233 case 'w': namedclass = ANYOF_ALNUM; break;
5234 case 'W': namedclass = ANYOF_NALNUM; break;
5235 case 's': namedclass = ANYOF_SPACE; break;
5236 case 'S': namedclass = ANYOF_NSPACE; break;
5237 case 'd': namedclass = ANYOF_DIGIT; break;
5238 case 'D': namedclass = ANYOF_NDIGIT; break;
5243 if (RExC_parse >= RExC_end)
5244 vFAIL2("Empty \\%c{}", (U8)value);
5245 if (*RExC_parse == '{') {
5246 const U8 c = (U8)value;
5247 e = strchr(RExC_parse++, '}');
5249 vFAIL2("Missing right brace on \\%c{}", c);
5250 while (isSPACE(UCHARAT(RExC_parse)))
5252 if (e == RExC_parse)
5253 vFAIL2("Empty \\%c{}", c);
5255 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5263 if (UCHARAT(RExC_parse) == '^') {
5266 value = value == 'p' ? 'P' : 'p'; /* toggle */
5267 while (isSPACE(UCHARAT(RExC_parse))) {
5272 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5273 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5276 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5277 namedclass = ANYOF_MAX; /* no official name, but it's named */
5280 case 'n': value = '\n'; break;
5281 case 'r': value = '\r'; break;
5282 case 't': value = '\t'; break;
5283 case 'f': value = '\f'; break;
5284 case 'b': value = '\b'; break;
5285 case 'e': value = ASCII_TO_NATIVE('\033');break;
5286 case 'a': value = ASCII_TO_NATIVE('\007');break;
5288 if (*RExC_parse == '{') {
5289 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5290 | PERL_SCAN_DISALLOW_PREFIX;
5291 char * const e = strchr(RExC_parse++, '}');
5293 vFAIL("Missing right brace on \\x{}");
5295 numlen = e - RExC_parse;
5296 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5300 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5302 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5303 RExC_parse += numlen;
5307 value = UCHARAT(RExC_parse++);
5308 value = toCTRL(value);
5310 case '0': case '1': case '2': case '3': case '4':
5311 case '5': case '6': case '7': case '8': case '9':
5315 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5316 RExC_parse += numlen;
5320 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5322 "Unrecognized escape \\%c in character class passed through",
5326 } /* end of \blah */
5332 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5334 if (!SIZE_ONLY && !need_class)
5335 ANYOF_CLASS_ZERO(ret);
5339 /* a bad range like a-\d, a-[:digit:] ? */
5342 if (ckWARN(WARN_REGEXP)) {
5344 RExC_parse >= rangebegin ?
5345 RExC_parse - rangebegin : 0;
5347 "False [] range \"%*.*s\"",
5350 if (prevvalue < 256) {
5351 ANYOF_BITMAP_SET(ret, prevvalue);
5352 ANYOF_BITMAP_SET(ret, '-');
5355 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5356 Perl_sv_catpvf(aTHX_ listsv,
5357 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5361 range = 0; /* this was not a true range */
5365 const char *what = NULL;
5368 if (namedclass > OOB_NAMEDCLASS)
5369 optimize_invert = FALSE;
5370 /* Possible truncation here but in some 64-bit environments
5371 * the compiler gets heartburn about switch on 64-bit values.
5372 * A similar issue a little earlier when switching on value.
5374 switch ((I32)namedclass) {
5377 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5379 for (value = 0; value < 256; value++)
5381 ANYOF_BITMAP_SET(ret, value);
5388 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5390 for (value = 0; value < 256; value++)
5391 if (!isALNUM(value))
5392 ANYOF_BITMAP_SET(ret, value);
5399 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5401 for (value = 0; value < 256; value++)
5402 if (isALNUMC(value))
5403 ANYOF_BITMAP_SET(ret, value);
5410 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5412 for (value = 0; value < 256; value++)
5413 if (!isALNUMC(value))
5414 ANYOF_BITMAP_SET(ret, value);
5421 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5423 for (value = 0; value < 256; value++)
5425 ANYOF_BITMAP_SET(ret, value);
5432 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5434 for (value = 0; value < 256; value++)
5435 if (!isALPHA(value))
5436 ANYOF_BITMAP_SET(ret, value);
5443 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5446 for (value = 0; value < 128; value++)
5447 ANYOF_BITMAP_SET(ret, value);
5449 for (value = 0; value < 256; value++) {
5451 ANYOF_BITMAP_SET(ret, value);
5460 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5463 for (value = 128; value < 256; value++)
5464 ANYOF_BITMAP_SET(ret, value);
5466 for (value = 0; value < 256; value++) {
5467 if (!isASCII(value))
5468 ANYOF_BITMAP_SET(ret, value);
5477 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5479 for (value = 0; value < 256; value++)
5481 ANYOF_BITMAP_SET(ret, value);
5488 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5490 for (value = 0; value < 256; value++)
5491 if (!isBLANK(value))
5492 ANYOF_BITMAP_SET(ret, value);
5499 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5501 for (value = 0; value < 256; value++)
5503 ANYOF_BITMAP_SET(ret, value);
5510 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5512 for (value = 0; value < 256; value++)
5513 if (!isCNTRL(value))
5514 ANYOF_BITMAP_SET(ret, value);
5521 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5523 /* consecutive digits assumed */
5524 for (value = '0'; value <= '9'; value++)
5525 ANYOF_BITMAP_SET(ret, value);
5532 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5534 /* consecutive digits assumed */
5535 for (value = 0; value < '0'; value++)
5536 ANYOF_BITMAP_SET(ret, value);
5537 for (value = '9' + 1; value < 256; value++)
5538 ANYOF_BITMAP_SET(ret, value);
5545 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5547 for (value = 0; value < 256; value++)
5549 ANYOF_BITMAP_SET(ret, value);
5556 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5558 for (value = 0; value < 256; value++)
5559 if (!isGRAPH(value))
5560 ANYOF_BITMAP_SET(ret, value);
5567 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5569 for (value = 0; value < 256; value++)
5571 ANYOF_BITMAP_SET(ret, value);
5578 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5580 for (value = 0; value < 256; value++)
5581 if (!isLOWER(value))
5582 ANYOF_BITMAP_SET(ret, value);
5589 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5591 for (value = 0; value < 256; value++)
5593 ANYOF_BITMAP_SET(ret, value);
5600 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5602 for (value = 0; value < 256; value++)
5603 if (!isPRINT(value))
5604 ANYOF_BITMAP_SET(ret, value);
5611 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5613 for (value = 0; value < 256; value++)
5614 if (isPSXSPC(value))
5615 ANYOF_BITMAP_SET(ret, value);
5622 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5624 for (value = 0; value < 256; value++)
5625 if (!isPSXSPC(value))
5626 ANYOF_BITMAP_SET(ret, value);
5633 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5635 for (value = 0; value < 256; value++)
5637 ANYOF_BITMAP_SET(ret, value);
5644 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5646 for (value = 0; value < 256; value++)
5647 if (!isPUNCT(value))
5648 ANYOF_BITMAP_SET(ret, value);
5655 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5657 for (value = 0; value < 256; value++)
5659 ANYOF_BITMAP_SET(ret, value);
5666 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5668 for (value = 0; value < 256; value++)
5669 if (!isSPACE(value))
5670 ANYOF_BITMAP_SET(ret, value);
5677 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5679 for (value = 0; value < 256; value++)
5681 ANYOF_BITMAP_SET(ret, value);
5688 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5690 for (value = 0; value < 256; value++)
5691 if (!isUPPER(value))
5692 ANYOF_BITMAP_SET(ret, value);
5699 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5701 for (value = 0; value < 256; value++)
5702 if (isXDIGIT(value))
5703 ANYOF_BITMAP_SET(ret, value);
5710 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5712 for (value = 0; value < 256; value++)
5713 if (!isXDIGIT(value))
5714 ANYOF_BITMAP_SET(ret, value);
5720 /* this is to handle \p and \P */
5723 vFAIL("Invalid [::] class");
5727 /* Strings such as "+utf8::isWord\n" */
5728 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5731 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5734 } /* end of namedclass \blah */
5737 if (prevvalue > (IV)value) /* b-a */ {
5738 const int w = RExC_parse - rangebegin;
5739 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5740 range = 0; /* not a valid range */
5744 prevvalue = value; /* save the beginning of the range */
5745 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5746 RExC_parse[1] != ']') {
5749 /* a bad range like \w-, [:word:]- ? */
5750 if (namedclass > OOB_NAMEDCLASS) {
5751 if (ckWARN(WARN_REGEXP)) {
5753 RExC_parse >= rangebegin ?
5754 RExC_parse - rangebegin : 0;
5756 "False [] range \"%*.*s\"",
5760 ANYOF_BITMAP_SET(ret, '-');
5762 range = 1; /* yeah, it's a range! */
5763 continue; /* but do it the next time */
5767 /* now is the next time */
5768 /*stored += (value - prevvalue + 1);*/
5770 if (prevvalue < 256) {
5771 const IV ceilvalue = value < 256 ? value : 255;
5774 /* In EBCDIC [\x89-\x91] should include
5775 * the \x8e but [i-j] should not. */
5776 if (literal_endpoint == 2 &&
5777 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5778 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5780 if (isLOWER(prevvalue)) {
5781 for (i = prevvalue; i <= ceilvalue; i++)
5783 ANYOF_BITMAP_SET(ret, i);
5785 for (i = prevvalue; i <= ceilvalue; i++)
5787 ANYOF_BITMAP_SET(ret, i);
5792 for (i = prevvalue; i <= ceilvalue; i++) {
5793 if (!ANYOF_BITMAP_TEST(ret,i)) {
5795 ANYOF_BITMAP_SET(ret, i);
5799 if (value > 255 || UTF) {
5800 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5801 const UV natvalue = NATIVE_TO_UNI(value);
5802 stored+=2; /* can't optimize this class */
5803 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5804 if (prevnatvalue < natvalue) { /* what about > ? */
5805 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5806 prevnatvalue, natvalue);
5808 else if (prevnatvalue == natvalue) {
5809 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5811 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5813 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5815 /* If folding and foldable and a single
5816 * character, insert also the folded version
5817 * to the charclass. */
5819 if (foldlen == (STRLEN)UNISKIP(f))
5820 Perl_sv_catpvf(aTHX_ listsv,
5823 /* Any multicharacter foldings
5824 * require the following transform:
5825 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5826 * where E folds into "pq" and F folds
5827 * into "rst", all other characters
5828 * fold to single characters. We save
5829 * away these multicharacter foldings,
5830 * to be later saved as part of the
5831 * additional "s" data. */
5834 if (!unicode_alternate)
5835 unicode_alternate = newAV();
5836 sv = newSVpvn((char*)foldbuf, foldlen);
5838 av_push(unicode_alternate, sv);
5842 /* If folding and the value is one of the Greek
5843 * sigmas insert a few more sigmas to make the
5844 * folding rules of the sigmas to work right.
5845 * Note that not all the possible combinations
5846 * are handled here: some of them are handled
5847 * by the standard folding rules, and some of
5848 * them (literal or EXACTF cases) are handled
5849 * during runtime in regexec.c:S_find_byclass(). */
5850 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5851 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5852 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5853 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5854 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5856 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5857 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5858 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5863 literal_endpoint = 0;
5867 range = 0; /* this range (if it was one) is done now */
5871 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5873 RExC_size += ANYOF_CLASS_ADD_SKIP;
5875 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5881 /****** !SIZE_ONLY AFTER HERE *********/
5883 if( stored == 1 && value < 256
5884 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5886 /* optimize single char class to an EXACT node
5887 but *only* when its not a UTF/high char */
5888 const char * cur_parse= RExC_parse;
5889 RExC_emit = (regnode *)orig_emit;
5890 RExC_parse = (char *)orig_parse;
5891 ret = reg_node(pRExC_state,
5892 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5893 RExC_parse = (char *)cur_parse;
5894 *STRING(ret)= (char)value;
5896 RExC_emit += STR_SZ(1);
5899 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5900 if ( /* If the only flag is folding (plus possibly inversion). */
5901 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5903 for (value = 0; value < 256; ++value) {
5904 if (ANYOF_BITMAP_TEST(ret, value)) {
5905 UV fold = PL_fold[value];
5908 ANYOF_BITMAP_SET(ret, fold);
5911 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5914 /* optimize inverted simple patterns (e.g. [^a-z]) */
5915 if (optimize_invert &&
5916 /* If the only flag is inversion. */
5917 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5918 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5919 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5920 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5923 AV * const av = newAV();
5925 /* The 0th element stores the character class description
5926 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5927 * to initialize the appropriate swash (which gets stored in
5928 * the 1st element), and also useful for dumping the regnode.
5929 * The 2nd element stores the multicharacter foldings,
5930 * used later (regexec.c:S_reginclass()). */
5931 av_store(av, 0, listsv);
5932 av_store(av, 1, NULL);
5933 av_store(av, 2, (SV*)unicode_alternate);
5934 rv = newRV_noinc((SV*)av);
5935 n = add_data(pRExC_state, 1, "s");
5936 RExC_rx->data->data[n] = (void*)rv;
5943 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5945 char* const retval = RExC_parse++;
5948 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5949 RExC_parse[2] == '#') {
5950 while (*RExC_parse != ')') {
5951 if (RExC_parse == RExC_end)
5952 FAIL("Sequence (?#... not terminated");
5958 if (RExC_flags & PMf_EXTENDED) {
5959 if (isSPACE(*RExC_parse)) {
5963 else if (*RExC_parse == '#') {
5964 while (RExC_parse < RExC_end)
5965 if (*RExC_parse++ == '\n') break;
5974 - reg_node - emit a node
5976 STATIC regnode * /* Location. */
5977 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5980 register regnode *ptr;
5981 regnode * const ret = RExC_emit;
5982 GET_RE_DEBUG_FLAGS_DECL;
5985 SIZE_ALIGN(RExC_size);
5989 NODE_ALIGN_FILL(ret);
5991 FILL_ADVANCE_NODE(ptr, op);
5992 if (RExC_offsets) { /* MJD */
5993 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
5994 "reg_node", __LINE__,
5996 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
5997 ? "Overwriting end of array!\n" : "OK",
5998 (UV)(RExC_emit - RExC_emit_start),
5999 (UV)(RExC_parse - RExC_start),
6000 (UV)RExC_offsets[0]));
6001 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6010 - reganode - emit a node with an argument
6012 STATIC regnode * /* Location. */
6013 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6016 register regnode *ptr;
6017 regnode * const ret = RExC_emit;
6018 GET_RE_DEBUG_FLAGS_DECL;
6021 SIZE_ALIGN(RExC_size);
6026 NODE_ALIGN_FILL(ret);
6028 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6029 if (RExC_offsets) { /* MJD */
6030 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6034 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6035 "Overwriting end of array!\n" : "OK",
6036 (UV)(RExC_emit - RExC_emit_start),
6037 (UV)(RExC_parse - RExC_start),
6038 (UV)RExC_offsets[0]));
6039 Set_Cur_Node_Offset;
6048 - reguni - emit (if appropriate) a Unicode character
6051 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6054 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6058 - reginsert - insert an operator in front of already-emitted operand
6060 * Means relocating the operand.
6063 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6066 register regnode *src;
6067 register regnode *dst;
6068 register regnode *place;
6069 const int offset = regarglen[(U8)op];
6070 GET_RE_DEBUG_FLAGS_DECL;
6071 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6074 RExC_size += NODE_STEP_REGNODE + offset;
6079 RExC_emit += NODE_STEP_REGNODE + offset;
6081 while (src > opnd) {
6082 StructCopy(--src, --dst, regnode);
6083 if (RExC_offsets) { /* MJD 20010112 */
6084 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6088 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6089 ? "Overwriting end of array!\n" : "OK",
6090 (UV)(src - RExC_emit_start),
6091 (UV)(dst - RExC_emit_start),
6092 (UV)RExC_offsets[0]));
6093 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6094 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6099 place = opnd; /* Op node, where operand used to be. */
6100 if (RExC_offsets) { /* MJD */
6101 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6105 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6106 ? "Overwriting end of array!\n" : "OK",
6107 (UV)(place - RExC_emit_start),
6108 (UV)(RExC_parse - RExC_start),
6110 Set_Node_Offset(place, RExC_parse);
6111 Set_Node_Length(place, 1);
6113 src = NEXTOPER(place);
6114 FILL_ADVANCE_NODE(place, op);
6115 Zero(src, offset, regnode);
6119 - regtail - set the next-pointer at the end of a node chain of p to val.
6120 - SEE ALSO: regtail_study
6122 /* TODO: All three parms should be const */
6124 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6127 register regnode *scan;
6128 GET_RE_DEBUG_FLAGS_DECL;
6133 /* Find last node. */
6136 regnode * const temp = regnext(scan);
6138 SV * const mysv=sv_newmortal();
6139 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6140 regprop(RExC_rx, mysv, scan);
6141 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6142 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6149 if (reg_off_by_arg[OP(scan)]) {
6150 ARG_SET(scan, val - scan);
6153 NEXT_OFF(scan) = val - scan;
6159 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6160 - Look for optimizable sequences at the same time.
6161 - currently only looks for EXACT chains.
6163 This is expermental code. The idea is to use this routine to perform
6164 in place optimizations on branches and groups as they are constructed,
6165 with the long term intention of removing optimization from study_chunk so
6166 that it is purely analytical.
6168 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6169 to control which is which.
6172 /* TODO: All four parms should be const */
6175 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6178 register regnode *scan;
6180 #ifdef EXPERIMENTAL_INPLACESCAN
6184 GET_RE_DEBUG_FLAGS_DECL;
6190 /* Find last node. */
6194 regnode * const temp = regnext(scan);
6195 #ifdef EXPERIMENTAL_INPLACESCAN
6196 if (PL_regkind[OP(scan)] == EXACT)
6197 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6205 if( exact == PSEUDO )
6207 else if ( exact != OP(scan) )
6216 SV * const mysv=sv_newmortal();
6217 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6218 regprop(RExC_rx, mysv, scan);
6219 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6220 SvPV_nolen_const(mysv),
6222 REG_NODE_NUM(scan));
6229 SV * const mysv_val=sv_newmortal();
6230 DEBUG_PARSE_MSG("");
6231 regprop(RExC_rx, mysv_val, val);
6232 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6233 SvPV_nolen_const(mysv_val),
6238 if (reg_off_by_arg[OP(scan)]) {
6239 ARG_SET(scan, val - scan);
6242 NEXT_OFF(scan) = val - scan;
6250 - regcurly - a little FSA that accepts {\d+,?\d*}
6253 S_regcurly(register const char *s)
6272 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6275 Perl_regdump(pTHX_ const regexp *r)
6279 SV * const sv = sv_newmortal();
6281 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6283 /* Header fields of interest. */
6284 if (r->anchored_substr)
6285 PerlIO_printf(Perl_debug_log,
6286 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
6288 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
6289 SvPVX_const(r->anchored_substr),
6291 SvTAIL(r->anchored_substr) ? "$" : "",
6292 (IV)r->anchored_offset);
6293 else if (r->anchored_utf8)
6294 PerlIO_printf(Perl_debug_log,
6295 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
6297 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
6298 SvPVX_const(r->anchored_utf8),
6300 SvTAIL(r->anchored_utf8) ? "$" : "",
6301 (IV)r->anchored_offset);
6302 if (r->float_substr)
6303 PerlIO_printf(Perl_debug_log,
6304 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6306 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6307 SvPVX_const(r->float_substr),
6309 SvTAIL(r->float_substr) ? "$" : "",
6310 (IV)r->float_min_offset, (UV)r->float_max_offset);
6311 else if (r->float_utf8)
6312 PerlIO_printf(Perl_debug_log,
6313 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6315 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6316 SvPVX_const(r->float_utf8),
6318 SvTAIL(r->float_utf8) ? "$" : "",
6319 (IV)r->float_min_offset, (UV)r->float_max_offset);
6320 if (r->check_substr || r->check_utf8)
6321 PerlIO_printf(Perl_debug_log,
6322 r->check_substr == r->float_substr
6323 && r->check_utf8 == r->float_utf8
6324 ? "(checking floating" : "(checking anchored");
6325 if (r->reganch & ROPT_NOSCAN)
6326 PerlIO_printf(Perl_debug_log, " noscan");
6327 if (r->reganch & ROPT_CHECK_ALL)
6328 PerlIO_printf(Perl_debug_log, " isall");
6329 if (r->check_substr || r->check_utf8)
6330 PerlIO_printf(Perl_debug_log, ") ");
6332 if (r->regstclass) {
6333 regprop(r, sv, r->regstclass);
6334 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6336 if (r->reganch & ROPT_ANCH) {
6337 PerlIO_printf(Perl_debug_log, "anchored");
6338 if (r->reganch & ROPT_ANCH_BOL)
6339 PerlIO_printf(Perl_debug_log, "(BOL)");
6340 if (r->reganch & ROPT_ANCH_MBOL)
6341 PerlIO_printf(Perl_debug_log, "(MBOL)");
6342 if (r->reganch & ROPT_ANCH_SBOL)
6343 PerlIO_printf(Perl_debug_log, "(SBOL)");
6344 if (r->reganch & ROPT_ANCH_GPOS)
6345 PerlIO_printf(Perl_debug_log, "(GPOS)");
6346 PerlIO_putc(Perl_debug_log, ' ');
6348 if (r->reganch & ROPT_GPOS_SEEN)
6349 PerlIO_printf(Perl_debug_log, "GPOS ");
6350 if (r->reganch & ROPT_SKIP)
6351 PerlIO_printf(Perl_debug_log, "plus ");
6352 if (r->reganch & ROPT_IMPLICIT)
6353 PerlIO_printf(Perl_debug_log, "implicit ");
6354 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6355 if (r->reganch & ROPT_EVAL_SEEN)
6356 PerlIO_printf(Perl_debug_log, "with eval ");
6357 PerlIO_printf(Perl_debug_log, "\n");
6359 const U32 len = r->offsets[0];
6360 GET_RE_DEBUG_FLAGS_DECL;
6363 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
6364 for (i = 1; i <= len; i++) {
6365 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6366 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
6368 PerlIO_printf(Perl_debug_log, "\n");
6372 PERL_UNUSED_CONTEXT;
6374 #endif /* DEBUGGING */
6378 - regprop - printable representation of opcode
6381 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6387 sv_setpvn(sv, "", 0);
6388 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6389 /* It would be nice to FAIL() here, but this may be called from
6390 regexec.c, and it would be hard to supply pRExC_state. */
6391 Perl_croak(aTHX_ "Corrupted regexp opcode");
6392 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6394 k = PL_regkind[OP(o)];
6397 SV * const dsv = sv_2mortal(newSVpvs(""));
6398 /* Using is_utf8_string() is a crude hack but it may
6399 * be the best for now since we have no flag "this EXACTish
6400 * node was UTF-8" --jhi */
6401 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6402 const char * const s = do_utf8 ?
6403 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6404 UNI_DISPLAY_REGEX) :
6406 const int len = do_utf8 ?
6409 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6413 } else if (k == TRIE) {
6414 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6415 /* print the details of the trie in dumpuntil instead, as
6416 * prog->data isn't available here */
6417 } else if (k == CURLY) {
6418 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6419 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6420 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6422 else if (k == WHILEM && o->flags) /* Ordinal/of */
6423 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6424 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6425 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6426 else if (k == LOGICAL)
6427 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6428 else if (k == ANYOF) {
6429 int i, rangestart = -1;
6430 const U8 flags = ANYOF_FLAGS(o);
6432 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6433 static const char * const anyofs[] = {
6466 if (flags & ANYOF_LOCALE)
6467 sv_catpvs(sv, "{loc}");
6468 if (flags & ANYOF_FOLD)
6469 sv_catpvs(sv, "{i}");
6470 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6471 if (flags & ANYOF_INVERT)
6473 for (i = 0; i <= 256; i++) {
6474 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6475 if (rangestart == -1)
6477 } else if (rangestart != -1) {
6478 if (i <= rangestart + 3)
6479 for (; rangestart < i; rangestart++)
6480 put_byte(sv, rangestart);
6482 put_byte(sv, rangestart);
6484 put_byte(sv, i - 1);
6490 if (o->flags & ANYOF_CLASS)
6491 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6492 if (ANYOF_CLASS_TEST(o,i))
6493 sv_catpv(sv, anyofs[i]);
6495 if (flags & ANYOF_UNICODE)
6496 sv_catpvs(sv, "{unicode}");
6497 else if (flags & ANYOF_UNICODE_ALL)
6498 sv_catpvs(sv, "{unicode_all}");
6502 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6506 U8 s[UTF8_MAXBYTES_CASE+1];
6508 for (i = 0; i <= 256; i++) { /* just the first 256 */
6509 uvchr_to_utf8(s, i);
6511 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6512 if (rangestart == -1)
6514 } else if (rangestart != -1) {
6515 if (i <= rangestart + 3)
6516 for (; rangestart < i; rangestart++) {
6517 const U8 * const e = uvchr_to_utf8(s,rangestart);
6519 for(p = s; p < e; p++)
6523 const U8 *e = uvchr_to_utf8(s,rangestart);
6525 for (p = s; p < e; p++)
6528 e = uvchr_to_utf8(s, i-1);
6529 for (p = s; p < e; p++)
6536 sv_catpvs(sv, "..."); /* et cetera */
6540 char *s = savesvpv(lv);
6541 char * const origs = s;
6543 while (*s && *s != '\n')
6547 const char * const t = ++s;
6565 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6567 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6568 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6570 PERL_UNUSED_CONTEXT;
6571 PERL_UNUSED_ARG(sv);
6573 #endif /* DEBUGGING */
6577 Perl_re_intuit_string(pTHX_ regexp *prog)
6578 { /* Assume that RE_INTUIT is set */
6580 GET_RE_DEBUG_FLAGS_DECL;
6581 PERL_UNUSED_CONTEXT;
6585 const char * const s = SvPV_nolen_const(prog->check_substr
6586 ? prog->check_substr : prog->check_utf8);
6588 if (!PL_colorset) reginitcolors();
6589 PerlIO_printf(Perl_debug_log,
6590 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6592 prog->check_substr ? "" : "utf8 ",
6593 PL_colors[5],PL_colors[0],
6596 (strlen(s) > 60 ? "..." : ""));
6599 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6603 Perl_pregfree(pTHX_ struct regexp *r)
6607 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6609 GET_RE_DEBUG_FLAGS_DECL;
6611 if (!r || (--r->refcnt > 0))
6613 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
6614 const char * const s = (r->reganch & ROPT_UTF8)
6615 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6616 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6617 const int len = SvCUR(dsv);
6620 PerlIO_printf(Perl_debug_log,
6621 "%sFreeing REx:%s %s%*.*s%s%s\n",
6622 PL_colors[4],PL_colors[5],PL_colors[0],
6625 len > 60 ? "..." : "");
6628 /* gcov results gave these as non-null 100% of the time, so there's no
6629 optimisation in checking them before calling Safefree */
6630 Safefree(r->precomp);
6631 Safefree(r->offsets); /* 20010421 MJD */
6632 RX_MATCH_COPY_FREE(r);
6633 #ifdef PERL_OLD_COPY_ON_WRITE
6635 SvREFCNT_dec(r->saved_copy);
6638 if (r->anchored_substr)
6639 SvREFCNT_dec(r->anchored_substr);
6640 if (r->anchored_utf8)
6641 SvREFCNT_dec(r->anchored_utf8);
6642 if (r->float_substr)
6643 SvREFCNT_dec(r->float_substr);
6645 SvREFCNT_dec(r->float_utf8);
6646 Safefree(r->substrs);
6649 int n = r->data->count;
6650 PAD* new_comppad = NULL;
6655 /* If you add a ->what type here, update the comment in regcomp.h */
6656 switch (r->data->what[n]) {
6658 SvREFCNT_dec((SV*)r->data->data[n]);
6661 Safefree(r->data->data[n]);
6664 new_comppad = (AV*)r->data->data[n];
6667 if (new_comppad == NULL)
6668 Perl_croak(aTHX_ "panic: pregfree comppad");
6669 PAD_SAVE_LOCAL(old_comppad,
6670 /* Watch out for global destruction's random ordering. */
6671 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6674 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6677 op_free((OP_4tree*)r->data->data[n]);
6679 PAD_RESTORE_LOCAL(old_comppad);
6680 SvREFCNT_dec((SV*)new_comppad);
6686 { /* Aho Corasick add-on structure for a trie node.
6687 Used in stclass optimization only */
6689 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6691 refcount = --aho->refcount;
6694 Safefree(aho->states);
6695 Safefree(aho->fail);
6696 aho->trie=NULL; /* not necessary to free this as it is
6697 handled by the 't' case */
6698 Safefree(r->data->data[n]); /* do this last!!!! */
6699 Safefree(r->regstclass);
6705 /* trie structure. */
6707 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6709 refcount = --trie->refcount;
6712 Safefree(trie->charmap);
6713 if (trie->widecharmap)
6714 SvREFCNT_dec((SV*)trie->widecharmap);
6715 Safefree(trie->states);
6716 Safefree(trie->trans);
6718 Safefree(trie->bitmap);
6720 Safefree(trie->wordlen);
6724 SvREFCNT_dec((SV*)trie->words);
6725 if (trie->revcharmap)
6726 SvREFCNT_dec((SV*)trie->revcharmap);
6729 Safefree(r->data->data[n]); /* do this last!!!! */
6734 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6737 Safefree(r->data->what);
6740 Safefree(r->startp);
6745 #ifndef PERL_IN_XSUB_RE
6747 - regnext - dig the "next" pointer out of a node
6750 Perl_regnext(pTHX_ register regnode *p)
6753 register I32 offset;
6755 if (p == &PL_regdummy)
6758 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6767 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6770 STRLEN l1 = strlen(pat1);
6771 STRLEN l2 = strlen(pat2);
6774 const char *message;
6780 Copy(pat1, buf, l1 , char);
6781 Copy(pat2, buf + l1, l2 , char);
6782 buf[l1 + l2] = '\n';
6783 buf[l1 + l2 + 1] = '\0';
6785 /* ANSI variant takes additional second argument */
6786 va_start(args, pat2);
6790 msv = vmess(buf, &args);
6792 message = SvPV_const(msv,l1);
6795 Copy(message, buf, l1 , char);
6796 buf[l1-1] = '\0'; /* Overwrite \n */
6797 Perl_croak(aTHX_ "%s", buf);
6800 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6802 #ifndef PERL_IN_XSUB_RE
6804 Perl_save_re_context(pTHX)
6808 struct re_save_state *state;
6810 SAVEVPTR(PL_curcop);
6811 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6813 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6814 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6815 SSPUSHINT(SAVEt_RE_STATE);
6817 Copy(&PL_reg_state, state, 1, struct re_save_state);
6819 PL_reg_start_tmp = 0;
6820 PL_reg_start_tmpl = 0;
6821 PL_reg_oldsaved = NULL;
6822 PL_reg_oldsavedlen = 0;
6824 PL_reg_leftiter = 0;
6825 PL_reg_poscache = NULL;
6826 PL_reg_poscache_size = 0;
6827 #ifdef PERL_OLD_COPY_ON_WRITE
6831 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6833 const REGEXP * const rx = PM_GETRE(PL_curpm);
6836 for (i = 1; i <= rx->nparens; i++) {
6837 char digits[TYPE_CHARS(long)];
6838 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6839 GV *const *const gvp
6840 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6843 GV * const gv = *gvp;
6844 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6854 clear_re(pTHX_ void *r)
6857 ReREFCNT_dec((regexp *)r);
6863 S_put_byte(pTHX_ SV *sv, int c)
6865 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6866 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6867 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6868 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6870 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6873 #define CLEAR_OPTSTART \
6874 if (optstart) STMT_START { \
6875 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6879 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6881 STATIC const regnode *
6882 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6883 const regnode *last, SV* sv, I32 l)
6886 register U8 op = EXACT; /* Arbitrary non-END op. */
6887 register const regnode *next;
6888 const regnode *optstart= NULL;
6889 GET_RE_DEBUG_FLAGS_DECL;
6891 while (op != END && (!last || node < last)) {
6892 /* While that wasn't END last time... */
6898 next = regnext((regnode *)node);
6901 if (OP(node) == OPTIMIZED) {
6902 if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE_OPTIMISE))
6909 regprop(r, sv, node);
6910 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6911 (int)(2*l + 1), "", SvPVX_const(sv));
6913 if (OP(node) != OPTIMIZED) {
6914 if (next == NULL) /* Next ptr. */
6915 PerlIO_printf(Perl_debug_log, "(0)");
6917 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6918 (void)PerlIO_putc(Perl_debug_log, '\n');
6922 if (PL_regkind[(U8)op] == BRANCHJ) {
6925 register const regnode *nnode = (OP(next) == LONGJMP
6926 ? regnext((regnode *)next)
6928 if (last && nnode > last)
6930 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6933 else if (PL_regkind[(U8)op] == BRANCH) {
6935 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6937 else if ( PL_regkind[(U8)op] == TRIE ) {
6938 const I32 n = ARG(node);
6939 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6940 const I32 arry_len = av_len(trie->words)+1;
6942 PerlIO_printf(Perl_debug_log,
6943 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
6947 TRIE_WORDCOUNT(trie),
6948 (int)TRIE_CHARCOUNT(trie),
6949 trie->uniquecharcount,
6950 (IV)TRIE_LASTSTATE(trie)-1,
6957 sv_setpvn(sv, "", 0);
6958 for (i = 0; i <= 256; i++) {
6959 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6960 if (rangestart == -1)
6962 } else if (rangestart != -1) {
6963 if (i <= rangestart + 3)
6964 for (; rangestart < i; rangestart++)
6965 put_byte(sv, rangestart);
6967 put_byte(sv, rangestart);
6969 put_byte(sv, i - 1);
6974 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
6976 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
6978 for (word_idx=0; word_idx < arry_len; word_idx++) {
6979 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6981 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6984 SvPV_nolen_const(*elem_ptr),
6990 node = NEXTOPER(node);
6991 node += regarglen[(U8)op];
6994 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6995 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6996 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6998 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7000 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7003 else if ( op == PLUS || op == STAR) {
7004 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
7006 else if (op == ANYOF) {
7007 /* arglen 1 + class block */
7008 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7009 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7010 node = NEXTOPER(node);
7012 else if (PL_regkind[(U8)op] == EXACT) {
7013 /* Literal string, where present. */
7014 node += NODE_SZ_STR(node) - 1;
7015 node = NEXTOPER(node);
7018 node = NEXTOPER(node);
7019 node += regarglen[(U8)op];
7021 if (op == CURLYX || op == OPEN)
7023 else if (op == WHILEM)
7030 #endif /* DEBUGGING */
7034 * c-indentation-style: bsd
7036 * indent-tabs-mode: t
7039 * ex: set ts=8 sts=4 sw=4 noet: