5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
287 #define Simple_vFAIL(m) STMT_START { \
288 const IV offset = RExC_parse - RExC_precomp; \
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
296 #define vFAIL(m) STMT_START { \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
303 * Like Simple_vFAIL(), but accepts two arguments.
305 #define Simple_vFAIL2(m,a1) STMT_START { \
306 const IV offset = RExC_parse - RExC_precomp; \
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
314 #define vFAIL2(m,a1) STMT_START { \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
322 * Like Simple_vFAIL(), but accepts three arguments.
324 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
325 const IV offset = RExC_parse - RExC_precomp; \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
333 #define vFAIL3(m,a1,a2) STMT_START { \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
340 * Like Simple_vFAIL(), but accepts four arguments.
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
343 const IV offset = RExC_parse - RExC_precomp; \
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 #define vWARN(loc,m) STMT_START { \
349 const IV offset = loc - RExC_precomp; \
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
354 #define vWARNdep(loc,m) STMT_START { \
355 const IV offset = loc - RExC_precomp; \
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
362 #define vWARN2(loc, m, a1) STMT_START { \
363 const IV offset = loc - RExC_precomp; \
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
368 #define vWARN3(loc, m, a1, a2) STMT_START { \
369 const IV offset = loc - RExC_precomp; \
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
375 const IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
381 const IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
392 /* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
402 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
409 RExC_offsets[2*(node)-1] = (byte); \
414 #define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
418 #define Set_Node_Length_To_R(node,len) STMT_START { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
421 __LINE__, (int)(node), (int)(len))); \
423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
425 RExC_offsets[2*(node)] = (len); \
430 #define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
440 static void clear_re(pTHX_ void *r);
442 /* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
447 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453 SvSetMagicSV(*data->longest, data->last_found);
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
460 data->flags &= ~SF_FIX_BEFORE_EOL;
463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
464 data->offset_float_max = (l
465 ? data->last_start_max
466 : data->pos_min + data->pos_delta);
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
469 if (data->flags & SF_BEFORE_EOL)
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
473 data->flags &= ~SF_FL_BEFORE_EOL;
476 SvCUR_set(data->last_found, 0);
478 SV * const sv = data->last_found;
479 if (SvUTF8(sv) && SvMAGICAL(sv)) {
480 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
486 data->flags &= ~SF_BEFORE_EOL;
489 /* Can match anything (initialization) */
491 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
493 ANYOF_CLASS_ZERO(cl);
494 ANYOF_BITMAP_SETALL(cl);
495 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
497 cl->flags |= ANYOF_LOCALE;
500 /* Can match anything (initialization) */
502 S_cl_is_anything(const struct regnode_charclass_class *cl)
506 for (value = 0; value <= ANYOF_MAX; value += 2)
507 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
509 if (!(cl->flags & ANYOF_UNICODE_ALL))
511 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 /* Can match anything (initialization) */
518 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
520 Zero(cl, 1, struct regnode_charclass_class);
522 cl_anything(pRExC_state, cl);
526 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
528 Zero(cl, 1, struct regnode_charclass_class);
530 cl_anything(pRExC_state, cl);
532 cl->flags |= ANYOF_LOCALE;
535 /* 'And' a given class with another one. Can create false positives */
536 /* We assume that cl is not inverted */
538 S_cl_and(struct regnode_charclass_class *cl,
539 const struct regnode_charclass_class *and_with)
541 if (!(and_with->flags & ANYOF_CLASS)
542 && !(cl->flags & ANYOF_CLASS)
543 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
544 && !(and_with->flags & ANYOF_FOLD)
545 && !(cl->flags & ANYOF_FOLD)) {
548 if (and_with->flags & ANYOF_INVERT)
549 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
550 cl->bitmap[i] &= ~and_with->bitmap[i];
552 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
553 cl->bitmap[i] &= and_with->bitmap[i];
554 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
555 if (!(and_with->flags & ANYOF_EOS))
556 cl->flags &= ~ANYOF_EOS;
558 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
559 !(and_with->flags & ANYOF_INVERT)) {
560 cl->flags &= ~ANYOF_UNICODE_ALL;
561 cl->flags |= ANYOF_UNICODE;
562 ARG_SET(cl, ARG(and_with));
564 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
565 !(and_with->flags & ANYOF_INVERT))
566 cl->flags &= ~ANYOF_UNICODE_ALL;
567 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
568 !(and_with->flags & ANYOF_INVERT))
569 cl->flags &= ~ANYOF_UNICODE;
572 /* 'OR' a given class with another one. Can create false positives */
573 /* We assume that cl is not inverted */
575 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
577 if (or_with->flags & ANYOF_INVERT) {
579 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
580 * <= (B1 | !B2) | (CL1 | !CL2)
581 * which is wasteful if CL2 is small, but we ignore CL2:
582 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
583 * XXXX Can we handle case-fold? Unclear:
584 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
585 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
587 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
588 && !(or_with->flags & ANYOF_FOLD)
589 && !(cl->flags & ANYOF_FOLD) ) {
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] |= ~or_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise */
596 cl_anything(pRExC_state, cl);
599 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
600 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
601 && (!(or_with->flags & ANYOF_FOLD)
602 || (cl->flags & ANYOF_FOLD)) ) {
605 /* OR char bitmap and class bitmap separately */
606 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
607 cl->bitmap[i] |= or_with->bitmap[i];
608 if (or_with->flags & ANYOF_CLASS) {
609 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
610 cl->classflags[i] |= or_with->classflags[i];
611 cl->flags |= ANYOF_CLASS;
614 else { /* XXXX: logic is complicated, leave it along for a moment. */
615 cl_anything(pRExC_state, cl);
618 if (or_with->flags & ANYOF_EOS)
619 cl->flags |= ANYOF_EOS;
621 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
622 ARG(cl) != ARG(or_with)) {
623 cl->flags |= ANYOF_UNICODE_ALL;
624 cl->flags &= ~ANYOF_UNICODE;
626 if (or_with->flags & ANYOF_UNICODE_ALL) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
634 make_trie(startbranch,first,last,tail,flags)
635 startbranch: the first branch in the whole branch sequence
636 first : start branch of sequence of branch-exact nodes.
637 May be the same as startbranch
638 last : Thing following the last branch.
639 May be the same as tail.
640 tail : item following the branch sequence
641 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
643 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
645 A trie is an N'ary tree where the branches are determined by digital
646 decomposition of the key. IE, at the root node you look up the 1st character and
647 follow that branch repeat until you find the end of the branches. Nodes can be
648 marked as "accepting" meaning they represent a complete word. Eg:
652 would convert into the following structure. Numbers represent states, letters
653 following numbers represent valid transitions on the letter from that state, if
654 the number is in square brackets it represents an accepting state, otherwise it
655 will be in parenthesis.
657 +-h->+-e->[3]-+-r->(8)-+-s->[9]
661 (1) +-i->(6)-+-s->[7]
663 +-s->(3)-+-h->(4)-+-e->[5]
665 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
667 This shows that when matching against the string 'hers' we will begin at state 1
668 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
669 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
670 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
671 single traverse. We store a mapping from accepting to state to which word was
672 matched, and then when we have multiple possibilities we try to complete the
673 rest of the regex in the order in which they occured in the alternation.
675 The only prior NFA like behaviour that would be changed by the TRIE support is
676 the silent ignoring of duplicate alternations which are of the form:
678 / (DUPE|DUPE) X? (?{ ... }) Y /x
680 Thus EVAL blocks follwing a trie may be called a different number of times with
681 and without the optimisation. With the optimisations dupes will be silently
682 ignored. This inconsistant behaviour of EVAL type nodes is well established as
683 the following demonstrates:
685 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
687 which prints out 'word' three times, but
689 'words'=~/(word|word|word)(?{ print $1 })S/
691 which doesnt print it out at all. This is due to other optimisations kicking in.
693 Example of what happens on a structural level:
695 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
697 1: CURLYM[1] {1,32767}(18)
708 This would be optimizable with startbranch=5, first=5, last=16, tail=16
709 and should turn into:
711 1: CURLYM[1] {1,32767}(18)
713 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
721 Cases where tail != last would be like /(?foo|bar)baz/:
731 which would be optimizable with startbranch=1, first=1, last=7, tail=8
732 and would end up looking like:
735 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
744 #define TRIE_DEBUG_CHAR \
745 DEBUG_TRIE_COMPILE_r({ \
748 tmp = newSVpvs( "" ); \
749 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
751 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
753 av_push( trie->revcharmap, tmp ); \
756 #define TRIE_READ_CHAR STMT_START { \
759 if ( foldlen > 0 ) { \
760 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
765 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
766 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
767 foldlen -= UNISKIP( uvc ); \
768 scan = foldbuf + UNISKIP( uvc ); \
771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
780 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
785 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
786 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
787 TRIE_LIST_LEN( state ) *= 2; \
788 Renew( trie->states[ state ].trans.list, \
789 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
792 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
793 TRIE_LIST_CUR( state )++; \
796 #define TRIE_LIST_NEW(state) STMT_START { \
797 Newxz( trie->states[ state ].trans.list, \
798 4, reg_trie_trans_le ); \
799 TRIE_LIST_CUR( state ) = 1; \
800 TRIE_LIST_LEN( state ) = 4; \
804 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
807 /* first pass, loop through and scan words */
810 const U32 uniflags = UTF8_ALLOW_DEFAULT;
815 /* we just use folder as a flag in utf8 */
816 const U8 * const folder = ( flags == EXACTF
824 const U32 data_slot = add_data( pRExC_state, 1, "t" );
827 GET_RE_DEBUG_FLAGS_DECL;
829 Newxz( trie, 1, reg_trie_data );
831 RExC_rx->data->data[ data_slot ] = (void*)trie;
832 Newxz( trie->charmap, 256, U16 );
834 trie->words = newAV();
835 trie->revcharmap = newAV();
839 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
840 if (!SvIOK(re_trie_maxbuff)) {
841 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
844 /* -- First loop and Setup --
846 We first traverse the branches and scan each word to determine if it
847 contains widechars, and how many unique chars there are, this is
848 important as we have to build a table with at least as many columns as we
851 We use an array of integers to represent the character codes 0..255
852 (trie->charmap) and we use a an HV* to store unicode characters. We use the
853 native representation of the character value as the key and IV's for the
856 *TODO* If we keep track of how many times each character is used we can
857 remap the columns so that the table compression later on is more
858 efficient in terms of memory by ensuring most common value is in the
859 middle and the least common are on the outside. IMO this would be better
860 than a most to least common mapping as theres a decent chance the most
861 common letter will share a node with the least common, meaning the node
862 will not be compressable. With a middle is most common approach the worst
863 case is when we have the least common nodes twice.
868 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
869 regnode * const noper = NEXTOPER( cur );
870 const U8 *uc = (U8*)STRING( noper );
871 const U8 * const e = uc + STR_LEN( noper );
873 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
874 const U8 *scan = (U8*)NULL;
876 for ( ; uc < e ; uc += len ) {
880 if ( !trie->charmap[ uvc ] ) {
881 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
883 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
888 if ( !trie->widecharmap )
889 trie->widecharmap = newHV();
891 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
894 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
896 if ( !SvTRUE( *svpp ) ) {
897 sv_setiv( *svpp, ++trie->uniquecharcount );
903 } /* end first pass */
904 DEBUG_TRIE_COMPILE_r(
905 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
906 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
907 (int)trie->charcount, trie->uniquecharcount )
912 We now know what we are dealing with in terms of unique chars and
913 string sizes so we can calculate how much memory a naive
914 representation using a flat table will take. If it's over a reasonable
915 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
916 conservative but potentially much slower representation using an array
919 At the end we convert both representations into the same compressed
920 form that will be used in regexec.c for matching with. The latter
921 is a form that cannot be used to construct with but has memory
922 properties similar to the list form and access properties similar
923 to the table form making it both suitable for fast searches and
924 small enough that its feasable to store for the duration of a program.
926 See the comment in the code where the compressed table is produced
927 inplace from the flat tabe representation for an explanation of how
928 the compression works.
933 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
935 Second Pass -- Array Of Lists Representation
937 Each state will be represented by a list of charid:state records
938 (reg_trie_trans_le) the first such element holds the CUR and LEN
939 points of the allocated array. (See defines above).
941 We build the initial structure using the lists, and then convert
942 it into the compressed table form which allows faster lookups
943 (but cant be modified once converted).
949 STRLEN transcount = 1;
951 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
955 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
957 regnode * const noper = NEXTOPER( cur );
958 U8 *uc = (U8*)STRING( noper );
959 const U8 * const e = uc + STR_LEN( noper );
960 U32 state = 1; /* required init */
961 U16 charid = 0; /* sanity init */
962 U8 *scan = (U8*)NULL; /* sanity init */
963 STRLEN foldlen = 0; /* required init */
964 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
966 for ( ; uc < e ; uc += len ) {
971 charid = trie->charmap[ uvc ];
973 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
977 charid=(U16)SvIV( *svpp );
986 if ( !trie->states[ state ].trans.list ) {
987 TRIE_LIST_NEW( state );
989 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
990 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
991 newstate = TRIE_LIST_ITEM( state, check ).newstate;
996 newstate = next_alloc++;
997 TRIE_LIST_PUSH( state, charid, newstate );
1002 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1004 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1007 if ( !trie->states[ state ].wordnum ) {
1008 /* we havent inserted this word into the structure yet. */
1009 trie->states[ state ].wordnum = ++curword;
1012 /* store the word for dumping */
1013 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1014 if ( UTF ) SvUTF8_on( tmp );
1015 av_push( trie->words, tmp );
1019 /*EMPTY*/; /* It's a dupe. So ignore it. */
1022 } /* end second pass */
1024 trie->laststate = next_alloc;
1025 Renew( trie->states, next_alloc, reg_trie_state );
1027 DEBUG_TRIE_COMPILE_MORE_r({
1030 /* print out the table precompression. */
1032 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1033 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1035 for( state=1 ; state < next_alloc ; state ++ ) {
1038 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1039 if ( ! trie->states[ state ].wordnum ) {
1040 PerlIO_printf( Perl_debug_log, "%5s| ","");
1042 PerlIO_printf( Perl_debug_log, "W%04x| ",
1043 trie->states[ state ].wordnum
1046 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1047 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1048 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1049 SvPV_nolen_const( *tmp ),
1050 TRIE_LIST_ITEM(state,charid).forid,
1051 (UV)TRIE_LIST_ITEM(state,charid).newstate
1056 PerlIO_printf( Perl_debug_log, "\n\n" );
1059 Newxz( trie->trans, transcount ,reg_trie_trans );
1066 for( state=1 ; state < next_alloc ; state ++ ) {
1070 DEBUG_TRIE_COMPILE_MORE_r(
1071 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1075 if (trie->states[state].trans.list) {
1076 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1080 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1081 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1082 if ( forid < minid ) {
1084 } else if ( forid > maxid ) {
1088 if ( transcount < tp + maxid - minid + 1) {
1090 Renew( trie->trans, transcount, reg_trie_trans );
1091 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1093 base = trie->uniquecharcount + tp - minid;
1094 if ( maxid == minid ) {
1096 for ( ; zp < tp ; zp++ ) {
1097 if ( ! trie->trans[ zp ].next ) {
1098 base = trie->uniquecharcount + zp - minid;
1099 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1100 trie->trans[ zp ].check = state;
1106 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1107 trie->trans[ tp ].check = state;
1112 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1113 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1114 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1115 trie->trans[ tid ].check = state;
1117 tp += ( maxid - minid + 1 );
1119 Safefree(trie->states[ state ].trans.list);
1122 DEBUG_TRIE_COMPILE_MORE_r(
1123 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1126 trie->states[ state ].trans.base=base;
1128 trie->lasttrans = tp + 1;
1132 Second Pass -- Flat Table Representation.
1134 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1135 We know that we will need Charcount+1 trans at most to store the data
1136 (one row per char at worst case) So we preallocate both structures
1137 assuming worst case.
1139 We then construct the trie using only the .next slots of the entry
1142 We use the .check field of the first entry of the node temporarily to
1143 make compression both faster and easier by keeping track of how many non
1144 zero fields are in the node.
1146 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1149 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1150 number representing the first entry of the node, and state as a
1151 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1152 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1153 are 2 entrys per node. eg:
1161 The table is internally in the right hand, idx form. However as we also
1162 have to deal with the states array which is indexed by nodenum we have to
1163 use TRIE_NODENUM() to convert.
1167 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1169 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1170 next_alloc = trie->uniquecharcount + 1;
1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1174 regnode * const noper = NEXTOPER( cur );
1175 const U8 *uc = (U8*)STRING( noper );
1176 const U8 * const e = uc + STR_LEN( noper );
1178 U32 state = 1; /* required init */
1180 U16 charid = 0; /* sanity init */
1181 U32 accept_state = 0; /* sanity init */
1182 U8 *scan = (U8*)NULL; /* sanity init */
1184 STRLEN foldlen = 0; /* required init */
1185 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1188 for ( ; uc < e ; uc += len ) {
1193 charid = trie->charmap[ uvc ];
1195 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1196 charid = svpp ? (U16)SvIV(*svpp) : 0;
1200 if ( !trie->trans[ state + charid ].next ) {
1201 trie->trans[ state + charid ].next = next_alloc;
1202 trie->trans[ state ].check++;
1203 next_alloc += trie->uniquecharcount;
1205 state = trie->trans[ state + charid ].next;
1207 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1209 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1212 accept_state = TRIE_NODENUM( state );
1213 if ( !trie->states[ accept_state ].wordnum ) {
1214 /* we havent inserted this word into the structure yet. */
1215 trie->states[ accept_state ].wordnum = ++curword;
1218 /* store the word for dumping */
1219 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1220 if ( UTF ) SvUTF8_on( tmp );
1221 av_push( trie->words, tmp );
1225 /*EMPTY*/; /* Its a dupe. So ignore it. */
1228 } /* end second pass */
1230 DEBUG_TRIE_COMPILE_MORE_r({
1232 print out the table precompression so that we can do a visual check
1233 that they are identical.
1237 PerlIO_printf( Perl_debug_log, "\nChar : " );
1239 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1240 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1242 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1246 PerlIO_printf( Perl_debug_log, "\nState+-" );
1248 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1249 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1252 PerlIO_printf( Perl_debug_log, "\n" );
1254 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1256 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1258 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1259 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1260 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1262 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1263 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1265 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1266 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1269 PerlIO_printf( Perl_debug_log, "\n\n" );
1273 * Inplace compress the table.*
1275 For sparse data sets the table constructed by the trie algorithm will
1276 be mostly 0/FAIL transitions or to put it another way mostly empty.
1277 (Note that leaf nodes will not contain any transitions.)
1279 This algorithm compresses the tables by eliminating most such
1280 transitions, at the cost of a modest bit of extra work during lookup:
1282 - Each states[] entry contains a .base field which indicates the
1283 index in the state[] array wheres its transition data is stored.
1285 - If .base is 0 there are no valid transitions from that node.
1287 - If .base is nonzero then charid is added to it to find an entry in
1290 -If trans[states[state].base+charid].check!=state then the
1291 transition is taken to be a 0/Fail transition. Thus if there are fail
1292 transitions at the front of the node then the .base offset will point
1293 somewhere inside the previous nodes data (or maybe even into a node
1294 even earlier), but the .check field determines if the transition is
1297 The following process inplace converts the table to the compressed
1298 table: We first do not compress the root node 1,and mark its all its
1299 .check pointers as 1 and set its .base pointer as 1 as well. This
1300 allows to do a DFA construction from the compressed table later, and
1301 ensures that any .base pointers we calculate later are greater than
1304 - We set 'pos' to indicate the first entry of the second node.
1306 - We then iterate over the columns of the node, finding the first and
1307 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1308 and set the .check pointers accordingly, and advance pos
1309 appropriately and repreat for the next node. Note that when we copy
1310 the next pointers we have to convert them from the original
1311 NODEIDX form to NODENUM form as the former is not valid post
1314 - If a node has no transitions used we mark its base as 0 and do not
1315 advance the pos pointer.
1317 - If a node only has one transition we use a second pointer into the
1318 structure to fill in allocated fail transitions from other states.
1319 This pointer is independent of the main pointer and scans forward
1320 looking for null transitions that are allocated to a state. When it
1321 finds one it writes the single transition into the "hole". If the
1322 pointer doesnt find one the single transition is appeneded as normal.
1324 - Once compressed we can Renew/realloc the structures to release the
1327 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1328 specifically Fig 3.47 and the associated pseudocode.
1332 const U32 laststate = TRIE_NODENUM( next_alloc );
1335 trie->laststate = laststate;
1337 for ( state = 1 ; state < laststate ; state++ ) {
1339 const U32 stateidx = TRIE_NODEIDX( state );
1340 const U32 o_used = trie->trans[ stateidx ].check;
1341 U32 used = trie->trans[ stateidx ].check;
1342 trie->trans[ stateidx ].check = 0;
1344 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1345 if ( flag || trie->trans[ stateidx + charid ].next ) {
1346 if ( trie->trans[ stateidx + charid ].next ) {
1348 for ( ; zp < pos ; zp++ ) {
1349 if ( ! trie->trans[ zp ].next ) {
1353 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1354 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1355 trie->trans[ zp ].check = state;
1356 if ( ++zp > pos ) pos = zp;
1363 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1365 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1366 trie->trans[ pos ].check = state;
1371 trie->lasttrans = pos + 1;
1372 Renew( trie->states, laststate + 1, reg_trie_state);
1373 DEBUG_TRIE_COMPILE_MORE_r(
1374 PerlIO_printf( Perl_debug_log,
1375 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1376 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1379 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1382 } /* end table compress */
1384 /* resize the trans array to remove unused space */
1385 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1387 DEBUG_TRIE_COMPILE_r({
1390 Now we print it out again, in a slightly different form as there is additional
1391 info we want to be able to see when its compressed. They are close enough for
1392 visual comparison though.
1394 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1396 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1397 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1399 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1402 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1404 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1405 PerlIO_printf( Perl_debug_log, "-----");
1406 PerlIO_printf( Perl_debug_log, "\n");
1408 for( state = 1 ; state < trie->laststate ; state++ ) {
1409 const U32 base = trie->states[ state ].trans.base;
1411 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1413 if ( trie->states[ state ].wordnum ) {
1414 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1416 PerlIO_printf( Perl_debug_log, "%6s", "" );
1419 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1424 while( ( base + ofs < trie->uniquecharcount ) ||
1425 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1426 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1429 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1431 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1432 if ( ( base + ofs >= trie->uniquecharcount ) &&
1433 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1434 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1436 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1437 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1439 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1443 PerlIO_printf( Perl_debug_log, "]");
1446 PerlIO_printf( Perl_debug_log, "\n" );
1451 /* now finally we "stitch in" the new TRIE node
1452 This means we convert either the first branch or the first Exact,
1453 depending on whether the thing following (in 'last') is a branch
1454 or not and whther first is the startbranch (ie is it a sub part of
1455 the alternation or is it the whole thing.)
1456 Assuming its a sub part we conver the EXACT otherwise we convert
1457 the whole branch sequence, including the first.
1464 if ( first == startbranch && OP( last ) != BRANCH ) {
1467 convert = NEXTOPER( first );
1468 NEXT_OFF( first ) = (U16)(last - first);
1471 OP( convert ) = TRIE + (U8)( flags - EXACT );
1472 NEXT_OFF( convert ) = (U16)(tail - convert);
1473 ARG_SET( convert, data_slot );
1475 /* tells us if we need to handle accept buffers specially */
1476 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1479 /* needed for dumping*/
1481 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1482 /* We now need to mark all of the space originally used by the
1483 branches as optimized away. This keeps the dumpuntil from
1484 throwing a wobbly as it doesnt use regnext() to traverse the
1487 while( optimize < last ) {
1488 OP( optimize ) = OPTIMIZED;
1492 } /* end node insert */
1499 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1500 * These need to be revisited when a newer toolchain becomes available.
1502 #if defined(__sparc64__) && defined(__GNUC__)
1503 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1504 # undef SPARC64_GCC_WORKAROUND
1505 # define SPARC64_GCC_WORKAROUND 1
1509 /* REx optimizer. Converts nodes into quickier variants "in place".
1510 Finds fixed substrings. */
1512 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1513 to the position after last scanned or to NULL. */
1517 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1518 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1519 /* scanp: Start here (read-write). */
1520 /* deltap: Write maxlen-minlen here. */
1521 /* last: Stop before this one. */
1524 I32 min = 0, pars = 0, code;
1525 regnode *scan = *scanp, *next;
1527 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1528 int is_inf_internal = 0; /* The studied chunk is infinite */
1529 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1530 scan_data_t data_fake;
1531 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1532 SV *re_trie_maxbuff = NULL;
1534 GET_RE_DEBUG_FLAGS_DECL;
1536 while (scan && OP(scan) != END && scan < last) {
1537 /* Peephole optimizer: */
1539 SV * const mysv=sv_newmortal();
1540 regprop(RExC_rx, mysv, scan);
1541 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1542 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1545 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1546 /* Merge several consecutive EXACTish nodes into one. */
1547 regnode *n = regnext(scan);
1550 regnode *stop = scan;
1553 next = scan + NODE_SZ_STR(scan);
1554 /* Skip NOTHING, merge EXACT*. */
1556 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1557 (stringok && (OP(n) == OP(scan))))
1559 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1560 if (OP(n) == TAIL || n > next)
1562 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1563 NEXT_OFF(scan) += NEXT_OFF(n);
1564 next = n + NODE_STEP_REGNODE;
1571 else if (stringok) {
1572 const int oldl = STR_LEN(scan);
1573 regnode * const nnext = regnext(n);
1575 if (oldl + STR_LEN(n) > U8_MAX)
1577 NEXT_OFF(scan) += NEXT_OFF(n);
1578 STR_LEN(scan) += STR_LEN(n);
1579 next = n + NODE_SZ_STR(n);
1580 /* Now we can overwrite *n : */
1581 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1589 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1591 Two problematic code points in Unicode casefolding of EXACT nodes:
1593 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1594 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1600 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1601 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1603 This means that in case-insensitive matching (or "loose matching",
1604 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1605 length of the above casefolded versions) can match a target string
1606 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1607 This would rather mess up the minimum length computation.
1609 What we'll do is to look for the tail four bytes, and then peek
1610 at the preceding two bytes to see whether we need to decrease
1611 the minimum length by four (six minus two).
1613 Thanks to the design of UTF-8, there cannot be false matches:
1614 A sequence of valid UTF-8 bytes cannot be a subsequence of
1615 another valid sequence of UTF-8 bytes.
1618 char * const s0 = STRING(scan), *s, *t;
1619 char * const s1 = s0 + STR_LEN(scan) - 1;
1620 char * const s2 = s1 - 4;
1621 const char t0[] = "\xcc\x88\xcc\x81";
1622 const char * const t1 = t0 + 3;
1625 s < s2 && (t = ninstr(s, s1, t0, t1));
1627 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1628 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1635 n = scan + NODE_SZ_STR(scan);
1637 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1648 /* Follow the next-chain of the current node and optimize
1649 away all the NOTHINGs from it. */
1650 if (OP(scan) != CURLYX) {
1651 const int max = (reg_off_by_arg[OP(scan)]
1653 /* I32 may be smaller than U16 on CRAYs! */
1654 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1655 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1659 /* Skip NOTHING and LONGJMP. */
1660 while ((n = regnext(n))
1661 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1662 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1663 && off + noff < max)
1665 if (reg_off_by_arg[OP(scan)])
1668 NEXT_OFF(scan) = off;
1671 /* The principal pseudo-switch. Cannot be a switch, since we
1672 look into several different things. */
1673 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1674 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1675 next = regnext(scan);
1677 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1679 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1680 I32 max1 = 0, min1 = I32_MAX, num = 0;
1681 struct regnode_charclass_class accum;
1682 regnode * const startbranch=scan;
1684 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1685 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1686 if (flags & SCF_DO_STCLASS)
1687 cl_init_zero(pRExC_state, &accum);
1689 while (OP(scan) == code) {
1690 I32 deltanext, minnext, f = 0, fake;
1691 struct regnode_charclass_class this_class;
1694 data_fake.flags = 0;
1696 data_fake.whilem_c = data->whilem_c;
1697 data_fake.last_closep = data->last_closep;
1700 data_fake.last_closep = &fake;
1701 next = regnext(scan);
1702 scan = NEXTOPER(scan);
1704 scan = NEXTOPER(scan);
1705 if (flags & SCF_DO_STCLASS) {
1706 cl_init(pRExC_state, &this_class);
1707 data_fake.start_class = &this_class;
1708 f = SCF_DO_STCLASS_AND;
1710 if (flags & SCF_WHILEM_VISITED_POS)
1711 f |= SCF_WHILEM_VISITED_POS;
1713 /* we suppose the run is continuous, last=next...*/
1714 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1715 next, &data_fake, f,depth+1);
1718 if (max1 < minnext + deltanext)
1719 max1 = minnext + deltanext;
1720 if (deltanext == I32_MAX)
1721 is_inf = is_inf_internal = 1;
1723 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1725 if (data && (data_fake.flags & SF_HAS_EVAL))
1726 data->flags |= SF_HAS_EVAL;
1728 data->whilem_c = data_fake.whilem_c;
1729 if (flags & SCF_DO_STCLASS)
1730 cl_or(pRExC_state, &accum, &this_class);
1731 if (code == SUSPEND)
1734 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1736 if (flags & SCF_DO_SUBSTR) {
1737 data->pos_min += min1;
1738 data->pos_delta += max1 - min1;
1739 if (max1 != min1 || is_inf)
1740 data->longest = &(data->longest_float);
1743 delta += max1 - min1;
1744 if (flags & SCF_DO_STCLASS_OR) {
1745 cl_or(pRExC_state, data->start_class, &accum);
1747 cl_and(data->start_class, &and_with);
1748 flags &= ~SCF_DO_STCLASS;
1751 else if (flags & SCF_DO_STCLASS_AND) {
1753 cl_and(data->start_class, &accum);
1754 flags &= ~SCF_DO_STCLASS;
1757 /* Switch to OR mode: cache the old value of
1758 * data->start_class */
1759 StructCopy(data->start_class, &and_with,
1760 struct regnode_charclass_class);
1761 flags &= ~SCF_DO_STCLASS_AND;
1762 StructCopy(&accum, data->start_class,
1763 struct regnode_charclass_class);
1764 flags |= SCF_DO_STCLASS_OR;
1765 data->start_class->flags |= ANYOF_EOS;
1771 Assuming this was/is a branch we are dealing with: 'scan' now
1772 points at the item that follows the branch sequence, whatever
1773 it is. We now start at the beginning of the sequence and look
1779 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1781 If we can find such a subseqence we need to turn the first
1782 element into a trie and then add the subsequent branch exact
1783 strings to the trie.
1787 1. patterns where the whole set of branch can be converted to a trie,
1789 2. patterns where only a subset of the alternations can be
1790 converted to a trie.
1792 In case 1 we can replace the whole set with a single regop
1793 for the trie. In case 2 we need to keep the start and end
1796 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1797 becomes BRANCH TRIE; BRANCH X;
1799 Hypthetically when we know the regex isnt anchored we can
1800 turn a case 1 into a DFA and let it rip... Every time it finds a match
1801 it would just call its tail, no WHILEM/CURLY needed.
1805 if (!re_trie_maxbuff) {
1806 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1807 if (!SvIOK(re_trie_maxbuff))
1808 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1810 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1812 regnode *first = (regnode *)NULL;
1813 regnode *last = (regnode *)NULL;
1814 regnode *tail = scan;
1819 SV * const mysv = sv_newmortal(); /* for dumping */
1821 /* var tail is used because there may be a TAIL
1822 regop in the way. Ie, the exacts will point to the
1823 thing following the TAIL, but the last branch will
1824 point at the TAIL. So we advance tail. If we
1825 have nested (?:) we may have to move through several
1829 while ( OP( tail ) == TAIL ) {
1830 /* this is the TAIL generated by (?:) */
1831 tail = regnext( tail );
1835 regprop(RExC_rx, mysv, tail );
1836 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1837 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1838 (RExC_seen_evals) ? "[EVAL]" : ""
1843 step through the branches, cur represents each
1844 branch, noper is the first thing to be matched
1845 as part of that branch and noper_next is the
1846 regnext() of that node. if noper is an EXACT
1847 and noper_next is the same as scan (our current
1848 position in the regex) then the EXACT branch is
1849 a possible optimization target. Once we have
1850 two or more consequetive such branches we can
1851 create a trie of the EXACT's contents and stich
1852 it in place. If the sequence represents all of
1853 the branches we eliminate the whole thing and
1854 replace it with a single TRIE. If it is a
1855 subsequence then we need to stitch it in. This
1856 means the first branch has to remain, and needs
1857 to be repointed at the item on the branch chain
1858 following the last branch optimized. This could
1859 be either a BRANCH, in which case the
1860 subsequence is internal, or it could be the
1861 item following the branch sequence in which
1862 case the subsequence is at the end.
1866 /* dont use tail as the end marker for this traverse */
1867 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1868 regnode * const noper = NEXTOPER( cur );
1869 regnode * const noper_next = regnext( noper );
1872 regprop(RExC_rx, mysv, cur);
1873 PerlIO_printf( Perl_debug_log, "%*s%s",
1874 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1876 regprop(RExC_rx, mysv, noper);
1877 PerlIO_printf( Perl_debug_log, " -> %s",
1878 SvPV_nolen_const(mysv));
1881 regprop(RExC_rx, mysv, noper_next );
1882 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1883 SvPV_nolen_const(mysv));
1885 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1888 if ( ( first ? OP( noper ) == optype
1889 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1890 && noper_next == tail && count<U16_MAX)
1895 optype = OP( noper );
1899 regprop(RExC_rx, mysv, first);
1900 PerlIO_printf( Perl_debug_log, "%*s%s",
1901 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1902 regprop(RExC_rx, mysv, NEXTOPER(first) );
1903 PerlIO_printf( Perl_debug_log, " -> %s\n",
1904 SvPV_nolen_const( mysv ) );
1909 regprop(RExC_rx, mysv, cur);
1910 PerlIO_printf( Perl_debug_log, "%*s%s",
1911 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1912 regprop(RExC_rx, mysv, noper );
1913 PerlIO_printf( Perl_debug_log, " -> %s\n",
1914 SvPV_nolen_const( mysv ) );
1920 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1921 (int)depth * 2 + 2, "E:", "**END**" );
1923 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1925 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1926 && noper_next == tail )
1930 optype = OP( noper );
1940 regprop(RExC_rx, mysv, cur);
1941 PerlIO_printf( Perl_debug_log,
1942 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1943 " ", SvPV_nolen_const( mysv ), first, last, cur);
1948 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1949 (int)depth * 2 + 2, "E:", "==END==" );
1951 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1956 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1957 scan = NEXTOPER(NEXTOPER(scan));
1958 } else /* single branch is optimized. */
1959 scan = NEXTOPER(scan);
1962 else if (OP(scan) == EXACT) {
1963 I32 l = STR_LEN(scan);
1966 const U8 * const s = (U8*)STRING(scan);
1967 l = utf8_length(s, s + l);
1968 uc = utf8_to_uvchr(s, NULL);
1970 uc = *((U8*)STRING(scan));
1973 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1974 /* The code below prefers earlier match for fixed
1975 offset, later match for variable offset. */
1976 if (data->last_end == -1) { /* Update the start info. */
1977 data->last_start_min = data->pos_min;
1978 data->last_start_max = is_inf
1979 ? I32_MAX : data->pos_min + data->pos_delta;
1981 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
1983 SvUTF8_on(data->last_found);
1985 SV * const sv = data->last_found;
1986 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1987 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1988 if (mg && mg->mg_len >= 0)
1989 mg->mg_len += utf8_length((U8*)STRING(scan),
1990 (U8*)STRING(scan)+STR_LEN(scan));
1992 data->last_end = data->pos_min + l;
1993 data->pos_min += l; /* As in the first entry. */
1994 data->flags &= ~SF_BEFORE_EOL;
1996 if (flags & SCF_DO_STCLASS_AND) {
1997 /* Check whether it is compatible with what we know already! */
2001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2003 && (!(data->start_class->flags & ANYOF_FOLD)
2004 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2007 ANYOF_CLASS_ZERO(data->start_class);
2008 ANYOF_BITMAP_ZERO(data->start_class);
2010 ANYOF_BITMAP_SET(data->start_class, uc);
2011 data->start_class->flags &= ~ANYOF_EOS;
2013 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2015 else if (flags & SCF_DO_STCLASS_OR) {
2016 /* false positive possible if the class is case-folded */
2018 ANYOF_BITMAP_SET(data->start_class, uc);
2020 data->start_class->flags |= ANYOF_UNICODE_ALL;
2021 data->start_class->flags &= ~ANYOF_EOS;
2022 cl_and(data->start_class, &and_with);
2024 flags &= ~SCF_DO_STCLASS;
2026 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2027 I32 l = STR_LEN(scan);
2028 UV uc = *((U8*)STRING(scan));
2030 /* Search for fixed substrings supports EXACT only. */
2031 if (flags & SCF_DO_SUBSTR)
2032 scan_commit(pRExC_state, data);
2034 const U8 * const s = (U8 *)STRING(scan);
2035 l = utf8_length(s, s + l);
2036 uc = utf8_to_uvchr(s, NULL);
2039 if (data && (flags & SCF_DO_SUBSTR))
2041 if (flags & SCF_DO_STCLASS_AND) {
2042 /* Check whether it is compatible with what we know already! */
2046 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2047 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2048 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2050 ANYOF_CLASS_ZERO(data->start_class);
2051 ANYOF_BITMAP_ZERO(data->start_class);
2053 ANYOF_BITMAP_SET(data->start_class, uc);
2054 data->start_class->flags &= ~ANYOF_EOS;
2055 data->start_class->flags |= ANYOF_FOLD;
2056 if (OP(scan) == EXACTFL)
2057 data->start_class->flags |= ANYOF_LOCALE;
2060 else if (flags & SCF_DO_STCLASS_OR) {
2061 if (data->start_class->flags & ANYOF_FOLD) {
2062 /* false positive possible if the class is case-folded.
2063 Assume that the locale settings are the same... */
2065 ANYOF_BITMAP_SET(data->start_class, uc);
2066 data->start_class->flags &= ~ANYOF_EOS;
2068 cl_and(data->start_class, &and_with);
2070 flags &= ~SCF_DO_STCLASS;
2072 else if (strchr((const char*)PL_varies,OP(scan))) {
2073 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2074 I32 f = flags, pos_before = 0;
2075 regnode * const oscan = scan;
2076 struct regnode_charclass_class this_class;
2077 struct regnode_charclass_class *oclass = NULL;
2078 I32 next_is_eval = 0;
2080 switch (PL_regkind[(U8)OP(scan)]) {
2081 case WHILEM: /* End of (?:...)* . */
2082 scan = NEXTOPER(scan);
2085 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2086 next = NEXTOPER(scan);
2087 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2089 maxcount = REG_INFTY;
2090 next = regnext(scan);
2091 scan = NEXTOPER(scan);
2095 if (flags & SCF_DO_SUBSTR)
2100 if (flags & SCF_DO_STCLASS) {
2102 maxcount = REG_INFTY;
2103 next = regnext(scan);
2104 scan = NEXTOPER(scan);
2107 is_inf = is_inf_internal = 1;
2108 scan = regnext(scan);
2109 if (flags & SCF_DO_SUBSTR) {
2110 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2111 data->longest = &(data->longest_float);
2113 goto optimize_curly_tail;
2115 mincount = ARG1(scan);
2116 maxcount = ARG2(scan);
2117 next = regnext(scan);
2118 if (OP(scan) == CURLYX) {
2119 I32 lp = (data ? *(data->last_closep) : 0);
2120 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2122 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2123 next_is_eval = (OP(scan) == EVAL);
2125 if (flags & SCF_DO_SUBSTR) {
2126 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2127 pos_before = data->pos_min;
2131 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2133 data->flags |= SF_IS_INF;
2135 if (flags & SCF_DO_STCLASS) {
2136 cl_init(pRExC_state, &this_class);
2137 oclass = data->start_class;
2138 data->start_class = &this_class;
2139 f |= SCF_DO_STCLASS_AND;
2140 f &= ~SCF_DO_STCLASS_OR;
2142 /* These are the cases when once a subexpression
2143 fails at a particular position, it cannot succeed
2144 even after backtracking at the enclosing scope.
2146 XXXX what if minimal match and we are at the
2147 initial run of {n,m}? */
2148 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2149 f &= ~SCF_WHILEM_VISITED_POS;
2151 /* This will finish on WHILEM, setting scan, or on NULL: */
2152 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2154 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2156 if (flags & SCF_DO_STCLASS)
2157 data->start_class = oclass;
2158 if (mincount == 0 || minnext == 0) {
2159 if (flags & SCF_DO_STCLASS_OR) {
2160 cl_or(pRExC_state, data->start_class, &this_class);
2162 else if (flags & SCF_DO_STCLASS_AND) {
2163 /* Switch to OR mode: cache the old value of
2164 * data->start_class */
2165 StructCopy(data->start_class, &and_with,
2166 struct regnode_charclass_class);
2167 flags &= ~SCF_DO_STCLASS_AND;
2168 StructCopy(&this_class, data->start_class,
2169 struct regnode_charclass_class);
2170 flags |= SCF_DO_STCLASS_OR;
2171 data->start_class->flags |= ANYOF_EOS;
2173 } else { /* Non-zero len */
2174 if (flags & SCF_DO_STCLASS_OR) {
2175 cl_or(pRExC_state, data->start_class, &this_class);
2176 cl_and(data->start_class, &and_with);
2178 else if (flags & SCF_DO_STCLASS_AND)
2179 cl_and(data->start_class, &this_class);
2180 flags &= ~SCF_DO_STCLASS;
2182 if (!scan) /* It was not CURLYX, but CURLY. */
2184 if ( /* ? quantifier ok, except for (?{ ... }) */
2185 (next_is_eval || !(mincount == 0 && maxcount == 1))
2186 && (minnext == 0) && (deltanext == 0)
2187 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2188 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2189 && ckWARN(WARN_REGEXP))
2192 "Quantifier unexpected on zero-length expression");
2195 min += minnext * mincount;
2196 is_inf_internal |= ((maxcount == REG_INFTY
2197 && (minnext + deltanext) > 0)
2198 || deltanext == I32_MAX);
2199 is_inf |= is_inf_internal;
2200 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2202 /* Try powerful optimization CURLYX => CURLYN. */
2203 if ( OP(oscan) == CURLYX && data
2204 && data->flags & SF_IN_PAR
2205 && !(data->flags & SF_HAS_EVAL)
2206 && !deltanext && minnext == 1 ) {
2207 /* Try to optimize to CURLYN. */
2208 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2209 regnode * const nxt1 = nxt;
2216 if (!strchr((const char*)PL_simple,OP(nxt))
2217 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2218 && STR_LEN(nxt) == 1))
2224 if (OP(nxt) != CLOSE)
2226 /* Now we know that nxt2 is the only contents: */
2227 oscan->flags = (U8)ARG(nxt);
2229 OP(nxt1) = NOTHING; /* was OPEN. */
2231 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2232 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2233 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2234 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2235 OP(nxt + 1) = OPTIMIZED; /* was count. */
2236 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2241 /* Try optimization CURLYX => CURLYM. */
2242 if ( OP(oscan) == CURLYX && data
2243 && !(data->flags & SF_HAS_PAR)
2244 && !(data->flags & SF_HAS_EVAL)
2245 && !deltanext /* atom is fixed width */
2246 && minnext != 0 /* CURLYM can't handle zero width */
2248 /* XXXX How to optimize if data == 0? */
2249 /* Optimize to a simpler form. */
2250 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2254 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2255 && (OP(nxt2) != WHILEM))
2257 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2258 /* Need to optimize away parenths. */
2259 if (data->flags & SF_IN_PAR) {
2260 /* Set the parenth number. */
2261 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2263 if (OP(nxt) != CLOSE)
2264 FAIL("Panic opt close");
2265 oscan->flags = (U8)ARG(nxt);
2266 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2267 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2269 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2270 OP(nxt + 1) = OPTIMIZED; /* was count. */
2271 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2272 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2275 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2276 regnode *nnxt = regnext(nxt1);
2279 if (reg_off_by_arg[OP(nxt1)])
2280 ARG_SET(nxt1, nxt2 - nxt1);
2281 else if (nxt2 - nxt1 < U16_MAX)
2282 NEXT_OFF(nxt1) = nxt2 - nxt1;
2284 OP(nxt) = NOTHING; /* Cannot beautify */
2289 /* Optimize again: */
2290 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2296 else if ((OP(oscan) == CURLYX)
2297 && (flags & SCF_WHILEM_VISITED_POS)
2298 /* See the comment on a similar expression above.
2299 However, this time it not a subexpression
2300 we care about, but the expression itself. */
2301 && (maxcount == REG_INFTY)
2302 && data && ++data->whilem_c < 16) {
2303 /* This stays as CURLYX, we can put the count/of pair. */
2304 /* Find WHILEM (as in regexec.c) */
2305 regnode *nxt = oscan + NEXT_OFF(oscan);
2307 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2309 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2310 | (RExC_whilem_seen << 4)); /* On WHILEM */
2312 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2314 if (flags & SCF_DO_SUBSTR) {
2315 SV *last_str = NULL;
2316 int counted = mincount != 0;
2318 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2319 #if defined(SPARC64_GCC_WORKAROUND)
2322 const char *s = NULL;
2325 if (pos_before >= data->last_start_min)
2328 b = data->last_start_min;
2331 s = SvPV_const(data->last_found, l);
2332 old = b - data->last_start_min;
2335 I32 b = pos_before >= data->last_start_min
2336 ? pos_before : data->last_start_min;
2338 const char * const s = SvPV_const(data->last_found, l);
2339 I32 old = b - data->last_start_min;
2343 old = utf8_hop((U8*)s, old) - (U8*)s;
2346 /* Get the added string: */
2347 last_str = newSVpvn(s + old, l);
2349 SvUTF8_on(last_str);
2350 if (deltanext == 0 && pos_before == b) {
2351 /* What was added is a constant string */
2353 SvGROW(last_str, (mincount * l) + 1);
2354 repeatcpy(SvPVX(last_str) + l,
2355 SvPVX_const(last_str), l, mincount - 1);
2356 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2357 /* Add additional parts. */
2358 SvCUR_set(data->last_found,
2359 SvCUR(data->last_found) - l);
2360 sv_catsv(data->last_found, last_str);
2362 SV * sv = data->last_found;
2364 SvUTF8(sv) && SvMAGICAL(sv) ?
2365 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2366 if (mg && mg->mg_len >= 0)
2367 mg->mg_len += CHR_SVLEN(last_str);
2369 data->last_end += l * (mincount - 1);
2372 /* start offset must point into the last copy */
2373 data->last_start_min += minnext * (mincount - 1);
2374 data->last_start_max += is_inf ? I32_MAX
2375 : (maxcount - 1) * (minnext + data->pos_delta);
2378 /* It is counted once already... */
2379 data->pos_min += minnext * (mincount - counted);
2380 data->pos_delta += - counted * deltanext +
2381 (minnext + deltanext) * maxcount - minnext * mincount;
2382 if (mincount != maxcount) {
2383 /* Cannot extend fixed substrings found inside
2385 scan_commit(pRExC_state,data);
2386 if (mincount && last_str) {
2387 SV * const sv = data->last_found;
2388 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2389 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2393 sv_setsv(sv, last_str);
2394 data->last_end = data->pos_min;
2395 data->last_start_min =
2396 data->pos_min - CHR_SVLEN(last_str);
2397 data->last_start_max = is_inf
2399 : data->pos_min + data->pos_delta
2400 - CHR_SVLEN(last_str);
2402 data->longest = &(data->longest_float);
2404 SvREFCNT_dec(last_str);
2406 if (data && (fl & SF_HAS_EVAL))
2407 data->flags |= SF_HAS_EVAL;
2408 optimize_curly_tail:
2409 if (OP(oscan) != CURLYX) {
2410 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2412 NEXT_OFF(oscan) += NEXT_OFF(next);
2415 default: /* REF and CLUMP only? */
2416 if (flags & SCF_DO_SUBSTR) {
2417 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2418 data->longest = &(data->longest_float);
2420 is_inf = is_inf_internal = 1;
2421 if (flags & SCF_DO_STCLASS_OR)
2422 cl_anything(pRExC_state, data->start_class);
2423 flags &= ~SCF_DO_STCLASS;
2427 else if (strchr((const char*)PL_simple,OP(scan))) {
2430 if (flags & SCF_DO_SUBSTR) {
2431 scan_commit(pRExC_state,data);
2435 if (flags & SCF_DO_STCLASS) {
2436 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2438 /* Some of the logic below assumes that switching
2439 locale on will only add false positives. */
2440 switch (PL_regkind[(U8)OP(scan)]) {
2444 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2445 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2446 cl_anything(pRExC_state, data->start_class);
2449 if (OP(scan) == SANY)
2451 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2452 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2453 || (data->start_class->flags & ANYOF_CLASS));
2454 cl_anything(pRExC_state, data->start_class);
2456 if (flags & SCF_DO_STCLASS_AND || !value)
2457 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2460 if (flags & SCF_DO_STCLASS_AND)
2461 cl_and(data->start_class,
2462 (struct regnode_charclass_class*)scan);
2464 cl_or(pRExC_state, data->start_class,
2465 (struct regnode_charclass_class*)scan);
2468 if (flags & SCF_DO_STCLASS_AND) {
2469 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2470 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2471 for (value = 0; value < 256; value++)
2472 if (!isALNUM(value))
2473 ANYOF_BITMAP_CLEAR(data->start_class, value);
2477 if (data->start_class->flags & ANYOF_LOCALE)
2478 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2480 for (value = 0; value < 256; value++)
2482 ANYOF_BITMAP_SET(data->start_class, value);
2487 if (flags & SCF_DO_STCLASS_AND) {
2488 if (data->start_class->flags & ANYOF_LOCALE)
2489 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2492 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2493 data->start_class->flags |= ANYOF_LOCALE;
2497 if (flags & SCF_DO_STCLASS_AND) {
2498 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2499 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2500 for (value = 0; value < 256; value++)
2502 ANYOF_BITMAP_CLEAR(data->start_class, value);
2506 if (data->start_class->flags & ANYOF_LOCALE)
2507 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2509 for (value = 0; value < 256; value++)
2510 if (!isALNUM(value))
2511 ANYOF_BITMAP_SET(data->start_class, value);
2516 if (flags & SCF_DO_STCLASS_AND) {
2517 if (data->start_class->flags & ANYOF_LOCALE)
2518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2521 data->start_class->flags |= ANYOF_LOCALE;
2522 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2526 if (flags & SCF_DO_STCLASS_AND) {
2527 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2528 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2529 for (value = 0; value < 256; value++)
2530 if (!isSPACE(value))
2531 ANYOF_BITMAP_CLEAR(data->start_class, value);
2535 if (data->start_class->flags & ANYOF_LOCALE)
2536 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2538 for (value = 0; value < 256; value++)
2540 ANYOF_BITMAP_SET(data->start_class, value);
2545 if (flags & SCF_DO_STCLASS_AND) {
2546 if (data->start_class->flags & ANYOF_LOCALE)
2547 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2550 data->start_class->flags |= ANYOF_LOCALE;
2551 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2555 if (flags & SCF_DO_STCLASS_AND) {
2556 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2557 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2558 for (value = 0; value < 256; value++)
2560 ANYOF_BITMAP_CLEAR(data->start_class, value);
2564 if (data->start_class->flags & ANYOF_LOCALE)
2565 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2567 for (value = 0; value < 256; value++)
2568 if (!isSPACE(value))
2569 ANYOF_BITMAP_SET(data->start_class, value);
2574 if (flags & SCF_DO_STCLASS_AND) {
2575 if (data->start_class->flags & ANYOF_LOCALE) {
2576 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2577 for (value = 0; value < 256; value++)
2578 if (!isSPACE(value))
2579 ANYOF_BITMAP_CLEAR(data->start_class, value);
2583 data->start_class->flags |= ANYOF_LOCALE;
2584 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2588 if (flags & SCF_DO_STCLASS_AND) {
2589 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2590 for (value = 0; value < 256; value++)
2591 if (!isDIGIT(value))
2592 ANYOF_BITMAP_CLEAR(data->start_class, value);
2595 if (data->start_class->flags & ANYOF_LOCALE)
2596 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2598 for (value = 0; value < 256; value++)
2600 ANYOF_BITMAP_SET(data->start_class, value);
2605 if (flags & SCF_DO_STCLASS_AND) {
2606 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2607 for (value = 0; value < 256; value++)
2609 ANYOF_BITMAP_CLEAR(data->start_class, value);
2612 if (data->start_class->flags & ANYOF_LOCALE)
2613 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2615 for (value = 0; value < 256; value++)
2616 if (!isDIGIT(value))
2617 ANYOF_BITMAP_SET(data->start_class, value);
2622 if (flags & SCF_DO_STCLASS_OR)
2623 cl_and(data->start_class, &and_with);
2624 flags &= ~SCF_DO_STCLASS;
2627 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2628 data->flags |= (OP(scan) == MEOL
2632 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2633 /* Lookbehind, or need to calculate parens/evals/stclass: */
2634 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2635 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2636 /* Lookahead/lookbehind */
2637 I32 deltanext, minnext, fake = 0;
2639 struct regnode_charclass_class intrnl;
2642 data_fake.flags = 0;
2644 data_fake.whilem_c = data->whilem_c;
2645 data_fake.last_closep = data->last_closep;
2648 data_fake.last_closep = &fake;
2649 if ( flags & SCF_DO_STCLASS && !scan->flags
2650 && OP(scan) == IFMATCH ) { /* Lookahead */
2651 cl_init(pRExC_state, &intrnl);
2652 data_fake.start_class = &intrnl;
2653 f |= SCF_DO_STCLASS_AND;
2655 if (flags & SCF_WHILEM_VISITED_POS)
2656 f |= SCF_WHILEM_VISITED_POS;
2657 next = regnext(scan);
2658 nscan = NEXTOPER(NEXTOPER(scan));
2659 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2662 vFAIL("Variable length lookbehind not implemented");
2664 else if (minnext > U8_MAX) {
2665 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2667 scan->flags = (U8)minnext;
2669 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2671 if (data && (data_fake.flags & SF_HAS_EVAL))
2672 data->flags |= SF_HAS_EVAL;
2674 data->whilem_c = data_fake.whilem_c;
2675 if (f & SCF_DO_STCLASS_AND) {
2676 const int was = (data->start_class->flags & ANYOF_EOS);
2678 cl_and(data->start_class, &intrnl);
2680 data->start_class->flags |= ANYOF_EOS;
2683 else if (OP(scan) == OPEN) {
2686 else if (OP(scan) == CLOSE) {
2687 if ((I32)ARG(scan) == is_par) {
2688 next = regnext(scan);
2690 if ( next && (OP(next) != WHILEM) && next < last)
2691 is_par = 0; /* Disable optimization */
2694 *(data->last_closep) = ARG(scan);
2696 else if (OP(scan) == EVAL) {
2698 data->flags |= SF_HAS_EVAL;
2700 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2701 if (flags & SCF_DO_SUBSTR) {
2702 scan_commit(pRExC_state,data);
2703 data->longest = &(data->longest_float);
2705 is_inf = is_inf_internal = 1;
2706 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2707 cl_anything(pRExC_state, data->start_class);
2708 flags &= ~SCF_DO_STCLASS;
2710 /* Else: zero-length, ignore. */
2711 scan = regnext(scan);
2716 *deltap = is_inf_internal ? I32_MAX : delta;
2717 if (flags & SCF_DO_SUBSTR && is_inf)
2718 data->pos_delta = I32_MAX - data->pos_min;
2719 if (is_par > U8_MAX)
2721 if (is_par && pars==1 && data) {
2722 data->flags |= SF_IN_PAR;
2723 data->flags &= ~SF_HAS_PAR;
2725 else if (pars && data) {
2726 data->flags |= SF_HAS_PAR;
2727 data->flags &= ~SF_IN_PAR;
2729 if (flags & SCF_DO_STCLASS_OR)
2730 cl_and(data->start_class, &and_with);
2735 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2737 if (RExC_rx->data) {
2738 Renewc(RExC_rx->data,
2739 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2740 char, struct reg_data);
2741 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2742 RExC_rx->data->count += n;
2745 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2746 char, struct reg_data);
2747 Newx(RExC_rx->data->what, n, U8);
2748 RExC_rx->data->count = n;
2750 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2751 return RExC_rx->data->count - n;
2755 Perl_reginitcolors(pTHX)
2758 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2760 char *t = savepv(s);
2764 t = strchr(t, '\t');
2770 PL_colors[i] = t = (char *)"";
2775 PL_colors[i++] = (char *)"";
2782 - pregcomp - compile a regular expression into internal code
2784 * We can't allocate space until we know how big the compiled form will be,
2785 * but we can't compile it (and thus know how big it is) until we've got a
2786 * place to put the code. So we cheat: we compile it twice, once with code
2787 * generation turned off and size counting turned on, and once "for real".
2788 * This also means that we don't allocate space until we are sure that the
2789 * thing really will compile successfully, and we never have to move the
2790 * code and thus invalidate pointers into it. (Note that it has to be in
2791 * one piece because free() must be able to free it all.) [NB: not true in perl]
2793 * Beware that the optimization-preparation code in here knows about some
2794 * of the structure of the compiled regexp. [I'll say.]
2797 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2808 RExC_state_t RExC_state;
2809 RExC_state_t *pRExC_state = &RExC_state;
2811 GET_RE_DEBUG_FLAGS_DECL;
2814 FAIL("NULL regexp argument");
2816 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2819 DEBUG_r(if (!PL_colorset) reginitcolors());
2821 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2822 PL_colors[4],PL_colors[5],PL_colors[0],
2823 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2825 RExC_flags = pm->op_pmflags;
2829 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2830 RExC_seen_evals = 0;
2833 /* First pass: determine size, legality. */
2840 RExC_emit = &PL_regdummy;
2841 RExC_whilem_seen = 0;
2842 #if 0 /* REGC() is (currently) a NOP at the first pass.
2843 * Clever compilers notice this and complain. --jhi */
2844 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2846 if (reg(pRExC_state, 0, &flags) == NULL) {
2847 RExC_precomp = NULL;
2850 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2852 /* Small enough for pointer-storage convention?
2853 If extralen==0, this means that we will not need long jumps. */
2854 if (RExC_size >= 0x10000L && RExC_extralen)
2855 RExC_size += RExC_extralen;
2858 if (RExC_whilem_seen > 15)
2859 RExC_whilem_seen = 15;
2861 /* Allocate space and initialize. */
2862 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2865 FAIL("Regexp out of space");
2868 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2869 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2872 r->prelen = xend - exp;
2873 r->precomp = savepvn(RExC_precomp, r->prelen);
2875 #ifdef PERL_OLD_COPY_ON_WRITE
2876 r->saved_copy = NULL;
2878 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2879 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2880 r->lastparen = 0; /* mg.c reads this. */
2882 r->substrs = 0; /* Useful during FAIL. */
2883 r->startp = 0; /* Useful during FAIL. */
2884 r->endp = 0; /* Useful during FAIL. */
2886 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2888 r->offsets[0] = RExC_size;
2890 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2891 "%s %"UVuf" bytes for offset annotations.\n",
2892 r->offsets ? "Got" : "Couldn't get",
2893 (UV)((2*RExC_size+1) * sizeof(U32))));
2897 /* Second pass: emit code. */
2898 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2903 RExC_emit_start = r->program;
2904 RExC_emit = r->program;
2905 /* Store the count of eval-groups for security checks: */
2906 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2907 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2909 if (reg(pRExC_state, 0, &flags) == NULL)
2913 /* Dig out information for optimizations. */
2914 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2915 pm->op_pmflags = RExC_flags;
2917 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2918 r->regstclass = NULL;
2919 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2920 r->reganch |= ROPT_NAUGHTY;
2921 scan = r->program + 1; /* First BRANCH. */
2923 /* XXXX To minimize changes to RE engine we always allocate
2924 3-units-long substrs field. */
2925 Newxz(r->substrs, 1, struct reg_substr_data);
2927 StructCopy(&zero_scan_data, &data, scan_data_t);
2928 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2929 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2931 STRLEN longest_float_length, longest_fixed_length;
2932 struct regnode_charclass_class ch_class;
2937 /* Skip introductions and multiplicators >= 1. */
2938 while ((OP(first) == OPEN && (sawopen = 1)) ||
2939 /* An OR of *one* alternative - should not happen now. */
2940 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2941 (OP(first) == PLUS) ||
2942 (OP(first) == MINMOD) ||
2943 /* An {n,m} with n>0 */
2944 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2945 if (OP(first) == PLUS)
2948 first += regarglen[(U8)OP(first)];
2949 first = NEXTOPER(first);
2952 /* Starting-point info. */
2954 if (PL_regkind[(U8)OP(first)] == EXACT) {
2955 if (OP(first) == EXACT)
2956 /*EMPTY*/; /* Empty, get anchored substr later. */
2957 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2958 r->regstclass = first;
2960 else if (strchr((const char*)PL_simple,OP(first)))
2961 r->regstclass = first;
2962 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2963 PL_regkind[(U8)OP(first)] == NBOUND)
2964 r->regstclass = first;
2965 else if (PL_regkind[(U8)OP(first)] == BOL) {
2966 r->reganch |= (OP(first) == MBOL
2968 : (OP(first) == SBOL
2971 first = NEXTOPER(first);
2974 else if (OP(first) == GPOS) {
2975 r->reganch |= ROPT_ANCH_GPOS;
2976 first = NEXTOPER(first);
2979 else if (!sawopen && (OP(first) == STAR &&
2980 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2981 !(r->reganch & ROPT_ANCH) )
2983 /* turn .* into ^.* with an implied $*=1 */
2985 (OP(NEXTOPER(first)) == REG_ANY)
2988 r->reganch |= type | ROPT_IMPLICIT;
2989 first = NEXTOPER(first);
2992 if (sawplus && (!sawopen || !RExC_sawback)
2993 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
2994 /* x+ must match at the 1st pos of run of x's */
2995 r->reganch |= ROPT_SKIP;
2997 /* Scan is after the zeroth branch, first is atomic matcher. */
2998 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
2999 (IV)(first - scan + 1)));
3001 * If there's something expensive in the r.e., find the
3002 * longest literal string that must appear and make it the
3003 * regmust. Resolve ties in favor of later strings, since
3004 * the regstart check works with the beginning of the r.e.
3005 * and avoiding duplication strengthens checking. Not a
3006 * strong reason, but sufficient in the absence of others.
3007 * [Now we resolve ties in favor of the earlier string if
3008 * it happens that c_offset_min has been invalidated, since the
3009 * earlier string may buy us something the later one won't.]
3013 data.longest_fixed = newSVpvs("");
3014 data.longest_float = newSVpvs("");
3015 data.last_found = newSVpvs("");
3016 data.longest = &(data.longest_fixed);
3018 if (!r->regstclass) {
3019 cl_init(pRExC_state, &ch_class);
3020 data.start_class = &ch_class;
3021 stclass_flag = SCF_DO_STCLASS_AND;
3022 } else /* XXXX Check for BOUND? */
3024 data.last_closep = &last_close;
3026 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3027 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3028 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3029 && data.last_start_min == 0 && data.last_end > 0
3030 && !RExC_seen_zerolen
3031 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3032 r->reganch |= ROPT_CHECK_ALL;
3033 scan_commit(pRExC_state, &data);
3034 SvREFCNT_dec(data.last_found);
3036 longest_float_length = CHR_SVLEN(data.longest_float);
3037 if (longest_float_length
3038 || (data.flags & SF_FL_BEFORE_EOL
3039 && (!(data.flags & SF_FL_BEFORE_MEOL)
3040 || (RExC_flags & PMf_MULTILINE)))) {
3043 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3044 && data.offset_fixed == data.offset_float_min
3045 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3046 goto remove_float; /* As in (a)+. */
3048 if (SvUTF8(data.longest_float)) {
3049 r->float_utf8 = data.longest_float;
3050 r->float_substr = NULL;
3052 r->float_substr = data.longest_float;
3053 r->float_utf8 = NULL;
3055 r->float_min_offset = data.offset_float_min;
3056 r->float_max_offset = data.offset_float_max;
3057 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3058 && (!(data.flags & SF_FL_BEFORE_MEOL)
3059 || (RExC_flags & PMf_MULTILINE)));
3060 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3064 r->float_substr = r->float_utf8 = NULL;
3065 SvREFCNT_dec(data.longest_float);
3066 longest_float_length = 0;
3069 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3070 if (longest_fixed_length
3071 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3072 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3073 || (RExC_flags & PMf_MULTILINE)))) {
3076 if (SvUTF8(data.longest_fixed)) {
3077 r->anchored_utf8 = data.longest_fixed;
3078 r->anchored_substr = NULL;
3080 r->anchored_substr = data.longest_fixed;
3081 r->anchored_utf8 = NULL;
3083 r->anchored_offset = data.offset_fixed;
3084 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3086 || (RExC_flags & PMf_MULTILINE)));
3087 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3090 r->anchored_substr = r->anchored_utf8 = NULL;
3091 SvREFCNT_dec(data.longest_fixed);
3092 longest_fixed_length = 0;
3095 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3096 r->regstclass = NULL;
3097 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3099 && !(data.start_class->flags & ANYOF_EOS)
3100 && !cl_is_anything(data.start_class))
3102 const I32 n = add_data(pRExC_state, 1, "f");
3104 Newx(RExC_rx->data->data[n], 1,
3105 struct regnode_charclass_class);
3106 StructCopy(data.start_class,
3107 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3108 struct regnode_charclass_class);
3109 r->regstclass = (regnode*)RExC_rx->data->data[n];
3110 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3111 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3112 regprop(r, sv, (regnode*)data.start_class);
3113 PerlIO_printf(Perl_debug_log,
3114 "synthetic stclass \"%s\".\n",
3115 SvPVX_const(sv));});
3118 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3119 if (longest_fixed_length > longest_float_length) {
3120 r->check_substr = r->anchored_substr;
3121 r->check_utf8 = r->anchored_utf8;
3122 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3123 if (r->reganch & ROPT_ANCH_SINGLE)
3124 r->reganch |= ROPT_NOSCAN;
3127 r->check_substr = r->float_substr;
3128 r->check_utf8 = r->float_utf8;
3129 r->check_offset_min = data.offset_float_min;
3130 r->check_offset_max = data.offset_float_max;
3132 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3133 This should be changed ASAP! */
3134 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3135 r->reganch |= RE_USE_INTUIT;
3136 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3137 r->reganch |= RE_INTUIT_TAIL;
3141 /* Several toplevels. Best we can is to set minlen. */
3143 struct regnode_charclass_class ch_class;
3146 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3147 scan = r->program + 1;
3148 cl_init(pRExC_state, &ch_class);
3149 data.start_class = &ch_class;
3150 data.last_closep = &last_close;
3151 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3152 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3153 = r->float_substr = r->float_utf8 = NULL;
3154 if (!(data.start_class->flags & ANYOF_EOS)
3155 && !cl_is_anything(data.start_class))
3157 const I32 n = add_data(pRExC_state, 1, "f");
3159 Newx(RExC_rx->data->data[n], 1,
3160 struct regnode_charclass_class);
3161 StructCopy(data.start_class,
3162 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3163 struct regnode_charclass_class);
3164 r->regstclass = (regnode*)RExC_rx->data->data[n];
3165 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3166 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3167 regprop(r, sv, (regnode*)data.start_class);
3168 PerlIO_printf(Perl_debug_log,
3169 "synthetic stclass \"%s\".\n",
3170 SvPVX_const(sv));});
3175 if (RExC_seen & REG_SEEN_GPOS)
3176 r->reganch |= ROPT_GPOS_SEEN;
3177 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3178 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3179 if (RExC_seen & REG_SEEN_EVAL)
3180 r->reganch |= ROPT_EVAL_SEEN;
3181 if (RExC_seen & REG_SEEN_CANY)
3182 r->reganch |= ROPT_CANY_SEEN;
3183 Newxz(r->startp, RExC_npar, I32);
3184 Newxz(r->endp, RExC_npar, I32);
3185 DEBUG_COMPILE_r(regdump(r));
3190 - reg - regular expression, i.e. main body or parenthesized thing
3192 * Caller must absorb opening parenthesis.
3194 * Combining parenthesis handling with the base level of regular expression
3195 * is a trifle forced, but the need to tie the tails of the branches to what
3196 * follows makes it hard to avoid.
3199 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3200 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3203 register regnode *ret; /* Will be the head of the group. */
3204 register regnode *br;
3205 register regnode *lastbr;
3206 register regnode *ender = NULL;
3207 register I32 parno = 0;
3209 const I32 oregflags = RExC_flags;
3210 bool have_branch = 0;
3213 /* for (?g), (?gc), and (?o) warnings; warning
3214 about (?c) will warn about (?g) -- japhy */
3216 #define WASTED_O 0x01
3217 #define WASTED_G 0x02
3218 #define WASTED_C 0x04
3219 #define WASTED_GC (0x02|0x04)
3220 I32 wastedflags = 0x00;
3222 char * parse_start = RExC_parse; /* MJD */
3223 char * const oregcomp_parse = RExC_parse;
3225 *flagp = 0; /* Tentatively. */
3228 /* Make an OPEN node, if parenthesized. */
3230 if (*RExC_parse == '?') { /* (?...) */
3231 U32 posflags = 0, negflags = 0;
3232 U32 *flagsp = &posflags;
3233 bool is_logical = 0;
3234 const char * const seqstart = RExC_parse;
3237 paren = *RExC_parse++;
3238 ret = NULL; /* For look-ahead/behind. */
3240 case '<': /* (?<...) */
3241 RExC_seen |= REG_SEEN_LOOKBEHIND;
3242 if (*RExC_parse == '!')
3244 if (*RExC_parse != '=' && *RExC_parse != '!')
3247 case '=': /* (?=...) */
3248 case '!': /* (?!...) */
3249 RExC_seen_zerolen++;
3250 case ':': /* (?:...) */
3251 case '>': /* (?>...) */
3253 case '$': /* (?$...) */
3254 case '@': /* (?@...) */
3255 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3257 case '#': /* (?#...) */
3258 while (*RExC_parse && *RExC_parse != ')')
3260 if (*RExC_parse != ')')
3261 FAIL("Sequence (?#... not terminated");
3262 nextchar(pRExC_state);
3265 case 'p': /* (?p...) */
3266 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3267 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3269 case '?': /* (??...) */
3271 if (*RExC_parse != '{')
3273 paren = *RExC_parse++;
3275 case '{': /* (?{...}) */
3277 I32 count = 1, n = 0;
3279 char *s = RExC_parse;
3281 RExC_seen_zerolen++;
3282 RExC_seen |= REG_SEEN_EVAL;
3283 while (count && (c = *RExC_parse)) {
3294 if (*RExC_parse != ')') {
3296 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3300 OP_4tree *sop, *rop;
3301 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3304 Perl_save_re_context(aTHX);
3305 rop = sv_compile_2op(sv, &sop, "re", &pad);
3306 sop->op_private |= OPpREFCOUNTED;
3307 /* re_dup will OpREFCNT_inc */
3308 OpREFCNT_set(sop, 1);
3311 n = add_data(pRExC_state, 3, "nop");
3312 RExC_rx->data->data[n] = (void*)rop;
3313 RExC_rx->data->data[n+1] = (void*)sop;
3314 RExC_rx->data->data[n+2] = (void*)pad;
3317 else { /* First pass */
3318 if (PL_reginterp_cnt < ++RExC_seen_evals
3320 /* No compiled RE interpolated, has runtime
3321 components ===> unsafe. */
3322 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3323 if (PL_tainting && PL_tainted)
3324 FAIL("Eval-group in insecure regular expression");
3325 if (IN_PERL_COMPILETIME)
3329 nextchar(pRExC_state);
3331 ret = reg_node(pRExC_state, LOGICAL);
3334 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3335 /* deal with the length of this later - MJD */
3338 ret = reganode(pRExC_state, EVAL, n);
3339 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3340 Set_Node_Offset(ret, parse_start);
3343 case '(': /* (?(?{...})...) and (?(?=...)...) */
3345 if (RExC_parse[0] == '?') { /* (?(?...)) */
3346 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3347 || RExC_parse[1] == '<'
3348 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3351 ret = reg_node(pRExC_state, LOGICAL);
3354 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3358 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3361 parno = atoi(RExC_parse++);
3363 while (isDIGIT(*RExC_parse))
3365 ret = reganode(pRExC_state, GROUPP, parno);
3367 if ((c = *nextchar(pRExC_state)) != ')')
3368 vFAIL("Switch condition not recognized");
3370 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3371 br = regbranch(pRExC_state, &flags, 1);
3373 br = reganode(pRExC_state, LONGJMP, 0);
3375 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3376 c = *nextchar(pRExC_state);
3380 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3381 regbranch(pRExC_state, &flags, 1);
3382 regtail(pRExC_state, ret, lastbr);
3385 c = *nextchar(pRExC_state);
3390 vFAIL("Switch (?(condition)... contains too many branches");
3391 ender = reg_node(pRExC_state, TAIL);
3392 regtail(pRExC_state, br, ender);
3394 regtail(pRExC_state, lastbr, ender);
3395 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3398 regtail(pRExC_state, ret, ender);
3402 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3406 RExC_parse--; /* for vFAIL to print correctly */
3407 vFAIL("Sequence (? incomplete");
3411 parse_flags: /* (?i) */
3412 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3413 /* (?g), (?gc) and (?o) are useless here
3414 and must be globally applied -- japhy */
3416 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3417 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3418 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3419 if (! (wastedflags & wflagbit) ) {
3420 wastedflags |= wflagbit;
3423 "Useless (%s%c) - %suse /%c modifier",
3424 flagsp == &negflags ? "?-" : "?",
3426 flagsp == &negflags ? "don't " : "",
3432 else if (*RExC_parse == 'c') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3434 if (! (wastedflags & WASTED_C) ) {
3435 wastedflags |= WASTED_GC;
3438 "Useless (%sc) - %suse /gc modifier",
3439 flagsp == &negflags ? "?-" : "?",
3440 flagsp == &negflags ? "don't " : ""
3445 else { pmflag(flagsp, *RExC_parse); }
3449 if (*RExC_parse == '-') {
3451 wastedflags = 0; /* reset so (?g-c) warns twice */
3455 RExC_flags |= posflags;
3456 RExC_flags &= ~negflags;
3457 if (*RExC_parse == ':') {
3463 if (*RExC_parse != ')') {
3465 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3467 nextchar(pRExC_state);
3475 ret = reganode(pRExC_state, OPEN, parno);
3476 Set_Node_Length(ret, 1); /* MJD */
3477 Set_Node_Offset(ret, RExC_parse); /* MJD */
3484 /* Pick up the branches, linking them together. */
3485 parse_start = RExC_parse; /* MJD */
3486 br = regbranch(pRExC_state, &flags, 1);
3487 /* branch_len = (paren != 0); */
3491 if (*RExC_parse == '|') {
3492 if (!SIZE_ONLY && RExC_extralen) {
3493 reginsert(pRExC_state, BRANCHJ, br);
3496 reginsert(pRExC_state, BRANCH, br);
3497 Set_Node_Length(br, paren != 0);
3498 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3502 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3504 else if (paren == ':') {
3505 *flagp |= flags&SIMPLE;
3507 if (is_open) { /* Starts with OPEN. */
3508 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3510 else if (paren != '?') /* Not Conditional */
3512 *flagp |= flags & (SPSTART | HASWIDTH);
3514 while (*RExC_parse == '|') {
3515 if (!SIZE_ONLY && RExC_extralen) {
3516 ender = reganode(pRExC_state, LONGJMP,0);
3517 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3520 RExC_extralen += 2; /* Account for LONGJMP. */
3521 nextchar(pRExC_state);
3522 br = regbranch(pRExC_state, &flags, 0);
3526 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3530 *flagp |= flags&SPSTART;
3533 if (have_branch || paren != ':') {
3534 /* Make a closing node, and hook it on the end. */
3537 ender = reg_node(pRExC_state, TAIL);
3540 ender = reganode(pRExC_state, CLOSE, parno);
3541 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3542 Set_Node_Length(ender,1); /* MJD */
3548 *flagp &= ~HASWIDTH;
3551 ender = reg_node(pRExC_state, SUCCEED);
3554 ender = reg_node(pRExC_state, END);
3557 regtail(pRExC_state, lastbr, ender);
3560 /* Hook the tails of the branches to the closing node. */
3561 for (br = ret; br != NULL; br = regnext(br)) {
3562 regoptail(pRExC_state, br, ender);
3569 static const char parens[] = "=!<,>";
3571 if (paren && (p = strchr(parens, paren))) {
3572 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3573 int flag = (p - parens) > 1;
3576 node = SUSPEND, flag = 0;
3577 reginsert(pRExC_state, node,ret);
3578 Set_Node_Cur_Length(ret);
3579 Set_Node_Offset(ret, parse_start + 1);
3581 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3585 /* Check for proper termination. */
3587 RExC_flags = oregflags;
3588 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3589 RExC_parse = oregcomp_parse;
3590 vFAIL("Unmatched (");
3593 else if (!paren && RExC_parse < RExC_end) {
3594 if (*RExC_parse == ')') {
3596 vFAIL("Unmatched )");
3599 FAIL("Junk on end of regexp"); /* "Can't happen". */
3607 - regbranch - one alternative of an | operator
3609 * Implements the concatenation operator.
3612 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3615 register regnode *ret;
3616 register regnode *chain = NULL;
3617 register regnode *latest;
3618 I32 flags = 0, c = 0;
3623 if (!SIZE_ONLY && RExC_extralen)
3624 ret = reganode(pRExC_state, BRANCHJ,0);
3626 ret = reg_node(pRExC_state, BRANCH);
3627 Set_Node_Length(ret, 1);
3631 if (!first && SIZE_ONLY)
3632 RExC_extralen += 1; /* BRANCHJ */
3634 *flagp = WORST; /* Tentatively. */
3637 nextchar(pRExC_state);
3638 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3640 latest = regpiece(pRExC_state, &flags);
3641 if (latest == NULL) {
3642 if (flags & TRYAGAIN)
3646 else if (ret == NULL)
3648 *flagp |= flags&HASWIDTH;
3649 if (chain == NULL) /* First piece. */
3650 *flagp |= flags&SPSTART;
3653 regtail(pRExC_state, chain, latest);
3658 if (chain == NULL) { /* Loop ran zero times. */
3659 chain = reg_node(pRExC_state, NOTHING);
3664 *flagp |= flags&SIMPLE;
3671 - regpiece - something followed by possible [*+?]
3673 * Note that the branching code sequences used for ? and the general cases
3674 * of * and + are somewhat optimized: they use the same NOTHING node as
3675 * both the endmarker for their branch list and the body of the last branch.
3676 * It might seem that this node could be dispensed with entirely, but the
3677 * endmarker role is not redundant.
3680 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3683 register regnode *ret;
3685 register char *next;
3687 const char * const origparse = RExC_parse;
3690 I32 max = REG_INFTY;
3693 ret = regatom(pRExC_state, &flags);
3695 if (flags & TRYAGAIN)
3702 if (op == '{' && regcurly(RExC_parse)) {
3703 parse_start = RExC_parse; /* MJD */
3704 next = RExC_parse + 1;
3706 while (isDIGIT(*next) || *next == ',') {
3715 if (*next == '}') { /* got one */
3719 min = atoi(RExC_parse);
3723 maxpos = RExC_parse;
3725 if (!max && *maxpos != '0')
3726 max = REG_INFTY; /* meaning "infinity" */
3727 else if (max >= REG_INFTY)
3728 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3730 nextchar(pRExC_state);
3733 if ((flags&SIMPLE)) {
3734 RExC_naughty += 2 + RExC_naughty / 2;
3735 reginsert(pRExC_state, CURLY, ret);
3736 Set_Node_Offset(ret, parse_start+1); /* MJD */
3737 Set_Node_Cur_Length(ret);
3740 regnode *w = reg_node(pRExC_state, WHILEM);
3743 regtail(pRExC_state, ret, w);
3744 if (!SIZE_ONLY && RExC_extralen) {
3745 reginsert(pRExC_state, LONGJMP,ret);
3746 reginsert(pRExC_state, NOTHING,ret);
3747 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3749 reginsert(pRExC_state, CURLYX,ret);
3751 Set_Node_Offset(ret, parse_start+1);
3752 Set_Node_Length(ret,
3753 op == '{' ? (RExC_parse - parse_start) : 1);
3755 if (!SIZE_ONLY && RExC_extralen)
3756 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3757 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3759 RExC_whilem_seen++, RExC_extralen += 3;
3760 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3768 if (max && max < min)
3769 vFAIL("Can't do {n,m} with n > m");
3771 ARG1_SET(ret, (U16)min);
3772 ARG2_SET(ret, (U16)max);
3784 #if 0 /* Now runtime fix should be reliable. */
3786 /* if this is reinstated, don't forget to put this back into perldiag:
3788 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3790 (F) The part of the regexp subject to either the * or + quantifier
3791 could match an empty string. The {#} shows in the regular
3792 expression about where the problem was discovered.
3796 if (!(flags&HASWIDTH) && op != '?')
3797 vFAIL("Regexp *+ operand could be empty");
3800 parse_start = RExC_parse;
3801 nextchar(pRExC_state);
3803 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3805 if (op == '*' && (flags&SIMPLE)) {
3806 reginsert(pRExC_state, STAR, ret);
3810 else if (op == '*') {
3814 else if (op == '+' && (flags&SIMPLE)) {
3815 reginsert(pRExC_state, PLUS, ret);
3819 else if (op == '+') {
3823 else if (op == '?') {
3828 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3830 "%.*s matches null string many times",
3831 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
3835 if (*RExC_parse == '?') {
3836 nextchar(pRExC_state);
3837 reginsert(pRExC_state, MINMOD, ret);
3838 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3840 if (ISMULT2(RExC_parse)) {
3842 vFAIL("Nested quantifiers");
3849 - regatom - the lowest level
3851 * Optimization: gobbles an entire sequence of ordinary characters so that
3852 * it can turn them into a single node, which is smaller to store and
3853 * faster to run. Backslashed characters are exceptions, each becoming a
3854 * separate node; the code is simpler that way and it's not worth fixing.
3856 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3858 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3861 register regnode *ret = NULL;
3863 char *parse_start = RExC_parse;
3865 *flagp = WORST; /* Tentatively. */
3868 switch (*RExC_parse) {
3870 RExC_seen_zerolen++;
3871 nextchar(pRExC_state);
3872 if (RExC_flags & PMf_MULTILINE)
3873 ret = reg_node(pRExC_state, MBOL);
3874 else if (RExC_flags & PMf_SINGLELINE)
3875 ret = reg_node(pRExC_state, SBOL);
3877 ret = reg_node(pRExC_state, BOL);
3878 Set_Node_Length(ret, 1); /* MJD */
3881 nextchar(pRExC_state);
3883 RExC_seen_zerolen++;
3884 if (RExC_flags & PMf_MULTILINE)
3885 ret = reg_node(pRExC_state, MEOL);
3886 else if (RExC_flags & PMf_SINGLELINE)
3887 ret = reg_node(pRExC_state, SEOL);
3889 ret = reg_node(pRExC_state, EOL);
3890 Set_Node_Length(ret, 1); /* MJD */
3893 nextchar(pRExC_state);
3894 if (RExC_flags & PMf_SINGLELINE)
3895 ret = reg_node(pRExC_state, SANY);
3897 ret = reg_node(pRExC_state, REG_ANY);
3898 *flagp |= HASWIDTH|SIMPLE;
3900 Set_Node_Length(ret, 1); /* MJD */
3904 char *oregcomp_parse = ++RExC_parse;
3905 ret = regclass(pRExC_state);
3906 if (*RExC_parse != ']') {
3907 RExC_parse = oregcomp_parse;
3908 vFAIL("Unmatched [");
3910 nextchar(pRExC_state);
3911 *flagp |= HASWIDTH|SIMPLE;
3912 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3916 nextchar(pRExC_state);
3917 ret = reg(pRExC_state, 1, &flags);
3919 if (flags & TRYAGAIN) {
3920 if (RExC_parse == RExC_end) {
3921 /* Make parent create an empty node if needed. */
3929 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3933 if (flags & TRYAGAIN) {
3937 vFAIL("Internal urp");
3938 /* Supposed to be caught earlier. */
3941 if (!regcurly(RExC_parse)) {
3950 vFAIL("Quantifier follows nothing");
3953 switch (*++RExC_parse) {
3955 RExC_seen_zerolen++;
3956 ret = reg_node(pRExC_state, SBOL);
3958 nextchar(pRExC_state);
3959 Set_Node_Length(ret, 2); /* MJD */
3962 ret = reg_node(pRExC_state, GPOS);
3963 RExC_seen |= REG_SEEN_GPOS;
3965 nextchar(pRExC_state);
3966 Set_Node_Length(ret, 2); /* MJD */
3969 ret = reg_node(pRExC_state, SEOL);
3971 RExC_seen_zerolen++; /* Do not optimize RE away */
3972 nextchar(pRExC_state);
3975 ret = reg_node(pRExC_state, EOS);
3977 RExC_seen_zerolen++; /* Do not optimize RE away */
3978 nextchar(pRExC_state);
3979 Set_Node_Length(ret, 2); /* MJD */
3982 ret = reg_node(pRExC_state, CANY);
3983 RExC_seen |= REG_SEEN_CANY;
3984 *flagp |= HASWIDTH|SIMPLE;
3985 nextchar(pRExC_state);
3986 Set_Node_Length(ret, 2); /* MJD */
3989 ret = reg_node(pRExC_state, CLUMP);
3991 nextchar(pRExC_state);
3992 Set_Node_Length(ret, 2); /* MJD */
3995 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
3996 *flagp |= HASWIDTH|SIMPLE;
3997 nextchar(pRExC_state);
3998 Set_Node_Length(ret, 2); /* MJD */
4001 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4002 *flagp |= HASWIDTH|SIMPLE;
4003 nextchar(pRExC_state);
4004 Set_Node_Length(ret, 2); /* MJD */
4007 RExC_seen_zerolen++;
4008 RExC_seen |= REG_SEEN_LOOKBEHIND;
4009 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4011 nextchar(pRExC_state);
4012 Set_Node_Length(ret, 2); /* MJD */
4015 RExC_seen_zerolen++;
4016 RExC_seen |= REG_SEEN_LOOKBEHIND;
4017 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4019 nextchar(pRExC_state);
4020 Set_Node_Length(ret, 2); /* MJD */
4023 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4024 *flagp |= HASWIDTH|SIMPLE;
4025 nextchar(pRExC_state);
4026 Set_Node_Length(ret, 2); /* MJD */
4029 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4030 *flagp |= HASWIDTH|SIMPLE;
4031 nextchar(pRExC_state);
4032 Set_Node_Length(ret, 2); /* MJD */
4035 ret = reg_node(pRExC_state, DIGIT);
4036 *flagp |= HASWIDTH|SIMPLE;
4037 nextchar(pRExC_state);
4038 Set_Node_Length(ret, 2); /* MJD */
4041 ret = reg_node(pRExC_state, NDIGIT);
4042 *flagp |= HASWIDTH|SIMPLE;
4043 nextchar(pRExC_state);
4044 Set_Node_Length(ret, 2); /* MJD */
4049 char* oldregxend = RExC_end;
4050 char* parse_start = RExC_parse - 2;
4052 if (RExC_parse[1] == '{') {
4053 /* a lovely hack--pretend we saw [\pX] instead */
4054 RExC_end = strchr(RExC_parse, '}');
4056 U8 c = (U8)*RExC_parse;
4058 RExC_end = oldregxend;
4059 vFAIL2("Missing right brace on \\%c{}", c);
4064 RExC_end = RExC_parse + 2;
4065 if (RExC_end > oldregxend)
4066 RExC_end = oldregxend;
4070 ret = regclass(pRExC_state);
4072 RExC_end = oldregxend;
4075 Set_Node_Offset(ret, parse_start + 2);
4076 Set_Node_Cur_Length(ret);
4077 nextchar(pRExC_state);
4078 *flagp |= HASWIDTH|SIMPLE;
4091 case '1': case '2': case '3': case '4':
4092 case '5': case '6': case '7': case '8': case '9':
4094 const I32 num = atoi(RExC_parse);
4096 if (num > 9 && num >= RExC_npar)
4099 char * parse_start = RExC_parse - 1; /* MJD */
4100 while (isDIGIT(*RExC_parse))
4103 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4104 vFAIL("Reference to nonexistent group");
4106 ret = reganode(pRExC_state,
4107 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4111 /* override incorrect value set in reganode MJD */
4112 Set_Node_Offset(ret, parse_start+1);
4113 Set_Node_Cur_Length(ret); /* MJD */
4115 nextchar(pRExC_state);
4120 if (RExC_parse >= RExC_end)
4121 FAIL("Trailing \\");
4124 /* Do not generate "unrecognized" warnings here, we fall
4125 back into the quick-grab loop below */
4132 if (RExC_flags & PMf_EXTENDED) {
4133 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4134 if (RExC_parse < RExC_end)
4140 register STRLEN len;
4145 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4147 parse_start = RExC_parse - 1;
4153 ret = reg_node(pRExC_state,
4154 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4156 for (len = 0, p = RExC_parse - 1;
4157 len < 127 && p < RExC_end;
4162 if (RExC_flags & PMf_EXTENDED)
4163 p = regwhite(p, RExC_end);
4210 ender = ASCII_TO_NATIVE('\033');
4214 ender = ASCII_TO_NATIVE('\007');
4219 char* const e = strchr(p, '}');
4223 vFAIL("Missing right brace on \\x{}");
4226 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4227 | PERL_SCAN_DISALLOW_PREFIX;
4228 STRLEN numlen = e - p - 1;
4229 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4236 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4238 ender = grok_hex(p, &numlen, &flags, NULL);
4244 ender = UCHARAT(p++);
4245 ender = toCTRL(ender);
4247 case '0': case '1': case '2': case '3':case '4':
4248 case '5': case '6': case '7': case '8':case '9':
4250 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4253 ender = grok_oct(p, &numlen, &flags, NULL);
4263 FAIL("Trailing \\");
4266 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4267 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4268 goto normal_default;
4273 if (UTF8_IS_START(*p) && UTF) {
4275 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4276 &numlen, UTF8_ALLOW_DEFAULT);
4283 if (RExC_flags & PMf_EXTENDED)
4284 p = regwhite(p, RExC_end);
4286 /* Prime the casefolded buffer. */
4287 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4289 if (ISMULT2(p)) { /* Back off on ?+*. */
4296 /* Emit all the Unicode characters. */
4298 for (foldbuf = tmpbuf;
4300 foldlen -= numlen) {
4301 ender = utf8_to_uvchr(foldbuf, &numlen);
4303 reguni(pRExC_state, ender, s, &unilen);
4306 /* In EBCDIC the numlen
4307 * and unilen can differ. */
4309 if (numlen >= foldlen)
4313 break; /* "Can't happen." */
4317 reguni(pRExC_state, ender, s, &unilen);
4326 REGC((char)ender, s++);
4334 /* Emit all the Unicode characters. */
4336 for (foldbuf = tmpbuf;
4338 foldlen -= numlen) {
4339 ender = utf8_to_uvchr(foldbuf, &numlen);
4341 reguni(pRExC_state, ender, s, &unilen);
4344 /* In EBCDIC the numlen
4345 * and unilen can differ. */
4347 if (numlen >= foldlen)
4355 reguni(pRExC_state, ender, s, &unilen);
4364 REGC((char)ender, s++);
4368 Set_Node_Cur_Length(ret); /* MJD */
4369 nextchar(pRExC_state);
4371 /* len is STRLEN which is unsigned, need to copy to signed */
4374 vFAIL("Internal disaster");
4378 if (len == 1 && UNI_IS_INVARIANT(ender))
4383 RExC_size += STR_SZ(len);
4385 RExC_emit += STR_SZ(len);
4390 /* If the encoding pragma is in effect recode the text of
4391 * any EXACT-kind nodes. */
4392 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4393 STRLEN oldlen = STR_LEN(ret);
4394 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4398 if (sv_utf8_downgrade(sv, TRUE)) {
4399 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4400 const STRLEN newlen = SvCUR(sv);
4405 GET_RE_DEBUG_FLAGS_DECL;
4406 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4407 (int)oldlen, STRING(ret),
4409 Copy(s, STRING(ret), newlen, char);
4410 STR_LEN(ret) += newlen - oldlen;
4411 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4413 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4421 S_regwhite(char *p, const char *e)
4426 else if (*p == '#') {
4429 } while (p < e && *p != '\n');
4437 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4438 Character classes ([:foo:]) can also be negated ([:^foo:]).
4439 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4440 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4441 but trigger failures because they are currently unimplemented. */
4443 #define POSIXCC_DONE(c) ((c) == ':')
4444 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4445 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4448 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4451 I32 namedclass = OOB_NAMEDCLASS;
4453 if (value == '[' && RExC_parse + 1 < RExC_end &&
4454 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4455 POSIXCC(UCHARAT(RExC_parse))) {
4456 const char c = UCHARAT(RExC_parse);
4457 char* const s = RExC_parse++;
4459 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4461 if (RExC_parse == RExC_end)
4462 /* Grandfather lone [:, [=, [. */
4465 const char* t = RExC_parse++; /* skip over the c */
4466 const char *posixcc;
4470 if (UCHARAT(RExC_parse) == ']') {
4471 RExC_parse++; /* skip over the ending ] */
4474 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4475 const I32 skip = t - posixcc;
4477 /* Initially switch on the length of the name. */
4480 if (memEQ(posixcc, "word", 4)) {
4481 /* this is not POSIX, this is the Perl \w */;
4483 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4487 /* Names all of length 5. */
4488 /* alnum alpha ascii blank cntrl digit graph lower
4489 print punct space upper */
4490 /* Offset 4 gives the best switch position. */
4491 switch (posixcc[4]) {
4493 if (memEQ(posixcc, "alph", 4)) {
4496 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4500 if (memEQ(posixcc, "spac", 4)) {
4503 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4507 if (memEQ(posixcc, "grap", 4)) {
4510 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4514 if (memEQ(posixcc, "asci", 4)) {
4517 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4521 if (memEQ(posixcc, "blan", 4)) {
4524 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4528 if (memEQ(posixcc, "cntr", 4)) {
4531 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4535 if (memEQ(posixcc, "alnu", 4)) {
4538 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4542 if (memEQ(posixcc, "lowe", 4)) {
4545 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4547 if (memEQ(posixcc, "uppe", 4)) {
4550 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4554 if (memEQ(posixcc, "digi", 4)) {
4557 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4559 if (memEQ(posixcc, "prin", 4)) {
4562 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4564 if (memEQ(posixcc, "punc", 4)) {
4567 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4573 if (memEQ(posixcc, "xdigit", 6)) {
4575 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4580 if (namedclass == OOB_NAMEDCLASS)
4582 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4585 assert (posixcc[skip] == ':');
4586 assert (posixcc[skip+1] == ']');
4587 } else if (!SIZE_ONLY) {
4588 /* [[=foo=]] and [[.foo.]] are still future. */
4590 /* adjust RExC_parse so the warning shows after
4592 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4594 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4597 /* Maternal grandfather:
4598 * "[:" ending in ":" but not in ":]" */
4608 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4611 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4612 const char *s = RExC_parse;
4613 const char c = *s++;
4615 while(*s && isALNUM(*s))
4617 if (*s && c == *s && s[1] == ']') {
4618 if (ckWARN(WARN_REGEXP))
4620 "POSIX syntax [%c %c] belongs inside character classes",
4623 /* [[=foo=]] and [[.foo.]] are still future. */
4624 if (POSIXCC_NOTYET(c)) {
4625 /* adjust RExC_parse so the error shows after
4627 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4629 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4636 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4640 register UV nextvalue;
4641 register IV prevvalue = OOB_UNICODE;
4642 register IV range = 0;
4643 register regnode *ret;
4646 char *rangebegin = NULL;
4647 bool need_class = 0;
4651 bool optimize_invert = TRUE;
4652 AV* unicode_alternate = NULL;
4654 UV literal_endpoint = 0;
4657 ret = reganode(pRExC_state, ANYOF, 0);
4660 ANYOF_FLAGS(ret) = 0;
4662 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4666 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4670 RExC_size += ANYOF_SKIP;
4671 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4674 RExC_emit += ANYOF_SKIP;
4676 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4678 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4679 ANYOF_BITMAP_ZERO(ret);
4680 listsv = newSVpvs("# comment\n");
4683 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4685 if (!SIZE_ONLY && POSIXCC(nextvalue))
4686 checkposixcc(pRExC_state);
4688 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4689 if (UCHARAT(RExC_parse) == ']')
4692 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4696 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4699 rangebegin = RExC_parse;
4701 value = utf8n_to_uvchr((U8*)RExC_parse,
4702 RExC_end - RExC_parse,
4703 &numlen, UTF8_ALLOW_DEFAULT);
4704 RExC_parse += numlen;
4707 value = UCHARAT(RExC_parse++);
4708 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4709 if (value == '[' && POSIXCC(nextvalue))
4710 namedclass = regpposixcc(pRExC_state, value);
4711 else if (value == '\\') {
4713 value = utf8n_to_uvchr((U8*)RExC_parse,
4714 RExC_end - RExC_parse,
4715 &numlen, UTF8_ALLOW_DEFAULT);
4716 RExC_parse += numlen;
4719 value = UCHARAT(RExC_parse++);
4720 /* Some compilers cannot handle switching on 64-bit integer
4721 * values, therefore value cannot be an UV. Yes, this will
4722 * be a problem later if we want switch on Unicode.
4723 * A similar issue a little bit later when switching on
4724 * namedclass. --jhi */
4725 switch ((I32)value) {
4726 case 'w': namedclass = ANYOF_ALNUM; break;
4727 case 'W': namedclass = ANYOF_NALNUM; break;
4728 case 's': namedclass = ANYOF_SPACE; break;
4729 case 'S': namedclass = ANYOF_NSPACE; break;
4730 case 'd': namedclass = ANYOF_DIGIT; break;
4731 case 'D': namedclass = ANYOF_NDIGIT; break;
4734 if (RExC_parse >= RExC_end)
4735 vFAIL2("Empty \\%c{}", (U8)value);
4736 if (*RExC_parse == '{') {
4737 const U8 c = (U8)value;
4738 e = strchr(RExC_parse++, '}');
4740 vFAIL2("Missing right brace on \\%c{}", c);
4741 while (isSPACE(UCHARAT(RExC_parse)))
4743 if (e == RExC_parse)
4744 vFAIL2("Empty \\%c{}", c);
4746 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4754 if (UCHARAT(RExC_parse) == '^') {
4757 value = value == 'p' ? 'P' : 'p'; /* toggle */
4758 while (isSPACE(UCHARAT(RExC_parse))) {
4763 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
4764 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
4767 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4768 namedclass = ANYOF_MAX; /* no official name, but it's named */
4770 case 'n': value = '\n'; break;
4771 case 'r': value = '\r'; break;
4772 case 't': value = '\t'; break;
4773 case 'f': value = '\f'; break;
4774 case 'b': value = '\b'; break;
4775 case 'e': value = ASCII_TO_NATIVE('\033');break;
4776 case 'a': value = ASCII_TO_NATIVE('\007');break;
4778 if (*RExC_parse == '{') {
4779 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4780 | PERL_SCAN_DISALLOW_PREFIX;
4781 e = strchr(RExC_parse++, '}');
4783 vFAIL("Missing right brace on \\x{}");
4785 numlen = e - RExC_parse;
4786 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4790 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4792 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4793 RExC_parse += numlen;
4797 value = UCHARAT(RExC_parse++);
4798 value = toCTRL(value);
4800 case '0': case '1': case '2': case '3': case '4':
4801 case '5': case '6': case '7': case '8': case '9':
4805 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4806 RExC_parse += numlen;
4810 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4812 "Unrecognized escape \\%c in character class passed through",
4816 } /* end of \blah */
4822 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4824 if (!SIZE_ONLY && !need_class)
4825 ANYOF_CLASS_ZERO(ret);
4829 /* a bad range like a-\d, a-[:digit:] ? */
4832 if (ckWARN(WARN_REGEXP)) {
4834 RExC_parse >= rangebegin ?
4835 RExC_parse - rangebegin : 0;
4837 "False [] range \"%*.*s\"",
4840 if (prevvalue < 256) {
4841 ANYOF_BITMAP_SET(ret, prevvalue);
4842 ANYOF_BITMAP_SET(ret, '-');
4845 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4846 Perl_sv_catpvf(aTHX_ listsv,
4847 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4851 range = 0; /* this was not a true range */
4855 const char *what = NULL;
4858 if (namedclass > OOB_NAMEDCLASS)
4859 optimize_invert = FALSE;
4860 /* Possible truncation here but in some 64-bit environments
4861 * the compiler gets heartburn about switch on 64-bit values.
4862 * A similar issue a little earlier when switching on value.
4864 switch ((I32)namedclass) {
4867 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4869 for (value = 0; value < 256; value++)
4871 ANYOF_BITMAP_SET(ret, value);
4878 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4880 for (value = 0; value < 256; value++)
4881 if (!isALNUM(value))
4882 ANYOF_BITMAP_SET(ret, value);
4889 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4891 for (value = 0; value < 256; value++)
4892 if (isALNUMC(value))
4893 ANYOF_BITMAP_SET(ret, value);
4900 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4902 for (value = 0; value < 256; value++)
4903 if (!isALNUMC(value))
4904 ANYOF_BITMAP_SET(ret, value);
4911 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4913 for (value = 0; value < 256; value++)
4915 ANYOF_BITMAP_SET(ret, value);
4922 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4924 for (value = 0; value < 256; value++)
4925 if (!isALPHA(value))
4926 ANYOF_BITMAP_SET(ret, value);
4933 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4936 for (value = 0; value < 128; value++)
4937 ANYOF_BITMAP_SET(ret, value);
4939 for (value = 0; value < 256; value++) {
4941 ANYOF_BITMAP_SET(ret, value);
4950 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4953 for (value = 128; value < 256; value++)
4954 ANYOF_BITMAP_SET(ret, value);
4956 for (value = 0; value < 256; value++) {
4957 if (!isASCII(value))
4958 ANYOF_BITMAP_SET(ret, value);
4967 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4969 for (value = 0; value < 256; value++)
4971 ANYOF_BITMAP_SET(ret, value);
4978 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4980 for (value = 0; value < 256; value++)
4981 if (!isBLANK(value))
4982 ANYOF_BITMAP_SET(ret, value);
4989 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
4991 for (value = 0; value < 256; value++)
4993 ANYOF_BITMAP_SET(ret, value);
5000 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5002 for (value = 0; value < 256; value++)
5003 if (!isCNTRL(value))
5004 ANYOF_BITMAP_SET(ret, value);
5011 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5013 /* consecutive digits assumed */
5014 for (value = '0'; value <= '9'; value++)
5015 ANYOF_BITMAP_SET(ret, value);
5022 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5024 /* consecutive digits assumed */
5025 for (value = 0; value < '0'; value++)
5026 ANYOF_BITMAP_SET(ret, value);
5027 for (value = '9' + 1; value < 256; value++)
5028 ANYOF_BITMAP_SET(ret, value);
5035 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5037 for (value = 0; value < 256; value++)
5039 ANYOF_BITMAP_SET(ret, value);
5046 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5048 for (value = 0; value < 256; value++)
5049 if (!isGRAPH(value))
5050 ANYOF_BITMAP_SET(ret, value);
5057 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5059 for (value = 0; value < 256; value++)
5061 ANYOF_BITMAP_SET(ret, value);
5068 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5070 for (value = 0; value < 256; value++)
5071 if (!isLOWER(value))
5072 ANYOF_BITMAP_SET(ret, value);
5079 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5081 for (value = 0; value < 256; value++)
5083 ANYOF_BITMAP_SET(ret, value);
5090 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5092 for (value = 0; value < 256; value++)
5093 if (!isPRINT(value))
5094 ANYOF_BITMAP_SET(ret, value);
5101 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5103 for (value = 0; value < 256; value++)
5104 if (isPSXSPC(value))
5105 ANYOF_BITMAP_SET(ret, value);
5112 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5114 for (value = 0; value < 256; value++)
5115 if (!isPSXSPC(value))
5116 ANYOF_BITMAP_SET(ret, value);
5123 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5125 for (value = 0; value < 256; value++)
5127 ANYOF_BITMAP_SET(ret, value);
5134 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5136 for (value = 0; value < 256; value++)
5137 if (!isPUNCT(value))
5138 ANYOF_BITMAP_SET(ret, value);
5145 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5147 for (value = 0; value < 256; value++)
5149 ANYOF_BITMAP_SET(ret, value);
5156 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5158 for (value = 0; value < 256; value++)
5159 if (!isSPACE(value))
5160 ANYOF_BITMAP_SET(ret, value);
5167 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5169 for (value = 0; value < 256; value++)
5171 ANYOF_BITMAP_SET(ret, value);
5178 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5180 for (value = 0; value < 256; value++)
5181 if (!isUPPER(value))
5182 ANYOF_BITMAP_SET(ret, value);
5189 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5191 for (value = 0; value < 256; value++)
5192 if (isXDIGIT(value))
5193 ANYOF_BITMAP_SET(ret, value);
5200 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5202 for (value = 0; value < 256; value++)
5203 if (!isXDIGIT(value))
5204 ANYOF_BITMAP_SET(ret, value);
5210 /* this is to handle \p and \P */
5213 vFAIL("Invalid [::] class");
5217 /* Strings such as "+utf8::isWord\n" */
5218 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5221 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5224 } /* end of namedclass \blah */
5227 if (prevvalue > (IV)value) /* b-a */ {
5228 const int w = RExC_parse - rangebegin;
5229 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5230 range = 0; /* not a valid range */
5234 prevvalue = value; /* save the beginning of the range */
5235 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5236 RExC_parse[1] != ']') {
5239 /* a bad range like \w-, [:word:]- ? */
5240 if (namedclass > OOB_NAMEDCLASS) {
5241 if (ckWARN(WARN_REGEXP)) {
5243 RExC_parse >= rangebegin ?
5244 RExC_parse - rangebegin : 0;
5246 "False [] range \"%*.*s\"",
5250 ANYOF_BITMAP_SET(ret, '-');
5252 range = 1; /* yeah, it's a range! */
5253 continue; /* but do it the next time */
5257 /* now is the next time */
5261 if (prevvalue < 256) {
5262 const IV ceilvalue = value < 256 ? value : 255;
5265 /* In EBCDIC [\x89-\x91] should include
5266 * the \x8e but [i-j] should not. */
5267 if (literal_endpoint == 2 &&
5268 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5269 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5271 if (isLOWER(prevvalue)) {
5272 for (i = prevvalue; i <= ceilvalue; i++)
5274 ANYOF_BITMAP_SET(ret, i);
5276 for (i = prevvalue; i <= ceilvalue; i++)
5278 ANYOF_BITMAP_SET(ret, i);
5283 for (i = prevvalue; i <= ceilvalue; i++)
5284 ANYOF_BITMAP_SET(ret, i);
5286 if (value > 255 || UTF) {
5287 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5288 const UV natvalue = NATIVE_TO_UNI(value);
5290 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5291 if (prevnatvalue < natvalue) { /* what about > ? */
5292 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5293 prevnatvalue, natvalue);
5295 else if (prevnatvalue == natvalue) {
5296 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5298 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5300 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5302 /* If folding and foldable and a single
5303 * character, insert also the folded version
5304 * to the charclass. */
5306 if (foldlen == (STRLEN)UNISKIP(f))
5307 Perl_sv_catpvf(aTHX_ listsv,
5310 /* Any multicharacter foldings
5311 * require the following transform:
5312 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5313 * where E folds into "pq" and F folds
5314 * into "rst", all other characters
5315 * fold to single characters. We save
5316 * away these multicharacter foldings,
5317 * to be later saved as part of the
5318 * additional "s" data. */
5321 if (!unicode_alternate)
5322 unicode_alternate = newAV();
5323 sv = newSVpvn((char*)foldbuf, foldlen);
5325 av_push(unicode_alternate, sv);
5329 /* If folding and the value is one of the Greek
5330 * sigmas insert a few more sigmas to make the
5331 * folding rules of the sigmas to work right.
5332 * Note that not all the possible combinations
5333 * are handled here: some of them are handled
5334 * by the standard folding rules, and some of
5335 * them (literal or EXACTF cases) are handled
5336 * during runtime in regexec.c:S_find_byclass(). */
5337 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5338 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5339 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5340 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5341 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5343 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5344 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5345 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5350 literal_endpoint = 0;
5354 range = 0; /* this range (if it was one) is done now */
5358 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5360 RExC_size += ANYOF_CLASS_ADD_SKIP;
5362 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5365 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5367 /* If the only flag is folding (plus possibly inversion). */
5368 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5370 for (value = 0; value < 256; ++value) {
5371 if (ANYOF_BITMAP_TEST(ret, value)) {
5372 UV fold = PL_fold[value];
5375 ANYOF_BITMAP_SET(ret, fold);
5378 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5381 /* optimize inverted simple patterns (e.g. [^a-z]) */
5382 if (!SIZE_ONLY && optimize_invert &&
5383 /* If the only flag is inversion. */
5384 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5385 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5386 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5387 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5391 AV * const av = newAV();
5394 /* The 0th element stores the character class description
5395 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5396 * to initialize the appropriate swash (which gets stored in
5397 * the 1st element), and also useful for dumping the regnode.
5398 * The 2nd element stores the multicharacter foldings,
5399 * used later (regexec.c:S_reginclass()). */
5400 av_store(av, 0, listsv);
5401 av_store(av, 1, NULL);
5402 av_store(av, 2, (SV*)unicode_alternate);
5403 rv = newRV_noinc((SV*)av);
5404 n = add_data(pRExC_state, 1, "s");
5405 RExC_rx->data->data[n] = (void*)rv;
5413 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5415 char* const retval = RExC_parse++;
5418 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5419 RExC_parse[2] == '#') {
5420 while (*RExC_parse != ')') {
5421 if (RExC_parse == RExC_end)
5422 FAIL("Sequence (?#... not terminated");
5428 if (RExC_flags & PMf_EXTENDED) {
5429 if (isSPACE(*RExC_parse)) {
5433 else if (*RExC_parse == '#') {
5434 while (RExC_parse < RExC_end)
5435 if (*RExC_parse++ == '\n') break;
5444 - reg_node - emit a node
5446 STATIC regnode * /* Location. */
5447 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5450 register regnode *ptr;
5451 regnode * const ret = RExC_emit;
5454 SIZE_ALIGN(RExC_size);
5459 NODE_ALIGN_FILL(ret);
5461 FILL_ADVANCE_NODE(ptr, op);
5462 if (RExC_offsets) { /* MJD */
5463 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5464 "reg_node", __LINE__,
5466 RExC_emit - RExC_emit_start > RExC_offsets[0]
5467 ? "Overwriting end of array!\n" : "OK",
5468 RExC_emit - RExC_emit_start,
5469 RExC_parse - RExC_start,
5471 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5480 - reganode - emit a node with an argument
5482 STATIC regnode * /* Location. */
5483 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5486 register regnode *ptr;
5487 regnode * const ret = RExC_emit;
5490 SIZE_ALIGN(RExC_size);
5495 NODE_ALIGN_FILL(ret);
5497 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5498 if (RExC_offsets) { /* MJD */
5499 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5503 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5504 "Overwriting end of array!\n" : "OK",
5505 RExC_emit - RExC_emit_start,
5506 RExC_parse - RExC_start,
5508 Set_Cur_Node_Offset;
5517 - reguni - emit (if appropriate) a Unicode character
5520 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5523 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5527 - reginsert - insert an operator in front of already-emitted operand
5529 * Means relocating the operand.
5532 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5535 register regnode *src;
5536 register regnode *dst;
5537 register regnode *place;
5538 const int offset = regarglen[(U8)op];
5540 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5543 RExC_size += NODE_STEP_REGNODE + offset;
5548 RExC_emit += NODE_STEP_REGNODE + offset;
5550 while (src > opnd) {
5551 StructCopy(--src, --dst, regnode);
5552 if (RExC_offsets) { /* MJD 20010112 */
5553 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5557 dst - RExC_emit_start > RExC_offsets[0]
5558 ? "Overwriting end of array!\n" : "OK",
5559 src - RExC_emit_start,
5560 dst - RExC_emit_start,
5562 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5563 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5568 place = opnd; /* Op node, where operand used to be. */
5569 if (RExC_offsets) { /* MJD */
5570 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5574 place - RExC_emit_start > RExC_offsets[0]
5575 ? "Overwriting end of array!\n" : "OK",
5576 place - RExC_emit_start,
5577 RExC_parse - RExC_start,
5579 Set_Node_Offset(place, RExC_parse);
5580 Set_Node_Length(place, 1);
5582 src = NEXTOPER(place);
5583 FILL_ADVANCE_NODE(place, op);
5584 Zero(src, offset, regnode);
5588 - regtail - set the next-pointer at the end of a node chain of p to val.
5590 /* TODO: All three parms should be const */
5592 S_regtail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
5595 register regnode *scan;
5600 /* Find last node. */
5603 regnode * const temp = regnext(scan);
5609 if (reg_off_by_arg[OP(scan)]) {
5610 ARG_SET(scan, val - scan);
5613 NEXT_OFF(scan) = val - scan;
5618 - regoptail - regtail on operand of first argument; nop if operandless
5620 /* TODO: All three parms should be const */
5622 S_regoptail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
5625 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5626 if (p == NULL || SIZE_ONLY)
5628 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5629 regtail(pRExC_state, NEXTOPER(p), val);
5631 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5632 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5639 - regcurly - a little FSA that accepts {\d+,?\d*}
5642 S_regcurly(register const char *s)
5661 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5664 Perl_regdump(pTHX_ const regexp *r)
5668 SV * const sv = sv_newmortal();
5670 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
5672 /* Header fields of interest. */
5673 if (r->anchored_substr)
5674 PerlIO_printf(Perl_debug_log,
5675 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5677 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5678 SvPVX_const(r->anchored_substr),
5680 SvTAIL(r->anchored_substr) ? "$" : "",
5681 (IV)r->anchored_offset);
5682 else if (r->anchored_utf8)
5683 PerlIO_printf(Perl_debug_log,
5684 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5686 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5687 SvPVX_const(r->anchored_utf8),
5689 SvTAIL(r->anchored_utf8) ? "$" : "",
5690 (IV)r->anchored_offset);
5691 if (r->float_substr)
5692 PerlIO_printf(Perl_debug_log,
5693 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5695 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5696 SvPVX_const(r->float_substr),
5698 SvTAIL(r->float_substr) ? "$" : "",
5699 (IV)r->float_min_offset, (UV)r->float_max_offset);
5700 else if (r->float_utf8)
5701 PerlIO_printf(Perl_debug_log,
5702 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5704 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5705 SvPVX_const(r->float_utf8),
5707 SvTAIL(r->float_utf8) ? "$" : "",
5708 (IV)r->float_min_offset, (UV)r->float_max_offset);
5709 if (r->check_substr || r->check_utf8)
5710 PerlIO_printf(Perl_debug_log,
5711 r->check_substr == r->float_substr
5712 && r->check_utf8 == r->float_utf8
5713 ? "(checking floating" : "(checking anchored");
5714 if (r->reganch & ROPT_NOSCAN)
5715 PerlIO_printf(Perl_debug_log, " noscan");
5716 if (r->reganch & ROPT_CHECK_ALL)
5717 PerlIO_printf(Perl_debug_log, " isall");
5718 if (r->check_substr || r->check_utf8)
5719 PerlIO_printf(Perl_debug_log, ") ");
5721 if (r->regstclass) {
5722 regprop(r, sv, r->regstclass);
5723 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5725 if (r->reganch & ROPT_ANCH) {
5726 PerlIO_printf(Perl_debug_log, "anchored");
5727 if (r->reganch & ROPT_ANCH_BOL)
5728 PerlIO_printf(Perl_debug_log, "(BOL)");
5729 if (r->reganch & ROPT_ANCH_MBOL)
5730 PerlIO_printf(Perl_debug_log, "(MBOL)");
5731 if (r->reganch & ROPT_ANCH_SBOL)
5732 PerlIO_printf(Perl_debug_log, "(SBOL)");
5733 if (r->reganch & ROPT_ANCH_GPOS)
5734 PerlIO_printf(Perl_debug_log, "(GPOS)");
5735 PerlIO_putc(Perl_debug_log, ' ');
5737 if (r->reganch & ROPT_GPOS_SEEN)
5738 PerlIO_printf(Perl_debug_log, "GPOS ");
5739 if (r->reganch & ROPT_SKIP)
5740 PerlIO_printf(Perl_debug_log, "plus ");
5741 if (r->reganch & ROPT_IMPLICIT)
5742 PerlIO_printf(Perl_debug_log, "implicit ");
5743 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5744 if (r->reganch & ROPT_EVAL_SEEN)
5745 PerlIO_printf(Perl_debug_log, "with eval ");
5746 PerlIO_printf(Perl_debug_log, "\n");
5748 const U32 len = r->offsets[0];
5749 GET_RE_DEBUG_FLAGS_DECL;
5752 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5753 for (i = 1; i <= len; i++)
5754 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5755 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5756 PerlIO_printf(Perl_debug_log, "\n");
5760 PERL_UNUSED_CONTEXT;
5762 #endif /* DEBUGGING */
5766 - regprop - printable representation of opcode
5769 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
5775 sv_setpvn(sv, "", 0);
5776 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5777 /* It would be nice to FAIL() here, but this may be called from
5778 regexec.c, and it would be hard to supply pRExC_state. */
5779 Perl_croak(aTHX_ "Corrupted regexp opcode");
5780 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5782 k = PL_regkind[(U8)OP(o)];
5785 SV * const dsv = sv_2mortal(newSVpvs(""));
5786 /* Using is_utf8_string() is a crude hack but it may
5787 * be the best for now since we have no flag "this EXACTish
5788 * node was UTF-8" --jhi */
5789 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5790 const char * const s = do_utf8 ?
5791 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5792 UNI_DISPLAY_REGEX) :
5794 const int len = do_utf8 ?
5797 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5801 } else if (k == TRIE) {
5803 /* print the details od the trie in dumpuntil instead, as
5804 * prog->data isn't available here */
5805 } else if (k == CURLY) {
5806 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5807 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5808 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5810 else if (k == WHILEM && o->flags) /* Ordinal/of */
5811 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5812 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5813 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5814 else if (k == LOGICAL)
5815 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5816 else if (k == ANYOF) {
5817 int i, rangestart = -1;
5818 const U8 flags = ANYOF_FLAGS(o);
5820 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5821 static const char * const anyofs[] = {
5854 if (flags & ANYOF_LOCALE)
5855 sv_catpvs(sv, "{loc}");
5856 if (flags & ANYOF_FOLD)
5857 sv_catpvs(sv, "{i}");
5858 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5859 if (flags & ANYOF_INVERT)
5861 for (i = 0; i <= 256; i++) {
5862 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5863 if (rangestart == -1)
5865 } else if (rangestart != -1) {
5866 if (i <= rangestart + 3)
5867 for (; rangestart < i; rangestart++)
5868 put_byte(sv, rangestart);
5870 put_byte(sv, rangestart);
5872 put_byte(sv, i - 1);
5878 if (o->flags & ANYOF_CLASS)
5879 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5880 if (ANYOF_CLASS_TEST(o,i))
5881 sv_catpv(sv, anyofs[i]);
5883 if (flags & ANYOF_UNICODE)
5884 sv_catpvs(sv, "{unicode}");
5885 else if (flags & ANYOF_UNICODE_ALL)
5886 sv_catpvs(sv, "{unicode_all}");
5890 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
5894 U8 s[UTF8_MAXBYTES_CASE+1];
5896 for (i = 0; i <= 256; i++) { /* just the first 256 */
5897 uvchr_to_utf8(s, i);
5899 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5900 if (rangestart == -1)
5902 } else if (rangestart != -1) {
5903 if (i <= rangestart + 3)
5904 for (; rangestart < i; rangestart++) {
5905 const U8 * const e = uvchr_to_utf8(s,rangestart);
5907 for(p = s; p < e; p++)
5911 const U8 *e = uvchr_to_utf8(s,rangestart);
5913 for (p = s; p < e; p++)
5916 e = uvchr_to_utf8(s, i-1);
5917 for (p = s; p < e; p++)
5924 sv_catpvs(sv, "..."); /* et cetera */
5928 char *s = savesvpv(lv);
5929 char * const origs = s;
5931 while(*s && *s != '\n') s++;
5934 const char * const t = ++s;
5952 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5954 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5955 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5957 PERL_UNUSED_CONTEXT;
5958 PERL_UNUSED_ARG(sv);
5960 #endif /* DEBUGGING */
5964 Perl_re_intuit_string(pTHX_ regexp *prog)
5965 { /* Assume that RE_INTUIT is set */
5967 GET_RE_DEBUG_FLAGS_DECL;
5968 PERL_UNUSED_CONTEXT;
5972 const char * const s = SvPV_nolen_const(prog->check_substr
5973 ? prog->check_substr : prog->check_utf8);
5975 if (!PL_colorset) reginitcolors();
5976 PerlIO_printf(Perl_debug_log,
5977 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5979 prog->check_substr ? "" : "utf8 ",
5980 PL_colors[5],PL_colors[0],
5983 (strlen(s) > 60 ? "..." : ""));
5986 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5990 Perl_pregfree(pTHX_ struct regexp *r)
5994 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5995 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
5999 if (!r || (--r->refcnt > 0))
6001 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6002 const char * const s = (r->reganch & ROPT_UTF8)
6003 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6004 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6005 const int len = SvCUR(dsv);
6008 PerlIO_printf(Perl_debug_log,
6009 "%sFreeing REx:%s %s%*.*s%s%s\n",
6010 PL_colors[4],PL_colors[5],PL_colors[0],
6013 len > 60 ? "..." : "");
6016 /* gcov results gave these as non-null 100% of the time, so there's no
6017 optimisation in checking them before calling Safefree */
6018 Safefree(r->precomp);
6019 Safefree(r->offsets); /* 20010421 MJD */
6020 RX_MATCH_COPY_FREE(r);
6021 #ifdef PERL_OLD_COPY_ON_WRITE
6023 SvREFCNT_dec(r->saved_copy);
6026 if (r->anchored_substr)
6027 SvREFCNT_dec(r->anchored_substr);
6028 if (r->anchored_utf8)
6029 SvREFCNT_dec(r->anchored_utf8);
6030 if (r->float_substr)
6031 SvREFCNT_dec(r->float_substr);
6033 SvREFCNT_dec(r->float_utf8);
6034 Safefree(r->substrs);
6037 int n = r->data->count;
6038 PAD* new_comppad = NULL;
6043 /* If you add a ->what type here, update the comment in regcomp.h */
6044 switch (r->data->what[n]) {
6046 SvREFCNT_dec((SV*)r->data->data[n]);
6049 Safefree(r->data->data[n]);
6052 new_comppad = (AV*)r->data->data[n];
6055 if (new_comppad == NULL)
6056 Perl_croak(aTHX_ "panic: pregfree comppad");
6057 PAD_SAVE_LOCAL(old_comppad,
6058 /* Watch out for global destruction's random ordering. */
6059 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6062 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6065 op_free((OP_4tree*)r->data->data[n]);
6067 PAD_RESTORE_LOCAL(old_comppad);
6068 SvREFCNT_dec((SV*)new_comppad);
6075 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6078 refcount = --trie->refcount;
6081 Safefree(trie->charmap);
6082 if (trie->widecharmap)
6083 SvREFCNT_dec((SV*)trie->widecharmap);
6084 Safefree(trie->states);
6085 Safefree(trie->trans);
6088 SvREFCNT_dec((SV*)trie->words);
6089 if (trie->revcharmap)
6090 SvREFCNT_dec((SV*)trie->revcharmap);
6092 Safefree(r->data->data[n]); /* do this last!!!! */
6097 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6100 Safefree(r->data->what);
6103 Safefree(r->startp);
6109 - regnext - dig the "next" pointer out of a node
6112 Perl_regnext(pTHX_ register regnode *p)
6115 register I32 offset;
6117 if (p == &PL_regdummy)
6120 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6128 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6131 STRLEN l1 = strlen(pat1);
6132 STRLEN l2 = strlen(pat2);
6135 const char *message;
6141 Copy(pat1, buf, l1 , char);
6142 Copy(pat2, buf + l1, l2 , char);
6143 buf[l1 + l2] = '\n';
6144 buf[l1 + l2 + 1] = '\0';
6146 /* ANSI variant takes additional second argument */
6147 va_start(args, pat2);
6151 msv = vmess(buf, &args);
6153 message = SvPV_const(msv,l1);
6156 Copy(message, buf, l1 , char);
6157 buf[l1-1] = '\0'; /* Overwrite \n */
6158 Perl_croak(aTHX_ "%s", buf);
6161 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6164 Perl_save_re_context(pTHX)
6168 struct re_save_state *state;
6170 SAVEVPTR(PL_curcop);
6171 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6173 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6174 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6175 SSPUSHINT(SAVEt_RE_STATE);
6177 Copy(&PL_reg_state, state, 1, struct re_save_state);
6179 PL_reg_start_tmp = 0;
6180 PL_reg_start_tmpl = 0;
6181 PL_reg_oldsaved = NULL;
6182 PL_reg_oldsavedlen = 0;
6184 PL_reg_leftiter = 0;
6185 PL_reg_poscache = NULL;
6186 PL_reg_poscache_size = 0;
6187 #ifdef PERL_OLD_COPY_ON_WRITE
6191 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6193 const REGEXP * const rx = PM_GETRE(PL_curpm);
6196 for (i = 1; i <= rx->nparens; i++) {
6197 char digits[TYPE_CHARS(long)];
6198 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6199 GV *const *const gvp
6200 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6203 GV * const gv = *gvp;
6204 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6213 clear_re(pTHX_ void *r)
6216 ReREFCNT_dec((regexp *)r);
6222 S_put_byte(pTHX_ SV *sv, int c)
6224 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6225 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6226 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6227 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6229 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6233 STATIC const regnode *
6234 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6235 const regnode *last, SV* sv, I32 l)
6238 register U8 op = EXACT; /* Arbitrary non-END op. */
6239 register const regnode *next;
6241 while (op != END && (!last || node < last)) {
6242 /* While that wasn't END last time... */
6248 next = regnext((regnode *)node);
6250 if (OP(node) == OPTIMIZED)
6252 regprop(r, sv, node);
6253 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6254 (int)(2*l + 1), "", SvPVX_const(sv));
6255 if (next == NULL) /* Next ptr. */
6256 PerlIO_printf(Perl_debug_log, "(0)");
6258 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6259 (void)PerlIO_putc(Perl_debug_log, '\n');
6261 if (PL_regkind[(U8)op] == BRANCHJ) {
6262 register const regnode *nnode = (OP(next) == LONGJMP
6263 ? regnext((regnode *)next)
6265 if (last && nnode > last)
6267 node = dumpuntil(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6269 else if (PL_regkind[(U8)op] == BRANCH) {
6270 node = dumpuntil(r, start, NEXTOPER(node), next, sv, l + 1);
6272 else if ( PL_regkind[(U8)op] == TRIE ) {
6273 const I32 n = ARG(node);
6274 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6275 const I32 arry_len = av_len(trie->words)+1;
6277 PerlIO_printf(Perl_debug_log,
6278 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6282 (int)trie->charcount,
6283 trie->uniquecharcount,
6284 (IV)trie->laststate-1,
6285 node->flags ? " EVAL mode" : "");
6287 for (word_idx=0; word_idx < arry_len; word_idx++) {
6288 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6290 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6293 SvPV_nolen_const(*elem_ptr),
6298 PerlIO_printf(Perl_debug_log, "(0)\n");
6300 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6306 node = NEXTOPER(node);
6307 node += regarglen[(U8)op];
6310 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6311 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6312 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6314 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6315 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6318 else if ( op == PLUS || op == STAR) {
6319 node = dumpuntil(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6321 else if (op == ANYOF) {
6322 /* arglen 1 + class block */
6323 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6324 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6325 node = NEXTOPER(node);
6327 else if (PL_regkind[(U8)op] == EXACT) {
6328 /* Literal string, where present. */
6329 node += NODE_SZ_STR(node) - 1;
6330 node = NEXTOPER(node);
6333 node = NEXTOPER(node);
6334 node += regarglen[(U8)op];
6336 if (op == CURLYX || op == OPEN)
6338 else if (op == WHILEM)
6344 #endif /* DEBUGGING */
6348 * c-indentation-style: bsd
6350 * indent-tabs-mode: t
6353 * ex: set ts=8 sts=4 sw=4 noet: