5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
287 #define Simple_vFAIL(m) STMT_START { \
288 const IV offset = RExC_parse - RExC_precomp; \
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
296 #define vFAIL(m) STMT_START { \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
303 * Like Simple_vFAIL(), but accepts two arguments.
305 #define Simple_vFAIL2(m,a1) STMT_START { \
306 const IV offset = RExC_parse - RExC_precomp; \
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
314 #define vFAIL2(m,a1) STMT_START { \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
322 * Like Simple_vFAIL(), but accepts three arguments.
324 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
325 const IV offset = RExC_parse - RExC_precomp; \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
333 #define vFAIL3(m,a1,a2) STMT_START { \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
340 * Like Simple_vFAIL(), but accepts four arguments.
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
343 const IV offset = RExC_parse - RExC_precomp; \
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 #define vWARN(loc,m) STMT_START { \
349 const IV offset = loc - RExC_precomp; \
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
354 #define vWARNdep(loc,m) STMT_START { \
355 const IV offset = loc - RExC_precomp; \
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
362 #define vWARN2(loc, m, a1) STMT_START { \
363 const IV offset = loc - RExC_precomp; \
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
368 #define vWARN3(loc, m, a1, a2) STMT_START { \
369 const IV offset = loc - RExC_precomp; \
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
375 const IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
381 const IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
392 /* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
402 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
409 RExC_offsets[2*(node)-1] = (byte); \
414 #define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
418 #define Set_Node_Length_To_R(node,len) STMT_START { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
421 __LINE__, (int)(node), (int)(len))); \
423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
425 RExC_offsets[2*(node)] = (len); \
430 #define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
440 static void clear_re(pTHX_ void *r);
442 /* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
447 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453 SvSetMagicSV(*data->longest, data->last_found);
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
460 data->flags &= ~SF_FIX_BEFORE_EOL;
463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
464 data->offset_float_max = (l
465 ? data->last_start_max
466 : data->pos_min + data->pos_delta);
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
469 if (data->flags & SF_BEFORE_EOL)
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
473 data->flags &= ~SF_FL_BEFORE_EOL;
476 SvCUR_set(data->last_found, 0);
478 SV * const sv = data->last_found;
479 if (SvUTF8(sv) && SvMAGICAL(sv)) {
480 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
486 data->flags &= ~SF_BEFORE_EOL;
489 /* Can match anything (initialization) */
491 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
493 ANYOF_CLASS_ZERO(cl);
494 ANYOF_BITMAP_SETALL(cl);
495 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
497 cl->flags |= ANYOF_LOCALE;
500 /* Can match anything (initialization) */
502 S_cl_is_anything(const struct regnode_charclass_class *cl)
506 for (value = 0; value <= ANYOF_MAX; value += 2)
507 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
509 if (!(cl->flags & ANYOF_UNICODE_ALL))
511 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 /* Can match anything (initialization) */
518 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
520 Zero(cl, 1, struct regnode_charclass_class);
522 cl_anything(pRExC_state, cl);
526 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
528 Zero(cl, 1, struct regnode_charclass_class);
530 cl_anything(pRExC_state, cl);
532 cl->flags |= ANYOF_LOCALE;
535 /* 'And' a given class with another one. Can create false positives */
536 /* We assume that cl is not inverted */
538 S_cl_and(struct regnode_charclass_class *cl,
539 const struct regnode_charclass_class *and_with)
541 if (!(and_with->flags & ANYOF_CLASS)
542 && !(cl->flags & ANYOF_CLASS)
543 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
544 && !(and_with->flags & ANYOF_FOLD)
545 && !(cl->flags & ANYOF_FOLD)) {
548 if (and_with->flags & ANYOF_INVERT)
549 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
550 cl->bitmap[i] &= ~and_with->bitmap[i];
552 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
553 cl->bitmap[i] &= and_with->bitmap[i];
554 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
555 if (!(and_with->flags & ANYOF_EOS))
556 cl->flags &= ~ANYOF_EOS;
558 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
559 !(and_with->flags & ANYOF_INVERT)) {
560 cl->flags &= ~ANYOF_UNICODE_ALL;
561 cl->flags |= ANYOF_UNICODE;
562 ARG_SET(cl, ARG(and_with));
564 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
565 !(and_with->flags & ANYOF_INVERT))
566 cl->flags &= ~ANYOF_UNICODE_ALL;
567 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
568 !(and_with->flags & ANYOF_INVERT))
569 cl->flags &= ~ANYOF_UNICODE;
572 /* 'OR' a given class with another one. Can create false positives */
573 /* We assume that cl is not inverted */
575 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
577 if (or_with->flags & ANYOF_INVERT) {
579 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
580 * <= (B1 | !B2) | (CL1 | !CL2)
581 * which is wasteful if CL2 is small, but we ignore CL2:
582 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
583 * XXXX Can we handle case-fold? Unclear:
584 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
585 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
587 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
588 && !(or_with->flags & ANYOF_FOLD)
589 && !(cl->flags & ANYOF_FOLD) ) {
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] |= ~or_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise */
596 cl_anything(pRExC_state, cl);
599 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
600 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
601 && (!(or_with->flags & ANYOF_FOLD)
602 || (cl->flags & ANYOF_FOLD)) ) {
605 /* OR char bitmap and class bitmap separately */
606 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
607 cl->bitmap[i] |= or_with->bitmap[i];
608 if (or_with->flags & ANYOF_CLASS) {
609 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
610 cl->classflags[i] |= or_with->classflags[i];
611 cl->flags |= ANYOF_CLASS;
614 else { /* XXXX: logic is complicated, leave it along for a moment. */
615 cl_anything(pRExC_state, cl);
618 if (or_with->flags & ANYOF_EOS)
619 cl->flags |= ANYOF_EOS;
621 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
622 ARG(cl) != ARG(or_with)) {
623 cl->flags |= ANYOF_UNICODE_ALL;
624 cl->flags &= ~ANYOF_UNICODE;
626 if (or_with->flags & ANYOF_UNICODE_ALL) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
634 make_trie(startbranch,first,last,tail,flags)
635 startbranch: the first branch in the whole branch sequence
636 first : start branch of sequence of branch-exact nodes.
637 May be the same as startbranch
638 last : Thing following the last branch.
639 May be the same as tail.
640 tail : item following the branch sequence
641 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
643 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
645 A trie is an N'ary tree where the branches are determined by digital
646 decomposition of the key. IE, at the root node you look up the 1st character and
647 follow that branch repeat until you find the end of the branches. Nodes can be
648 marked as "accepting" meaning they represent a complete word. Eg:
652 would convert into the following structure. Numbers represent states, letters
653 following numbers represent valid transitions on the letter from that state, if
654 the number is in square brackets it represents an accepting state, otherwise it
655 will be in parenthesis.
657 +-h->+-e->[3]-+-r->(8)-+-s->[9]
661 (1) +-i->(6)-+-s->[7]
663 +-s->(3)-+-h->(4)-+-e->[5]
665 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
667 This shows that when matching against the string 'hers' we will begin at state 1
668 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
669 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
670 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
671 single traverse. We store a mapping from accepting to state to which word was
672 matched, and then when we have multiple possibilities we try to complete the
673 rest of the regex in the order in which they occured in the alternation.
675 The only prior NFA like behaviour that would be changed by the TRIE support is
676 the silent ignoring of duplicate alternations which are of the form:
678 / (DUPE|DUPE) X? (?{ ... }) Y /x
680 Thus EVAL blocks follwing a trie may be called a different number of times with
681 and without the optimisation. With the optimisations dupes will be silently
682 ignored. This inconsistant behaviour of EVAL type nodes is well established as
683 the following demonstrates:
685 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
687 which prints out 'word' three times, but
689 'words'=~/(word|word|word)(?{ print $1 })S/
691 which doesnt print it out at all. This is due to other optimisations kicking in.
693 Example of what happens on a structural level:
695 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
697 1: CURLYM[1] {1,32767}(18)
708 This would be optimizable with startbranch=5, first=5, last=16, tail=16
709 and should turn into:
711 1: CURLYM[1] {1,32767}(18)
713 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
721 Cases where tail != last would be like /(?foo|bar)baz/:
731 which would be optimizable with startbranch=1, first=1, last=7, tail=8
732 and would end up looking like:
735 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
744 #define TRIE_DEBUG_CHAR \
745 DEBUG_TRIE_COMPILE_r({ \
748 tmp = newSVpvs( "" ); \
749 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
751 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
753 av_push( trie->revcharmap, tmp ); \
756 #define TRIE_READ_CHAR STMT_START { \
759 if ( foldlen > 0 ) { \
760 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
765 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
766 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
767 foldlen -= UNISKIP( uvc ); \
768 scan = foldbuf + UNISKIP( uvc ); \
771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
780 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
785 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
786 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
787 TRIE_LIST_LEN( state ) *= 2; \
788 Renew( trie->states[ state ].trans.list, \
789 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
792 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
793 TRIE_LIST_CUR( state )++; \
796 #define TRIE_LIST_NEW(state) STMT_START { \
797 Newxz( trie->states[ state ].trans.list, \
798 4, reg_trie_trans_le ); \
799 TRIE_LIST_CUR( state ) = 1; \
800 TRIE_LIST_LEN( state ) = 4; \
804 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
807 /* first pass, loop through and scan words */
810 const U32 uniflags = UTF8_ALLOW_DEFAULT;
815 /* we just use folder as a flag in utf8 */
816 const U8 * const folder = ( flags == EXACTF
824 const U32 data_slot = add_data( pRExC_state, 1, "t" );
827 GET_RE_DEBUG_FLAGS_DECL;
829 Newxz( trie, 1, reg_trie_data );
831 RExC_rx->data->data[ data_slot ] = (void*)trie;
832 Newxz( trie->charmap, 256, U16 );
834 trie->words = newAV();
835 trie->revcharmap = newAV();
839 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
840 if (!SvIOK(re_trie_maxbuff)) {
841 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
844 /* -- First loop and Setup --
846 We first traverse the branches and scan each word to determine if it
847 contains widechars, and how many unique chars there are, this is
848 important as we have to build a table with at least as many columns as we
851 We use an array of integers to represent the character codes 0..255
852 (trie->charmap) and we use a an HV* to store unicode characters. We use the
853 native representation of the character value as the key and IV's for the
856 *TODO* If we keep track of how many times each character is used we can
857 remap the columns so that the table compression later on is more
858 efficient in terms of memory by ensuring most common value is in the
859 middle and the least common are on the outside. IMO this would be better
860 than a most to least common mapping as theres a decent chance the most
861 common letter will share a node with the least common, meaning the node
862 will not be compressable. With a middle is most common approach the worst
863 case is when we have the least common nodes twice.
868 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
869 regnode * const noper = NEXTOPER( cur );
870 const U8 *uc = (U8*)STRING( noper );
871 const U8 * const e = uc + STR_LEN( noper );
873 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
874 const U8 *scan = (U8*)NULL;
876 for ( ; uc < e ; uc += len ) {
880 if ( !trie->charmap[ uvc ] ) {
881 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
883 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
888 if ( !trie->widecharmap )
889 trie->widecharmap = newHV();
891 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
894 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
896 if ( !SvTRUE( *svpp ) ) {
897 sv_setiv( *svpp, ++trie->uniquecharcount );
903 } /* end first pass */
904 DEBUG_TRIE_COMPILE_r(
905 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
906 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
907 (int)trie->charcount, trie->uniquecharcount )
912 We now know what we are dealing with in terms of unique chars and
913 string sizes so we can calculate how much memory a naive
914 representation using a flat table will take. If it's over a reasonable
915 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
916 conservative but potentially much slower representation using an array
919 At the end we convert both representations into the same compressed
920 form that will be used in regexec.c for matching with. The latter
921 is a form that cannot be used to construct with but has memory
922 properties similar to the list form and access properties similar
923 to the table form making it both suitable for fast searches and
924 small enough that its feasable to store for the duration of a program.
926 See the comment in the code where the compressed table is produced
927 inplace from the flat tabe representation for an explanation of how
928 the compression works.
933 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
935 Second Pass -- Array Of Lists Representation
937 Each state will be represented by a list of charid:state records
938 (reg_trie_trans_le) the first such element holds the CUR and LEN
939 points of the allocated array. (See defines above).
941 We build the initial structure using the lists, and then convert
942 it into the compressed table form which allows faster lookups
943 (but cant be modified once converted).
949 STRLEN transcount = 1;
951 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
955 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
957 regnode * const noper = NEXTOPER( cur );
958 U8 *uc = (U8*)STRING( noper );
959 const U8 * const e = uc + STR_LEN( noper );
960 U32 state = 1; /* required init */
961 U16 charid = 0; /* sanity init */
962 U8 *scan = (U8*)NULL; /* sanity init */
963 STRLEN foldlen = 0; /* required init */
964 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
966 for ( ; uc < e ; uc += len ) {
971 charid = trie->charmap[ uvc ];
973 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
977 charid=(U16)SvIV( *svpp );
986 if ( !trie->states[ state ].trans.list ) {
987 TRIE_LIST_NEW( state );
989 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
990 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
991 newstate = TRIE_LIST_ITEM( state, check ).newstate;
996 newstate = next_alloc++;
997 TRIE_LIST_PUSH( state, charid, newstate );
1002 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1004 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1007 if ( !trie->states[ state ].wordnum ) {
1008 /* we havent inserted this word into the structure yet. */
1009 trie->states[ state ].wordnum = ++curword;
1012 /* store the word for dumping */
1013 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1014 if ( UTF ) SvUTF8_on( tmp );
1015 av_push( trie->words, tmp );
1019 /*EMPTY*/; /* It's a dupe. So ignore it. */
1022 } /* end second pass */
1024 trie->laststate = next_alloc;
1025 Renew( trie->states, next_alloc, reg_trie_state );
1027 DEBUG_TRIE_COMPILE_MORE_r({
1030 /* print out the table precompression. */
1032 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1033 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1035 for( state=1 ; state < next_alloc ; state ++ ) {
1038 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1039 if ( ! trie->states[ state ].wordnum ) {
1040 PerlIO_printf( Perl_debug_log, "%5s| ","");
1042 PerlIO_printf( Perl_debug_log, "W%04x| ",
1043 trie->states[ state ].wordnum
1046 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1047 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1048 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1049 SvPV_nolen_const( *tmp ),
1050 TRIE_LIST_ITEM(state,charid).forid,
1051 (UV)TRIE_LIST_ITEM(state,charid).newstate
1056 PerlIO_printf( Perl_debug_log, "\n\n" );
1059 Newxz( trie->trans, transcount ,reg_trie_trans );
1066 for( state=1 ; state < next_alloc ; state ++ ) {
1070 DEBUG_TRIE_COMPILE_MORE_r(
1071 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1075 if (trie->states[state].trans.list) {
1076 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1080 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1081 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1082 if ( forid < minid ) {
1084 } else if ( forid > maxid ) {
1088 if ( transcount < tp + maxid - minid + 1) {
1090 Renew( trie->trans, transcount, reg_trie_trans );
1091 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1093 base = trie->uniquecharcount + tp - minid;
1094 if ( maxid == minid ) {
1096 for ( ; zp < tp ; zp++ ) {
1097 if ( ! trie->trans[ zp ].next ) {
1098 base = trie->uniquecharcount + zp - minid;
1099 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1100 trie->trans[ zp ].check = state;
1106 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1107 trie->trans[ tp ].check = state;
1112 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1113 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1114 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1115 trie->trans[ tid ].check = state;
1117 tp += ( maxid - minid + 1 );
1119 Safefree(trie->states[ state ].trans.list);
1122 DEBUG_TRIE_COMPILE_MORE_r(
1123 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1126 trie->states[ state ].trans.base=base;
1128 trie->lasttrans = tp + 1;
1132 Second Pass -- Flat Table Representation.
1134 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1135 We know that we will need Charcount+1 trans at most to store the data
1136 (one row per char at worst case) So we preallocate both structures
1137 assuming worst case.
1139 We then construct the trie using only the .next slots of the entry
1142 We use the .check field of the first entry of the node temporarily to
1143 make compression both faster and easier by keeping track of how many non
1144 zero fields are in the node.
1146 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1149 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1150 number representing the first entry of the node, and state as a
1151 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1152 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1153 are 2 entrys per node. eg:
1161 The table is internally in the right hand, idx form. However as we also
1162 have to deal with the states array which is indexed by nodenum we have to
1163 use TRIE_NODENUM() to convert.
1167 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1169 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1170 next_alloc = trie->uniquecharcount + 1;
1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1174 regnode * const noper = NEXTOPER( cur );
1175 const U8 *uc = (U8*)STRING( noper );
1176 const U8 * const e = uc + STR_LEN( noper );
1178 U32 state = 1; /* required init */
1180 U16 charid = 0; /* sanity init */
1181 U32 accept_state = 0; /* sanity init */
1182 U8 *scan = (U8*)NULL; /* sanity init */
1184 STRLEN foldlen = 0; /* required init */
1185 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1188 for ( ; uc < e ; uc += len ) {
1193 charid = trie->charmap[ uvc ];
1195 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1196 charid = svpp ? (U16)SvIV(*svpp) : 0;
1200 if ( !trie->trans[ state + charid ].next ) {
1201 trie->trans[ state + charid ].next = next_alloc;
1202 trie->trans[ state ].check++;
1203 next_alloc += trie->uniquecharcount;
1205 state = trie->trans[ state + charid ].next;
1207 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1209 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1212 accept_state = TRIE_NODENUM( state );
1213 if ( !trie->states[ accept_state ].wordnum ) {
1214 /* we havent inserted this word into the structure yet. */
1215 trie->states[ accept_state ].wordnum = ++curword;
1218 /* store the word for dumping */
1219 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1220 if ( UTF ) SvUTF8_on( tmp );
1221 av_push( trie->words, tmp );
1225 /*EMPTY*/; /* Its a dupe. So ignore it. */
1228 } /* end second pass */
1230 DEBUG_TRIE_COMPILE_MORE_r({
1232 print out the table precompression so that we can do a visual check
1233 that they are identical.
1237 PerlIO_printf( Perl_debug_log, "\nChar : " );
1239 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1240 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1242 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1246 PerlIO_printf( Perl_debug_log, "\nState+-" );
1248 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1249 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1252 PerlIO_printf( Perl_debug_log, "\n" );
1254 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1256 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1258 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1259 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1260 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1262 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1263 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1265 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1266 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1269 PerlIO_printf( Perl_debug_log, "\n\n" );
1273 * Inplace compress the table.*
1275 For sparse data sets the table constructed by the trie algorithm will
1276 be mostly 0/FAIL transitions or to put it another way mostly empty.
1277 (Note that leaf nodes will not contain any transitions.)
1279 This algorithm compresses the tables by eliminating most such
1280 transitions, at the cost of a modest bit of extra work during lookup:
1282 - Each states[] entry contains a .base field which indicates the
1283 index in the state[] array wheres its transition data is stored.
1285 - If .base is 0 there are no valid transitions from that node.
1287 - If .base is nonzero then charid is added to it to find an entry in
1290 -If trans[states[state].base+charid].check!=state then the
1291 transition is taken to be a 0/Fail transition. Thus if there are fail
1292 transitions at the front of the node then the .base offset will point
1293 somewhere inside the previous nodes data (or maybe even into a node
1294 even earlier), but the .check field determines if the transition is
1297 The following process inplace converts the table to the compressed
1298 table: We first do not compress the root node 1,and mark its all its
1299 .check pointers as 1 and set its .base pointer as 1 as well. This
1300 allows to do a DFA construction from the compressed table later, and
1301 ensures that any .base pointers we calculate later are greater than
1304 - We set 'pos' to indicate the first entry of the second node.
1306 - We then iterate over the columns of the node, finding the first and
1307 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1308 and set the .check pointers accordingly, and advance pos
1309 appropriately and repreat for the next node. Note that when we copy
1310 the next pointers we have to convert them from the original
1311 NODEIDX form to NODENUM form as the former is not valid post
1314 - If a node has no transitions used we mark its base as 0 and do not
1315 advance the pos pointer.
1317 - If a node only has one transition we use a second pointer into the
1318 structure to fill in allocated fail transitions from other states.
1319 This pointer is independent of the main pointer and scans forward
1320 looking for null transitions that are allocated to a state. When it
1321 finds one it writes the single transition into the "hole". If the
1322 pointer doesnt find one the single transition is appeneded as normal.
1324 - Once compressed we can Renew/realloc the structures to release the
1327 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1328 specifically Fig 3.47 and the associated pseudocode.
1332 const U32 laststate = TRIE_NODENUM( next_alloc );
1335 trie->laststate = laststate;
1337 for ( state = 1 ; state < laststate ; state++ ) {
1339 const U32 stateidx = TRIE_NODEIDX( state );
1340 const U32 o_used = trie->trans[ stateidx ].check;
1341 U32 used = trie->trans[ stateidx ].check;
1342 trie->trans[ stateidx ].check = 0;
1344 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1345 if ( flag || trie->trans[ stateidx + charid ].next ) {
1346 if ( trie->trans[ stateidx + charid ].next ) {
1348 for ( ; zp < pos ; zp++ ) {
1349 if ( ! trie->trans[ zp ].next ) {
1353 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1354 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1355 trie->trans[ zp ].check = state;
1356 if ( ++zp > pos ) pos = zp;
1363 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1365 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1366 trie->trans[ pos ].check = state;
1371 trie->lasttrans = pos + 1;
1372 Renew( trie->states, laststate + 1, reg_trie_state);
1373 DEBUG_TRIE_COMPILE_MORE_r(
1374 PerlIO_printf( Perl_debug_log,
1375 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1376 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1379 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1382 } /* end table compress */
1384 /* resize the trans array to remove unused space */
1385 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1387 DEBUG_TRIE_COMPILE_r({
1390 Now we print it out again, in a slightly different form as there is additional
1391 info we want to be able to see when its compressed. They are close enough for
1392 visual comparison though.
1394 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1396 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1397 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1399 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1402 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1404 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1405 PerlIO_printf( Perl_debug_log, "-----");
1406 PerlIO_printf( Perl_debug_log, "\n");
1408 for( state = 1 ; state < trie->laststate ; state++ ) {
1409 const U32 base = trie->states[ state ].trans.base;
1411 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1413 if ( trie->states[ state ].wordnum ) {
1414 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1416 PerlIO_printf( Perl_debug_log, "%6s", "" );
1419 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1424 while( ( base + ofs < trie->uniquecharcount ) ||
1425 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1426 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1429 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1431 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1432 if ( ( base + ofs >= trie->uniquecharcount ) &&
1433 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1434 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1436 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1437 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1439 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1443 PerlIO_printf( Perl_debug_log, "]");
1446 PerlIO_printf( Perl_debug_log, "\n" );
1451 /* now finally we "stitch in" the new TRIE node
1452 This means we convert either the first branch or the first Exact,
1453 depending on whether the thing following (in 'last') is a branch
1454 or not and whther first is the startbranch (ie is it a sub part of
1455 the alternation or is it the whole thing.)
1456 Assuming its a sub part we conver the EXACT otherwise we convert
1457 the whole branch sequence, including the first.
1464 if ( first == startbranch && OP( last ) != BRANCH ) {
1467 convert = NEXTOPER( first );
1468 NEXT_OFF( first ) = (U16)(last - first);
1471 OP( convert ) = TRIE + (U8)( flags - EXACT );
1472 NEXT_OFF( convert ) = (U16)(tail - convert);
1473 ARG_SET( convert, data_slot );
1475 /* tells us if we need to handle accept buffers specially */
1476 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1479 /* needed for dumping*/
1481 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1482 /* We now need to mark all of the space originally used by the
1483 branches as optimized away. This keeps the dumpuntil from
1484 throwing a wobbly as it doesnt use regnext() to traverse the
1487 while( optimize < last ) {
1488 OP( optimize ) = OPTIMIZED;
1492 } /* end node insert */
1499 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1500 * These need to be revisited when a newer toolchain becomes available.
1502 #if defined(__sparc64__) && defined(__GNUC__)
1503 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1504 # undef SPARC64_GCC_WORKAROUND
1505 # define SPARC64_GCC_WORKAROUND 1
1509 /* REx optimizer. Converts nodes into quickier variants "in place".
1510 Finds fixed substrings. */
1512 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1513 to the position after last scanned or to NULL. */
1517 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1518 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1519 /* scanp: Start here (read-write). */
1520 /* deltap: Write maxlen-minlen here. */
1521 /* last: Stop before this one. */
1524 I32 min = 0, pars = 0, code;
1525 regnode *scan = *scanp, *next;
1527 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1528 int is_inf_internal = 0; /* The studied chunk is infinite */
1529 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1530 scan_data_t data_fake;
1531 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1532 SV *re_trie_maxbuff = NULL;
1534 GET_RE_DEBUG_FLAGS_DECL;
1536 while (scan && OP(scan) != END && scan < last) {
1537 /* Peephole optimizer: */
1539 SV * const mysv=sv_newmortal();
1540 regprop(RExC_rx, mysv, scan);
1541 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1542 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1545 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1546 /* Merge several consecutive EXACTish nodes into one. */
1547 regnode *n = regnext(scan);
1550 regnode *stop = scan;
1553 next = scan + NODE_SZ_STR(scan);
1554 /* Skip NOTHING, merge EXACT*. */
1556 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1557 (stringok && (OP(n) == OP(scan))))
1559 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1560 if (OP(n) == TAIL || n > next)
1562 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1563 NEXT_OFF(scan) += NEXT_OFF(n);
1564 next = n + NODE_STEP_REGNODE;
1571 else if (stringok) {
1572 const int oldl = STR_LEN(scan);
1573 regnode * const nnext = regnext(n);
1575 if (oldl + STR_LEN(n) > U8_MAX)
1577 NEXT_OFF(scan) += NEXT_OFF(n);
1578 STR_LEN(scan) += STR_LEN(n);
1579 next = n + NODE_SZ_STR(n);
1580 /* Now we can overwrite *n : */
1581 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1589 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1591 Two problematic code points in Unicode casefolding of EXACT nodes:
1593 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1594 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1600 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1601 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1603 This means that in case-insensitive matching (or "loose matching",
1604 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1605 length of the above casefolded versions) can match a target string
1606 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1607 This would rather mess up the minimum length computation.
1609 What we'll do is to look for the tail four bytes, and then peek
1610 at the preceding two bytes to see whether we need to decrease
1611 the minimum length by four (six minus two).
1613 Thanks to the design of UTF-8, there cannot be false matches:
1614 A sequence of valid UTF-8 bytes cannot be a subsequence of
1615 another valid sequence of UTF-8 bytes.
1618 char * const s0 = STRING(scan), *s, *t;
1619 char * const s1 = s0 + STR_LEN(scan) - 1;
1620 char * const s2 = s1 - 4;
1621 const char t0[] = "\xcc\x88\xcc\x81";
1622 const char * const t1 = t0 + 3;
1625 s < s2 && (t = ninstr(s, s1, t0, t1));
1627 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1628 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1635 n = scan + NODE_SZ_STR(scan);
1637 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1648 /* Follow the next-chain of the current node and optimize
1649 away all the NOTHINGs from it. */
1650 if (OP(scan) != CURLYX) {
1651 const int max = (reg_off_by_arg[OP(scan)]
1653 /* I32 may be smaller than U16 on CRAYs! */
1654 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1655 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1659 /* Skip NOTHING and LONGJMP. */
1660 while ((n = regnext(n))
1661 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1662 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1663 && off + noff < max)
1665 if (reg_off_by_arg[OP(scan)])
1668 NEXT_OFF(scan) = off;
1671 /* The principal pseudo-switch. Cannot be a switch, since we
1672 look into several different things. */
1673 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1674 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1675 next = regnext(scan);
1677 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1679 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1680 I32 max1 = 0, min1 = I32_MAX, num = 0;
1681 struct regnode_charclass_class accum;
1682 regnode * const startbranch=scan;
1684 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1685 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1686 if (flags & SCF_DO_STCLASS)
1687 cl_init_zero(pRExC_state, &accum);
1689 while (OP(scan) == code) {
1690 I32 deltanext, minnext, f = 0, fake;
1691 struct regnode_charclass_class this_class;
1694 data_fake.flags = 0;
1696 data_fake.whilem_c = data->whilem_c;
1697 data_fake.last_closep = data->last_closep;
1700 data_fake.last_closep = &fake;
1701 next = regnext(scan);
1702 scan = NEXTOPER(scan);
1704 scan = NEXTOPER(scan);
1705 if (flags & SCF_DO_STCLASS) {
1706 cl_init(pRExC_state, &this_class);
1707 data_fake.start_class = &this_class;
1708 f = SCF_DO_STCLASS_AND;
1710 if (flags & SCF_WHILEM_VISITED_POS)
1711 f |= SCF_WHILEM_VISITED_POS;
1713 /* we suppose the run is continuous, last=next...*/
1714 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1715 next, &data_fake, f,depth+1);
1718 if (max1 < minnext + deltanext)
1719 max1 = minnext + deltanext;
1720 if (deltanext == I32_MAX)
1721 is_inf = is_inf_internal = 1;
1723 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1725 if (data && (data_fake.flags & SF_HAS_EVAL))
1726 data->flags |= SF_HAS_EVAL;
1728 data->whilem_c = data_fake.whilem_c;
1729 if (flags & SCF_DO_STCLASS)
1730 cl_or(pRExC_state, &accum, &this_class);
1731 if (code == SUSPEND)
1734 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1736 if ((flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) && !data)
1737 Perl_croak(aTHX_ "panic: null data in study_chunk");
1738 if (flags & SCF_DO_SUBSTR) {
1739 data->pos_min += min1;
1740 data->pos_delta += max1 - min1;
1741 if (max1 != min1 || is_inf)
1742 data->longest = &(data->longest_float);
1745 delta += max1 - min1;
1746 if (flags & SCF_DO_STCLASS_OR) {
1747 cl_or(pRExC_state, data->start_class, &accum);
1749 cl_and(data->start_class, &and_with);
1750 flags &= ~SCF_DO_STCLASS;
1753 else if (flags & SCF_DO_STCLASS_AND) {
1755 cl_and(data->start_class, &accum);
1756 flags &= ~SCF_DO_STCLASS;
1759 /* Switch to OR mode: cache the old value of
1760 * data->start_class */
1761 StructCopy(data->start_class, &and_with,
1762 struct regnode_charclass_class);
1763 flags &= ~SCF_DO_STCLASS_AND;
1764 StructCopy(&accum, data->start_class,
1765 struct regnode_charclass_class);
1766 flags |= SCF_DO_STCLASS_OR;
1767 data->start_class->flags |= ANYOF_EOS;
1773 Assuming this was/is a branch we are dealing with: 'scan' now
1774 points at the item that follows the branch sequence, whatever
1775 it is. We now start at the beginning of the sequence and look
1781 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1783 If we can find such a subseqence we need to turn the first
1784 element into a trie and then add the subsequent branch exact
1785 strings to the trie.
1789 1. patterns where the whole set of branch can be converted to a trie,
1791 2. patterns where only a subset of the alternations can be
1792 converted to a trie.
1794 In case 1 we can replace the whole set with a single regop
1795 for the trie. In case 2 we need to keep the start and end
1798 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1799 becomes BRANCH TRIE; BRANCH X;
1801 Hypthetically when we know the regex isnt anchored we can
1802 turn a case 1 into a DFA and let it rip... Every time it finds a match
1803 it would just call its tail, no WHILEM/CURLY needed.
1807 if (!re_trie_maxbuff) {
1808 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1809 if (!SvIOK(re_trie_maxbuff))
1810 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1812 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1814 regnode *first = (regnode *)NULL;
1815 regnode *last = (regnode *)NULL;
1816 regnode *tail = scan;
1821 SV * const mysv = sv_newmortal(); /* for dumping */
1823 /* var tail is used because there may be a TAIL
1824 regop in the way. Ie, the exacts will point to the
1825 thing following the TAIL, but the last branch will
1826 point at the TAIL. So we advance tail. If we
1827 have nested (?:) we may have to move through several
1831 while ( OP( tail ) == TAIL ) {
1832 /* this is the TAIL generated by (?:) */
1833 tail = regnext( tail );
1837 regprop(RExC_rx, mysv, tail );
1838 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1839 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1840 (RExC_seen_evals) ? "[EVAL]" : ""
1845 step through the branches, cur represents each
1846 branch, noper is the first thing to be matched
1847 as part of that branch and noper_next is the
1848 regnext() of that node. if noper is an EXACT
1849 and noper_next is the same as scan (our current
1850 position in the regex) then the EXACT branch is
1851 a possible optimization target. Once we have
1852 two or more consequetive such branches we can
1853 create a trie of the EXACT's contents and stich
1854 it in place. If the sequence represents all of
1855 the branches we eliminate the whole thing and
1856 replace it with a single TRIE. If it is a
1857 subsequence then we need to stitch it in. This
1858 means the first branch has to remain, and needs
1859 to be repointed at the item on the branch chain
1860 following the last branch optimized. This could
1861 be either a BRANCH, in which case the
1862 subsequence is internal, or it could be the
1863 item following the branch sequence in which
1864 case the subsequence is at the end.
1868 /* dont use tail as the end marker for this traverse */
1869 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1870 regnode * const noper = NEXTOPER( cur );
1871 regnode * const noper_next = regnext( noper );
1874 regprop(RExC_rx, mysv, cur);
1875 PerlIO_printf( Perl_debug_log, "%*s%s",
1876 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1878 regprop(RExC_rx, mysv, noper);
1879 PerlIO_printf( Perl_debug_log, " -> %s",
1880 SvPV_nolen_const(mysv));
1883 regprop(RExC_rx, mysv, noper_next );
1884 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1885 SvPV_nolen_const(mysv));
1887 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1890 if ( ( first ? OP( noper ) == optype
1891 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1892 && noper_next == tail && count<U16_MAX)
1897 optype = OP( noper );
1901 regprop(RExC_rx, mysv, first);
1902 PerlIO_printf( Perl_debug_log, "%*s%s",
1903 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1904 regprop(RExC_rx, mysv, NEXTOPER(first) );
1905 PerlIO_printf( Perl_debug_log, " -> %s\n",
1906 SvPV_nolen_const( mysv ) );
1911 regprop(RExC_rx, mysv, cur);
1912 PerlIO_printf( Perl_debug_log, "%*s%s",
1913 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1914 regprop(RExC_rx, mysv, noper );
1915 PerlIO_printf( Perl_debug_log, " -> %s\n",
1916 SvPV_nolen_const( mysv ) );
1922 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1923 (int)depth * 2 + 2, "E:", "**END**" );
1925 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1927 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1928 && noper_next == tail )
1932 optype = OP( noper );
1942 regprop(RExC_rx, mysv, cur);
1943 PerlIO_printf( Perl_debug_log,
1944 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1945 " ", SvPV_nolen_const( mysv ), first, last, cur);
1950 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1951 (int)depth * 2 + 2, "E:", "==END==" );
1953 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1958 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1959 scan = NEXTOPER(NEXTOPER(scan));
1960 } else /* single branch is optimized. */
1961 scan = NEXTOPER(scan);
1964 else if (OP(scan) == EXACT) {
1965 I32 l = STR_LEN(scan);
1968 const U8 * const s = (U8*)STRING(scan);
1969 l = utf8_length(s, s + l);
1970 uc = utf8_to_uvchr(s, NULL);
1972 uc = *((U8*)STRING(scan));
1975 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1976 /* The code below prefers earlier match for fixed
1977 offset, later match for variable offset. */
1978 if (data->last_end == -1) { /* Update the start info. */
1979 data->last_start_min = data->pos_min;
1980 data->last_start_max = is_inf
1981 ? I32_MAX : data->pos_min + data->pos_delta;
1983 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
1985 SvUTF8_on(data->last_found);
1987 SV * const sv = data->last_found;
1988 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1989 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1990 if (mg && mg->mg_len >= 0)
1991 mg->mg_len += utf8_length((U8*)STRING(scan),
1992 (U8*)STRING(scan)+STR_LEN(scan));
1994 data->last_end = data->pos_min + l;
1995 data->pos_min += l; /* As in the first entry. */
1996 data->flags &= ~SF_BEFORE_EOL;
1998 if (flags & SCF_DO_STCLASS_AND) {
1999 /* Check whether it is compatible with what we know already! */
2003 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2004 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2005 && (!(data->start_class->flags & ANYOF_FOLD)
2006 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2009 ANYOF_CLASS_ZERO(data->start_class);
2010 ANYOF_BITMAP_ZERO(data->start_class);
2012 ANYOF_BITMAP_SET(data->start_class, uc);
2013 data->start_class->flags &= ~ANYOF_EOS;
2015 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2017 else if (flags & SCF_DO_STCLASS_OR) {
2018 /* false positive possible if the class is case-folded */
2020 ANYOF_BITMAP_SET(data->start_class, uc);
2022 data->start_class->flags |= ANYOF_UNICODE_ALL;
2023 data->start_class->flags &= ~ANYOF_EOS;
2024 cl_and(data->start_class, &and_with);
2026 flags &= ~SCF_DO_STCLASS;
2028 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2029 I32 l = STR_LEN(scan);
2030 UV uc = *((U8*)STRING(scan));
2032 /* Search for fixed substrings supports EXACT only. */
2033 if (flags & SCF_DO_SUBSTR) {
2035 scan_commit(pRExC_state, data);
2038 const U8 * const s = (U8 *)STRING(scan);
2039 l = utf8_length(s, s + l);
2040 uc = utf8_to_uvchr(s, NULL);
2043 if (flags & SCF_DO_SUBSTR)
2045 if (flags & SCF_DO_STCLASS_AND) {
2046 /* Check whether it is compatible with what we know already! */
2050 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2051 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2052 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2054 ANYOF_CLASS_ZERO(data->start_class);
2055 ANYOF_BITMAP_ZERO(data->start_class);
2057 ANYOF_BITMAP_SET(data->start_class, uc);
2058 data->start_class->flags &= ~ANYOF_EOS;
2059 data->start_class->flags |= ANYOF_FOLD;
2060 if (OP(scan) == EXACTFL)
2061 data->start_class->flags |= ANYOF_LOCALE;
2064 else if (flags & SCF_DO_STCLASS_OR) {
2065 if (data->start_class->flags & ANYOF_FOLD) {
2066 /* false positive possible if the class is case-folded.
2067 Assume that the locale settings are the same... */
2069 ANYOF_BITMAP_SET(data->start_class, uc);
2070 data->start_class->flags &= ~ANYOF_EOS;
2072 cl_and(data->start_class, &and_with);
2074 flags &= ~SCF_DO_STCLASS;
2076 else if (strchr((const char*)PL_varies,OP(scan))) {
2077 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2078 I32 f = flags, pos_before = 0;
2079 regnode * const oscan = scan;
2080 struct regnode_charclass_class this_class;
2081 struct regnode_charclass_class *oclass = NULL;
2082 I32 next_is_eval = 0;
2084 switch (PL_regkind[(U8)OP(scan)]) {
2085 case WHILEM: /* End of (?:...)* . */
2086 scan = NEXTOPER(scan);
2089 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2090 next = NEXTOPER(scan);
2091 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2093 maxcount = REG_INFTY;
2094 next = regnext(scan);
2095 scan = NEXTOPER(scan);
2099 if (flags & SCF_DO_SUBSTR)
2104 if (flags & SCF_DO_STCLASS) {
2106 maxcount = REG_INFTY;
2107 next = regnext(scan);
2108 scan = NEXTOPER(scan);
2111 is_inf = is_inf_internal = 1;
2112 scan = regnext(scan);
2113 if (flags & SCF_DO_SUBSTR) {
2114 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2115 data->longest = &(data->longest_float);
2117 goto optimize_curly_tail;
2119 mincount = ARG1(scan);
2120 maxcount = ARG2(scan);
2121 next = regnext(scan);
2122 if (OP(scan) == CURLYX) {
2123 I32 lp = (data ? *(data->last_closep) : 0);
2124 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2126 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2127 next_is_eval = (OP(scan) == EVAL);
2129 if (flags & SCF_DO_SUBSTR) {
2130 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2131 pos_before = data->pos_min;
2135 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2137 data->flags |= SF_IS_INF;
2139 if (flags & SCF_DO_STCLASS) {
2140 cl_init(pRExC_state, &this_class);
2141 oclass = data->start_class;
2142 data->start_class = &this_class;
2143 f |= SCF_DO_STCLASS_AND;
2144 f &= ~SCF_DO_STCLASS_OR;
2146 /* These are the cases when once a subexpression
2147 fails at a particular position, it cannot succeed
2148 even after backtracking at the enclosing scope.
2150 XXXX what if minimal match and we are at the
2151 initial run of {n,m}? */
2152 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2153 f &= ~SCF_WHILEM_VISITED_POS;
2155 /* This will finish on WHILEM, setting scan, or on NULL: */
2156 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2158 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2160 if (flags & SCF_DO_STCLASS)
2161 data->start_class = oclass;
2162 if (mincount == 0 || minnext == 0) {
2163 if (flags & SCF_DO_STCLASS_OR) {
2164 cl_or(pRExC_state, data->start_class, &this_class);
2166 else if (flags & SCF_DO_STCLASS_AND) {
2167 /* Switch to OR mode: cache the old value of
2168 * data->start_class */
2169 StructCopy(data->start_class, &and_with,
2170 struct regnode_charclass_class);
2171 flags &= ~SCF_DO_STCLASS_AND;
2172 StructCopy(&this_class, data->start_class,
2173 struct regnode_charclass_class);
2174 flags |= SCF_DO_STCLASS_OR;
2175 data->start_class->flags |= ANYOF_EOS;
2177 } else { /* Non-zero len */
2178 if (flags & SCF_DO_STCLASS_OR) {
2179 cl_or(pRExC_state, data->start_class, &this_class);
2180 cl_and(data->start_class, &and_with);
2182 else if (flags & SCF_DO_STCLASS_AND)
2183 cl_and(data->start_class, &this_class);
2184 flags &= ~SCF_DO_STCLASS;
2186 if (!scan) /* It was not CURLYX, but CURLY. */
2188 if ( /* ? quantifier ok, except for (?{ ... }) */
2189 (next_is_eval || !(mincount == 0 && maxcount == 1))
2190 && (minnext == 0) && (deltanext == 0)
2191 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2192 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2193 && ckWARN(WARN_REGEXP))
2196 "Quantifier unexpected on zero-length expression");
2199 min += minnext * mincount;
2200 is_inf_internal |= ((maxcount == REG_INFTY
2201 && (minnext + deltanext) > 0)
2202 || deltanext == I32_MAX);
2203 is_inf |= is_inf_internal;
2204 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2206 /* Try powerful optimization CURLYX => CURLYN. */
2207 if ( OP(oscan) == CURLYX && data
2208 && data->flags & SF_IN_PAR
2209 && !(data->flags & SF_HAS_EVAL)
2210 && !deltanext && minnext == 1 ) {
2211 /* Try to optimize to CURLYN. */
2212 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2213 regnode * const nxt1 = nxt;
2220 if (!strchr((const char*)PL_simple,OP(nxt))
2221 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2222 && STR_LEN(nxt) == 1))
2228 if (OP(nxt) != CLOSE)
2230 /* Now we know that nxt2 is the only contents: */
2231 oscan->flags = (U8)ARG(nxt);
2233 OP(nxt1) = NOTHING; /* was OPEN. */
2235 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2236 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2237 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2238 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2239 OP(nxt + 1) = OPTIMIZED; /* was count. */
2240 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2245 /* Try optimization CURLYX => CURLYM. */
2246 if ( OP(oscan) == CURLYX && data
2247 && !(data->flags & SF_HAS_PAR)
2248 && !(data->flags & SF_HAS_EVAL)
2249 && !deltanext /* atom is fixed width */
2250 && minnext != 0 /* CURLYM can't handle zero width */
2252 /* XXXX How to optimize if data == 0? */
2253 /* Optimize to a simpler form. */
2254 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2258 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2259 && (OP(nxt2) != WHILEM))
2261 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2262 /* Need to optimize away parenths. */
2263 if (data->flags & SF_IN_PAR) {
2264 /* Set the parenth number. */
2265 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2267 if (OP(nxt) != CLOSE)
2268 FAIL("Panic opt close");
2269 oscan->flags = (U8)ARG(nxt);
2270 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2271 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2273 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2274 OP(nxt + 1) = OPTIMIZED; /* was count. */
2275 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2276 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2279 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2280 regnode *nnxt = regnext(nxt1);
2283 if (reg_off_by_arg[OP(nxt1)])
2284 ARG_SET(nxt1, nxt2 - nxt1);
2285 else if (nxt2 - nxt1 < U16_MAX)
2286 NEXT_OFF(nxt1) = nxt2 - nxt1;
2288 OP(nxt) = NOTHING; /* Cannot beautify */
2293 /* Optimize again: */
2294 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2300 else if ((OP(oscan) == CURLYX)
2301 && (flags & SCF_WHILEM_VISITED_POS)
2302 /* See the comment on a similar expression above.
2303 However, this time it not a subexpression
2304 we care about, but the expression itself. */
2305 && (maxcount == REG_INFTY)
2306 && data && ++data->whilem_c < 16) {
2307 /* This stays as CURLYX, we can put the count/of pair. */
2308 /* Find WHILEM (as in regexec.c) */
2309 regnode *nxt = oscan + NEXT_OFF(oscan);
2311 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2313 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2314 | (RExC_whilem_seen << 4)); /* On WHILEM */
2316 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2318 if (flags & SCF_DO_SUBSTR) {
2319 SV *last_str = NULL;
2320 int counted = mincount != 0;
2322 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2323 #if defined(SPARC64_GCC_WORKAROUND)
2326 const char *s = NULL;
2329 if (pos_before >= data->last_start_min)
2332 b = data->last_start_min;
2335 s = SvPV_const(data->last_found, l);
2336 old = b - data->last_start_min;
2339 I32 b = pos_before >= data->last_start_min
2340 ? pos_before : data->last_start_min;
2342 const char * const s = SvPV_const(data->last_found, l);
2343 I32 old = b - data->last_start_min;
2347 old = utf8_hop((U8*)s, old) - (U8*)s;
2350 /* Get the added string: */
2351 last_str = newSVpvn(s + old, l);
2353 SvUTF8_on(last_str);
2354 if (deltanext == 0 && pos_before == b) {
2355 /* What was added is a constant string */
2357 SvGROW(last_str, (mincount * l) + 1);
2358 repeatcpy(SvPVX(last_str) + l,
2359 SvPVX_const(last_str), l, mincount - 1);
2360 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2361 /* Add additional parts. */
2362 SvCUR_set(data->last_found,
2363 SvCUR(data->last_found) - l);
2364 sv_catsv(data->last_found, last_str);
2366 SV * sv = data->last_found;
2368 SvUTF8(sv) && SvMAGICAL(sv) ?
2369 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2370 if (mg && mg->mg_len >= 0)
2371 mg->mg_len += CHR_SVLEN(last_str);
2373 data->last_end += l * (mincount - 1);
2376 /* start offset must point into the last copy */
2377 data->last_start_min += minnext * (mincount - 1);
2378 data->last_start_max += is_inf ? I32_MAX
2379 : (maxcount - 1) * (minnext + data->pos_delta);
2382 /* It is counted once already... */
2383 data->pos_min += minnext * (mincount - counted);
2384 data->pos_delta += - counted * deltanext +
2385 (minnext + deltanext) * maxcount - minnext * mincount;
2386 if (mincount != maxcount) {
2387 /* Cannot extend fixed substrings found inside
2389 scan_commit(pRExC_state,data);
2390 if (mincount && last_str) {
2391 SV * const sv = data->last_found;
2392 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2393 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2397 sv_setsv(sv, last_str);
2398 data->last_end = data->pos_min;
2399 data->last_start_min =
2400 data->pos_min - CHR_SVLEN(last_str);
2401 data->last_start_max = is_inf
2403 : data->pos_min + data->pos_delta
2404 - CHR_SVLEN(last_str);
2406 data->longest = &(data->longest_float);
2408 SvREFCNT_dec(last_str);
2410 if (data && (fl & SF_HAS_EVAL))
2411 data->flags |= SF_HAS_EVAL;
2412 optimize_curly_tail:
2413 if (OP(oscan) != CURLYX) {
2414 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2416 NEXT_OFF(oscan) += NEXT_OFF(next);
2419 default: /* REF and CLUMP only? */
2420 if (flags & SCF_DO_SUBSTR) {
2421 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2422 data->longest = &(data->longest_float);
2424 is_inf = is_inf_internal = 1;
2425 if (flags & SCF_DO_STCLASS_OR)
2426 cl_anything(pRExC_state, data->start_class);
2427 flags &= ~SCF_DO_STCLASS;
2431 else if (strchr((const char*)PL_simple,OP(scan))) {
2434 if (flags & SCF_DO_SUBSTR) {
2435 scan_commit(pRExC_state,data);
2439 if (flags & SCF_DO_STCLASS) {
2440 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2442 /* Some of the logic below assumes that switching
2443 locale on will only add false positives. */
2444 switch (PL_regkind[(U8)OP(scan)]) {
2448 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2449 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2450 cl_anything(pRExC_state, data->start_class);
2453 if (OP(scan) == SANY)
2455 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2456 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2457 || (data->start_class->flags & ANYOF_CLASS));
2458 cl_anything(pRExC_state, data->start_class);
2460 if (flags & SCF_DO_STCLASS_AND || !value)
2461 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2464 if (flags & SCF_DO_STCLASS_AND)
2465 cl_and(data->start_class,
2466 (struct regnode_charclass_class*)scan);
2468 cl_or(pRExC_state, data->start_class,
2469 (struct regnode_charclass_class*)scan);
2472 if (flags & SCF_DO_STCLASS_AND) {
2473 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2475 for (value = 0; value < 256; value++)
2476 if (!isALNUM(value))
2477 ANYOF_BITMAP_CLEAR(data->start_class, value);
2481 if (data->start_class->flags & ANYOF_LOCALE)
2482 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2484 for (value = 0; value < 256; value++)
2486 ANYOF_BITMAP_SET(data->start_class, value);
2491 if (flags & SCF_DO_STCLASS_AND) {
2492 if (data->start_class->flags & ANYOF_LOCALE)
2493 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2496 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2497 data->start_class->flags |= ANYOF_LOCALE;
2501 if (flags & SCF_DO_STCLASS_AND) {
2502 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2504 for (value = 0; value < 256; value++)
2506 ANYOF_BITMAP_CLEAR(data->start_class, value);
2510 if (data->start_class->flags & ANYOF_LOCALE)
2511 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2513 for (value = 0; value < 256; value++)
2514 if (!isALNUM(value))
2515 ANYOF_BITMAP_SET(data->start_class, value);
2520 if (flags & SCF_DO_STCLASS_AND) {
2521 if (data->start_class->flags & ANYOF_LOCALE)
2522 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2525 data->start_class->flags |= ANYOF_LOCALE;
2526 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2530 if (flags & SCF_DO_STCLASS_AND) {
2531 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2533 for (value = 0; value < 256; value++)
2534 if (!isSPACE(value))
2535 ANYOF_BITMAP_CLEAR(data->start_class, value);
2539 if (data->start_class->flags & ANYOF_LOCALE)
2540 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2542 for (value = 0; value < 256; value++)
2544 ANYOF_BITMAP_SET(data->start_class, value);
2549 if (flags & SCF_DO_STCLASS_AND) {
2550 if (data->start_class->flags & ANYOF_LOCALE)
2551 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2554 data->start_class->flags |= ANYOF_LOCALE;
2555 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2559 if (flags & SCF_DO_STCLASS_AND) {
2560 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2561 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2562 for (value = 0; value < 256; value++)
2564 ANYOF_BITMAP_CLEAR(data->start_class, value);
2568 if (data->start_class->flags & ANYOF_LOCALE)
2569 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2571 for (value = 0; value < 256; value++)
2572 if (!isSPACE(value))
2573 ANYOF_BITMAP_SET(data->start_class, value);
2578 if (flags & SCF_DO_STCLASS_AND) {
2579 if (data->start_class->flags & ANYOF_LOCALE) {
2580 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2581 for (value = 0; value < 256; value++)
2582 if (!isSPACE(value))
2583 ANYOF_BITMAP_CLEAR(data->start_class, value);
2587 data->start_class->flags |= ANYOF_LOCALE;
2588 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2592 if (flags & SCF_DO_STCLASS_AND) {
2593 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2594 for (value = 0; value < 256; value++)
2595 if (!isDIGIT(value))
2596 ANYOF_BITMAP_CLEAR(data->start_class, value);
2599 if (data->start_class->flags & ANYOF_LOCALE)
2600 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2602 for (value = 0; value < 256; value++)
2604 ANYOF_BITMAP_SET(data->start_class, value);
2609 if (flags & SCF_DO_STCLASS_AND) {
2610 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2611 for (value = 0; value < 256; value++)
2613 ANYOF_BITMAP_CLEAR(data->start_class, value);
2616 if (data->start_class->flags & ANYOF_LOCALE)
2617 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2619 for (value = 0; value < 256; value++)
2620 if (!isDIGIT(value))
2621 ANYOF_BITMAP_SET(data->start_class, value);
2626 if (flags & SCF_DO_STCLASS_OR)
2627 cl_and(data->start_class, &and_with);
2628 flags &= ~SCF_DO_STCLASS;
2631 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2632 data->flags |= (OP(scan) == MEOL
2636 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2637 /* Lookbehind, or need to calculate parens/evals/stclass: */
2638 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2639 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2640 /* Lookahead/lookbehind */
2641 I32 deltanext, minnext, fake = 0;
2643 struct regnode_charclass_class intrnl;
2646 data_fake.flags = 0;
2648 data_fake.whilem_c = data->whilem_c;
2649 data_fake.last_closep = data->last_closep;
2652 data_fake.last_closep = &fake;
2653 if ( flags & SCF_DO_STCLASS && !scan->flags
2654 && OP(scan) == IFMATCH ) { /* Lookahead */
2655 cl_init(pRExC_state, &intrnl);
2656 data_fake.start_class = &intrnl;
2657 f |= SCF_DO_STCLASS_AND;
2659 if (flags & SCF_WHILEM_VISITED_POS)
2660 f |= SCF_WHILEM_VISITED_POS;
2661 next = regnext(scan);
2662 nscan = NEXTOPER(NEXTOPER(scan));
2663 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2666 vFAIL("Variable length lookbehind not implemented");
2668 else if (minnext > U8_MAX) {
2669 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2671 scan->flags = (U8)minnext;
2673 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2675 if (data && (data_fake.flags & SF_HAS_EVAL))
2676 data->flags |= SF_HAS_EVAL;
2678 data->whilem_c = data_fake.whilem_c;
2679 if (f & SCF_DO_STCLASS_AND) {
2680 const int was = (data->start_class->flags & ANYOF_EOS);
2682 cl_and(data->start_class, &intrnl);
2684 data->start_class->flags |= ANYOF_EOS;
2687 else if (OP(scan) == OPEN) {
2690 else if (OP(scan) == CLOSE) {
2691 if ((I32)ARG(scan) == is_par) {
2692 next = regnext(scan);
2694 if ( next && (OP(next) != WHILEM) && next < last)
2695 is_par = 0; /* Disable optimization */
2698 *(data->last_closep) = ARG(scan);
2700 else if (OP(scan) == EVAL) {
2702 data->flags |= SF_HAS_EVAL;
2704 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2705 if (flags & SCF_DO_SUBSTR) {
2706 scan_commit(pRExC_state,data);
2707 data->longest = &(data->longest_float);
2709 is_inf = is_inf_internal = 1;
2710 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2711 cl_anything(pRExC_state, data->start_class);
2712 flags &= ~SCF_DO_STCLASS;
2714 /* Else: zero-length, ignore. */
2715 scan = regnext(scan);
2720 *deltap = is_inf_internal ? I32_MAX : delta;
2721 if (flags & SCF_DO_SUBSTR && is_inf)
2722 data->pos_delta = I32_MAX - data->pos_min;
2723 if (is_par > U8_MAX)
2725 if (is_par && pars==1 && data) {
2726 data->flags |= SF_IN_PAR;
2727 data->flags &= ~SF_HAS_PAR;
2729 else if (pars && data) {
2730 data->flags |= SF_HAS_PAR;
2731 data->flags &= ~SF_IN_PAR;
2733 if (flags & SCF_DO_STCLASS_OR)
2734 cl_and(data->start_class, &and_with);
2739 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2741 if (RExC_rx->data) {
2742 Renewc(RExC_rx->data,
2743 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2744 char, struct reg_data);
2745 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2746 RExC_rx->data->count += n;
2749 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2750 char, struct reg_data);
2751 Newx(RExC_rx->data->what, n, U8);
2752 RExC_rx->data->count = n;
2754 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2755 return RExC_rx->data->count - n;
2759 Perl_reginitcolors(pTHX)
2762 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2764 char *t = savepv(s);
2768 t = strchr(t, '\t');
2774 PL_colors[i] = t = (char *)"";
2779 PL_colors[i++] = (char *)"";
2786 - pregcomp - compile a regular expression into internal code
2788 * We can't allocate space until we know how big the compiled form will be,
2789 * but we can't compile it (and thus know how big it is) until we've got a
2790 * place to put the code. So we cheat: we compile it twice, once with code
2791 * generation turned off and size counting turned on, and once "for real".
2792 * This also means that we don't allocate space until we are sure that the
2793 * thing really will compile successfully, and we never have to move the
2794 * code and thus invalidate pointers into it. (Note that it has to be in
2795 * one piece because free() must be able to free it all.) [NB: not true in perl]
2797 * Beware that the optimization-preparation code in here knows about some
2798 * of the structure of the compiled regexp. [I'll say.]
2801 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2812 RExC_state_t RExC_state;
2813 RExC_state_t *pRExC_state = &RExC_state;
2815 GET_RE_DEBUG_FLAGS_DECL;
2818 FAIL("NULL regexp argument");
2820 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2823 DEBUG_r(if (!PL_colorset) reginitcolors());
2825 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2826 PL_colors[4],PL_colors[5],PL_colors[0],
2827 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2829 RExC_flags = pm->op_pmflags;
2833 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2834 RExC_seen_evals = 0;
2837 /* First pass: determine size, legality. */
2844 RExC_emit = &PL_regdummy;
2845 RExC_whilem_seen = 0;
2846 #if 0 /* REGC() is (currently) a NOP at the first pass.
2847 * Clever compilers notice this and complain. --jhi */
2848 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2850 if (reg(pRExC_state, 0, &flags) == NULL) {
2851 RExC_precomp = NULL;
2854 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2856 /* Small enough for pointer-storage convention?
2857 If extralen==0, this means that we will not need long jumps. */
2858 if (RExC_size >= 0x10000L && RExC_extralen)
2859 RExC_size += RExC_extralen;
2862 if (RExC_whilem_seen > 15)
2863 RExC_whilem_seen = 15;
2865 /* Allocate space and initialize. */
2866 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2869 FAIL("Regexp out of space");
2872 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2873 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2876 r->prelen = xend - exp;
2877 r->precomp = savepvn(RExC_precomp, r->prelen);
2879 #ifdef PERL_OLD_COPY_ON_WRITE
2880 r->saved_copy = NULL;
2882 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2883 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2884 r->lastparen = 0; /* mg.c reads this. */
2886 r->substrs = 0; /* Useful during FAIL. */
2887 r->startp = 0; /* Useful during FAIL. */
2888 r->endp = 0; /* Useful during FAIL. */
2890 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2892 r->offsets[0] = RExC_size;
2894 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2895 "%s %"UVuf" bytes for offset annotations.\n",
2896 r->offsets ? "Got" : "Couldn't get",
2897 (UV)((2*RExC_size+1) * sizeof(U32))));
2901 /* Second pass: emit code. */
2902 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2907 RExC_emit_start = r->program;
2908 RExC_emit = r->program;
2909 /* Store the count of eval-groups for security checks: */
2910 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2911 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2913 if (reg(pRExC_state, 0, &flags) == NULL)
2917 /* Dig out information for optimizations. */
2918 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2919 pm->op_pmflags = RExC_flags;
2921 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2922 r->regstclass = NULL;
2923 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2924 r->reganch |= ROPT_NAUGHTY;
2925 scan = r->program + 1; /* First BRANCH. */
2927 /* XXXX To minimize changes to RE engine we always allocate
2928 3-units-long substrs field. */
2929 Newxz(r->substrs, 1, struct reg_substr_data);
2931 StructCopy(&zero_scan_data, &data, scan_data_t);
2932 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2933 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2935 STRLEN longest_float_length, longest_fixed_length;
2936 struct regnode_charclass_class ch_class;
2941 /* Skip introductions and multiplicators >= 1. */
2942 while ((OP(first) == OPEN && (sawopen = 1)) ||
2943 /* An OR of *one* alternative - should not happen now. */
2944 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2945 (OP(first) == PLUS) ||
2946 (OP(first) == MINMOD) ||
2947 /* An {n,m} with n>0 */
2948 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2949 if (OP(first) == PLUS)
2952 first += regarglen[(U8)OP(first)];
2953 first = NEXTOPER(first);
2956 /* Starting-point info. */
2958 if (PL_regkind[(U8)OP(first)] == EXACT) {
2959 if (OP(first) == EXACT)
2960 /*EMPTY*/; /* Empty, get anchored substr later. */
2961 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2962 r->regstclass = first;
2964 else if (strchr((const char*)PL_simple,OP(first)))
2965 r->regstclass = first;
2966 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2967 PL_regkind[(U8)OP(first)] == NBOUND)
2968 r->regstclass = first;
2969 else if (PL_regkind[(U8)OP(first)] == BOL) {
2970 r->reganch |= (OP(first) == MBOL
2972 : (OP(first) == SBOL
2975 first = NEXTOPER(first);
2978 else if (OP(first) == GPOS) {
2979 r->reganch |= ROPT_ANCH_GPOS;
2980 first = NEXTOPER(first);
2983 else if (!sawopen && (OP(first) == STAR &&
2984 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2985 !(r->reganch & ROPT_ANCH) )
2987 /* turn .* into ^.* with an implied $*=1 */
2989 (OP(NEXTOPER(first)) == REG_ANY)
2992 r->reganch |= type | ROPT_IMPLICIT;
2993 first = NEXTOPER(first);
2996 if (sawplus && (!sawopen || !RExC_sawback)
2997 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
2998 /* x+ must match at the 1st pos of run of x's */
2999 r->reganch |= ROPT_SKIP;
3001 /* Scan is after the zeroth branch, first is atomic matcher. */
3002 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3003 (IV)(first - scan + 1)));
3005 * If there's something expensive in the r.e., find the
3006 * longest literal string that must appear and make it the
3007 * regmust. Resolve ties in favor of later strings, since
3008 * the regstart check works with the beginning of the r.e.
3009 * and avoiding duplication strengthens checking. Not a
3010 * strong reason, but sufficient in the absence of others.
3011 * [Now we resolve ties in favor of the earlier string if
3012 * it happens that c_offset_min has been invalidated, since the
3013 * earlier string may buy us something the later one won't.]
3017 data.longest_fixed = newSVpvs("");
3018 data.longest_float = newSVpvs("");
3019 data.last_found = newSVpvs("");
3020 data.longest = &(data.longest_fixed);
3022 if (!r->regstclass) {
3023 cl_init(pRExC_state, &ch_class);
3024 data.start_class = &ch_class;
3025 stclass_flag = SCF_DO_STCLASS_AND;
3026 } else /* XXXX Check for BOUND? */
3028 data.last_closep = &last_close;
3030 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3031 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3032 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3033 && data.last_start_min == 0 && data.last_end > 0
3034 && !RExC_seen_zerolen
3035 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3036 r->reganch |= ROPT_CHECK_ALL;
3037 scan_commit(pRExC_state, &data);
3038 SvREFCNT_dec(data.last_found);
3040 longest_float_length = CHR_SVLEN(data.longest_float);
3041 if (longest_float_length
3042 || (data.flags & SF_FL_BEFORE_EOL
3043 && (!(data.flags & SF_FL_BEFORE_MEOL)
3044 || (RExC_flags & PMf_MULTILINE)))) {
3047 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3048 && data.offset_fixed == data.offset_float_min
3049 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3050 goto remove_float; /* As in (a)+. */
3052 if (SvUTF8(data.longest_float)) {
3053 r->float_utf8 = data.longest_float;
3054 r->float_substr = NULL;
3056 r->float_substr = data.longest_float;
3057 r->float_utf8 = NULL;
3059 r->float_min_offset = data.offset_float_min;
3060 r->float_max_offset = data.offset_float_max;
3061 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3062 && (!(data.flags & SF_FL_BEFORE_MEOL)
3063 || (RExC_flags & PMf_MULTILINE)));
3064 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3068 r->float_substr = r->float_utf8 = NULL;
3069 SvREFCNT_dec(data.longest_float);
3070 longest_float_length = 0;
3073 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3074 if (longest_fixed_length
3075 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3076 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3077 || (RExC_flags & PMf_MULTILINE)))) {
3080 if (SvUTF8(data.longest_fixed)) {
3081 r->anchored_utf8 = data.longest_fixed;
3082 r->anchored_substr = NULL;
3084 r->anchored_substr = data.longest_fixed;
3085 r->anchored_utf8 = NULL;
3087 r->anchored_offset = data.offset_fixed;
3088 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3089 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3090 || (RExC_flags & PMf_MULTILINE)));
3091 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3094 r->anchored_substr = r->anchored_utf8 = NULL;
3095 SvREFCNT_dec(data.longest_fixed);
3096 longest_fixed_length = 0;
3099 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3100 r->regstclass = NULL;
3101 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3103 && !(data.start_class->flags & ANYOF_EOS)
3104 && !cl_is_anything(data.start_class))
3106 const I32 n = add_data(pRExC_state, 1, "f");
3108 Newx(RExC_rx->data->data[n], 1,
3109 struct regnode_charclass_class);
3110 StructCopy(data.start_class,
3111 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3112 struct regnode_charclass_class);
3113 r->regstclass = (regnode*)RExC_rx->data->data[n];
3114 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3115 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3116 regprop(r, sv, (regnode*)data.start_class);
3117 PerlIO_printf(Perl_debug_log,
3118 "synthetic stclass \"%s\".\n",
3119 SvPVX_const(sv));});
3122 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3123 if (longest_fixed_length > longest_float_length) {
3124 r->check_substr = r->anchored_substr;
3125 r->check_utf8 = r->anchored_utf8;
3126 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3127 if (r->reganch & ROPT_ANCH_SINGLE)
3128 r->reganch |= ROPT_NOSCAN;
3131 r->check_substr = r->float_substr;
3132 r->check_utf8 = r->float_utf8;
3133 r->check_offset_min = data.offset_float_min;
3134 r->check_offset_max = data.offset_float_max;
3136 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3137 This should be changed ASAP! */
3138 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3139 r->reganch |= RE_USE_INTUIT;
3140 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3141 r->reganch |= RE_INTUIT_TAIL;
3145 /* Several toplevels. Best we can is to set minlen. */
3147 struct regnode_charclass_class ch_class;
3150 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3151 scan = r->program + 1;
3152 cl_init(pRExC_state, &ch_class);
3153 data.start_class = &ch_class;
3154 data.last_closep = &last_close;
3155 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3156 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3157 = r->float_substr = r->float_utf8 = NULL;
3158 if (!(data.start_class->flags & ANYOF_EOS)
3159 && !cl_is_anything(data.start_class))
3161 const I32 n = add_data(pRExC_state, 1, "f");
3163 Newx(RExC_rx->data->data[n], 1,
3164 struct regnode_charclass_class);
3165 StructCopy(data.start_class,
3166 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3167 struct regnode_charclass_class);
3168 r->regstclass = (regnode*)RExC_rx->data->data[n];
3169 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3170 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3171 regprop(r, sv, (regnode*)data.start_class);
3172 PerlIO_printf(Perl_debug_log,
3173 "synthetic stclass \"%s\".\n",
3174 SvPVX_const(sv));});
3179 if (RExC_seen & REG_SEEN_GPOS)
3180 r->reganch |= ROPT_GPOS_SEEN;
3181 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3182 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3183 if (RExC_seen & REG_SEEN_EVAL)
3184 r->reganch |= ROPT_EVAL_SEEN;
3185 if (RExC_seen & REG_SEEN_CANY)
3186 r->reganch |= ROPT_CANY_SEEN;
3187 Newxz(r->startp, RExC_npar, I32);
3188 Newxz(r->endp, RExC_npar, I32);
3189 DEBUG_COMPILE_r(regdump(r));
3194 - reg - regular expression, i.e. main body or parenthesized thing
3196 * Caller must absorb opening parenthesis.
3198 * Combining parenthesis handling with the base level of regular expression
3199 * is a trifle forced, but the need to tie the tails of the branches to what
3200 * follows makes it hard to avoid.
3203 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3204 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3207 register regnode *ret; /* Will be the head of the group. */
3208 register regnode *br;
3209 register regnode *lastbr;
3210 register regnode *ender = NULL;
3211 register I32 parno = 0;
3213 const I32 oregflags = RExC_flags;
3214 bool have_branch = 0;
3217 /* for (?g), (?gc), and (?o) warnings; warning
3218 about (?c) will warn about (?g) -- japhy */
3220 #define WASTED_O 0x01
3221 #define WASTED_G 0x02
3222 #define WASTED_C 0x04
3223 #define WASTED_GC (0x02|0x04)
3224 I32 wastedflags = 0x00;
3226 char * parse_start = RExC_parse; /* MJD */
3227 char * const oregcomp_parse = RExC_parse;
3229 *flagp = 0; /* Tentatively. */
3232 /* Make an OPEN node, if parenthesized. */
3234 if (*RExC_parse == '?') { /* (?...) */
3235 U32 posflags = 0, negflags = 0;
3236 U32 *flagsp = &posflags;
3237 bool is_logical = 0;
3238 const char * const seqstart = RExC_parse;
3241 paren = *RExC_parse++;
3242 ret = NULL; /* For look-ahead/behind. */
3244 case '<': /* (?<...) */
3245 RExC_seen |= REG_SEEN_LOOKBEHIND;
3246 if (*RExC_parse == '!')
3248 if (*RExC_parse != '=' && *RExC_parse != '!')
3251 case '=': /* (?=...) */
3252 case '!': /* (?!...) */
3253 RExC_seen_zerolen++;
3254 case ':': /* (?:...) */
3255 case '>': /* (?>...) */
3257 case '$': /* (?$...) */
3258 case '@': /* (?@...) */
3259 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3261 case '#': /* (?#...) */
3262 while (*RExC_parse && *RExC_parse != ')')
3264 if (*RExC_parse != ')')
3265 FAIL("Sequence (?#... not terminated");
3266 nextchar(pRExC_state);
3269 case 'p': /* (?p...) */
3270 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3271 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3273 case '?': /* (??...) */
3275 if (*RExC_parse != '{')
3277 paren = *RExC_parse++;
3279 case '{': /* (?{...}) */
3281 I32 count = 1, n = 0;
3283 char *s = RExC_parse;
3285 RExC_seen_zerolen++;
3286 RExC_seen |= REG_SEEN_EVAL;
3287 while (count && (c = *RExC_parse)) {
3298 if (*RExC_parse != ')') {
3300 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3304 OP_4tree *sop, *rop;
3305 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3308 Perl_save_re_context(aTHX);
3309 rop = sv_compile_2op(sv, &sop, "re", &pad);
3310 sop->op_private |= OPpREFCOUNTED;
3311 /* re_dup will OpREFCNT_inc */
3312 OpREFCNT_set(sop, 1);
3315 n = add_data(pRExC_state, 3, "nop");
3316 RExC_rx->data->data[n] = (void*)rop;
3317 RExC_rx->data->data[n+1] = (void*)sop;
3318 RExC_rx->data->data[n+2] = (void*)pad;
3321 else { /* First pass */
3322 if (PL_reginterp_cnt < ++RExC_seen_evals
3324 /* No compiled RE interpolated, has runtime
3325 components ===> unsafe. */
3326 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3327 if (PL_tainting && PL_tainted)
3328 FAIL("Eval-group in insecure regular expression");
3329 if (IN_PERL_COMPILETIME)
3333 nextchar(pRExC_state);
3335 ret = reg_node(pRExC_state, LOGICAL);
3338 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3339 /* deal with the length of this later - MJD */
3342 ret = reganode(pRExC_state, EVAL, n);
3343 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3344 Set_Node_Offset(ret, parse_start);
3347 case '(': /* (?(?{...})...) and (?(?=...)...) */
3349 if (RExC_parse[0] == '?') { /* (?(?...)) */
3350 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3351 || RExC_parse[1] == '<'
3352 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3355 ret = reg_node(pRExC_state, LOGICAL);
3358 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3362 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3365 parno = atoi(RExC_parse++);
3367 while (isDIGIT(*RExC_parse))
3369 ret = reganode(pRExC_state, GROUPP, parno);
3371 if ((c = *nextchar(pRExC_state)) != ')')
3372 vFAIL("Switch condition not recognized");
3374 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3375 br = regbranch(pRExC_state, &flags, 1);
3377 br = reganode(pRExC_state, LONGJMP, 0);
3379 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3380 c = *nextchar(pRExC_state);
3384 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3385 regbranch(pRExC_state, &flags, 1);
3386 regtail(pRExC_state, ret, lastbr);
3389 c = *nextchar(pRExC_state);
3394 vFAIL("Switch (?(condition)... contains too many branches");
3395 ender = reg_node(pRExC_state, TAIL);
3396 regtail(pRExC_state, br, ender);
3398 regtail(pRExC_state, lastbr, ender);
3399 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3402 regtail(pRExC_state, ret, ender);
3406 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3410 RExC_parse--; /* for vFAIL to print correctly */
3411 vFAIL("Sequence (? incomplete");
3415 parse_flags: /* (?i) */
3416 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3417 /* (?g), (?gc) and (?o) are useless here
3418 and must be globally applied -- japhy */
3420 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3421 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3422 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3423 if (! (wastedflags & wflagbit) ) {
3424 wastedflags |= wflagbit;
3427 "Useless (%s%c) - %suse /%c modifier",
3428 flagsp == &negflags ? "?-" : "?",
3430 flagsp == &negflags ? "don't " : "",
3436 else if (*RExC_parse == 'c') {
3437 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3438 if (! (wastedflags & WASTED_C) ) {
3439 wastedflags |= WASTED_GC;
3442 "Useless (%sc) - %suse /gc modifier",
3443 flagsp == &negflags ? "?-" : "?",
3444 flagsp == &negflags ? "don't " : ""
3449 else { pmflag(flagsp, *RExC_parse); }
3453 if (*RExC_parse == '-') {
3455 wastedflags = 0; /* reset so (?g-c) warns twice */
3459 RExC_flags |= posflags;
3460 RExC_flags &= ~negflags;
3461 if (*RExC_parse == ':') {
3467 if (*RExC_parse != ')') {
3469 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3471 nextchar(pRExC_state);
3479 ret = reganode(pRExC_state, OPEN, parno);
3480 Set_Node_Length(ret, 1); /* MJD */
3481 Set_Node_Offset(ret, RExC_parse); /* MJD */
3488 /* Pick up the branches, linking them together. */
3489 parse_start = RExC_parse; /* MJD */
3490 br = regbranch(pRExC_state, &flags, 1);
3491 /* branch_len = (paren != 0); */
3495 if (*RExC_parse == '|') {
3496 if (!SIZE_ONLY && RExC_extralen) {
3497 reginsert(pRExC_state, BRANCHJ, br);
3500 reginsert(pRExC_state, BRANCH, br);
3501 Set_Node_Length(br, paren != 0);
3502 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3506 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3508 else if (paren == ':') {
3509 *flagp |= flags&SIMPLE;
3511 if (is_open) { /* Starts with OPEN. */
3512 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3514 else if (paren != '?') /* Not Conditional */
3516 *flagp |= flags & (SPSTART | HASWIDTH);
3518 while (*RExC_parse == '|') {
3519 if (!SIZE_ONLY && RExC_extralen) {
3520 ender = reganode(pRExC_state, LONGJMP,0);
3521 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3524 RExC_extralen += 2; /* Account for LONGJMP. */
3525 nextchar(pRExC_state);
3526 br = regbranch(pRExC_state, &flags, 0);
3530 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3534 *flagp |= flags&SPSTART;
3537 if (have_branch || paren != ':') {
3538 /* Make a closing node, and hook it on the end. */
3541 ender = reg_node(pRExC_state, TAIL);
3544 ender = reganode(pRExC_state, CLOSE, parno);
3545 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3546 Set_Node_Length(ender,1); /* MJD */
3552 *flagp &= ~HASWIDTH;
3555 ender = reg_node(pRExC_state, SUCCEED);
3558 ender = reg_node(pRExC_state, END);
3561 regtail(pRExC_state, lastbr, ender);
3564 /* Hook the tails of the branches to the closing node. */
3565 for (br = ret; br != NULL; br = regnext(br)) {
3566 regoptail(pRExC_state, br, ender);
3573 static const char parens[] = "=!<,>";
3575 if (paren && (p = strchr(parens, paren))) {
3576 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3577 int flag = (p - parens) > 1;
3580 node = SUSPEND, flag = 0;
3581 reginsert(pRExC_state, node,ret);
3582 Set_Node_Cur_Length(ret);
3583 Set_Node_Offset(ret, parse_start + 1);
3585 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3589 /* Check for proper termination. */
3591 RExC_flags = oregflags;
3592 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3593 RExC_parse = oregcomp_parse;
3594 vFAIL("Unmatched (");
3597 else if (!paren && RExC_parse < RExC_end) {
3598 if (*RExC_parse == ')') {
3600 vFAIL("Unmatched )");
3603 FAIL("Junk on end of regexp"); /* "Can't happen". */
3611 - regbranch - one alternative of an | operator
3613 * Implements the concatenation operator.
3616 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3619 register regnode *ret;
3620 register regnode *chain = NULL;
3621 register regnode *latest;
3622 I32 flags = 0, c = 0;
3627 if (!SIZE_ONLY && RExC_extralen)
3628 ret = reganode(pRExC_state, BRANCHJ,0);
3630 ret = reg_node(pRExC_state, BRANCH);
3631 Set_Node_Length(ret, 1);
3635 if (!first && SIZE_ONLY)
3636 RExC_extralen += 1; /* BRANCHJ */
3638 *flagp = WORST; /* Tentatively. */
3641 nextchar(pRExC_state);
3642 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3644 latest = regpiece(pRExC_state, &flags);
3645 if (latest == NULL) {
3646 if (flags & TRYAGAIN)
3650 else if (ret == NULL)
3652 *flagp |= flags&HASWIDTH;
3653 if (chain == NULL) /* First piece. */
3654 *flagp |= flags&SPSTART;
3657 regtail(pRExC_state, chain, latest);
3662 if (chain == NULL) { /* Loop ran zero times. */
3663 chain = reg_node(pRExC_state, NOTHING);
3668 *flagp |= flags&SIMPLE;
3675 - regpiece - something followed by possible [*+?]
3677 * Note that the branching code sequences used for ? and the general cases
3678 * of * and + are somewhat optimized: they use the same NOTHING node as
3679 * both the endmarker for their branch list and the body of the last branch.
3680 * It might seem that this node could be dispensed with entirely, but the
3681 * endmarker role is not redundant.
3684 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3687 register regnode *ret;
3689 register char *next;
3691 const char * const origparse = RExC_parse;
3694 I32 max = REG_INFTY;
3697 ret = regatom(pRExC_state, &flags);
3699 if (flags & TRYAGAIN)
3706 if (op == '{' && regcurly(RExC_parse)) {
3707 parse_start = RExC_parse; /* MJD */
3708 next = RExC_parse + 1;
3710 while (isDIGIT(*next) || *next == ',') {
3719 if (*next == '}') { /* got one */
3723 min = atoi(RExC_parse);
3727 maxpos = RExC_parse;
3729 if (!max && *maxpos != '0')
3730 max = REG_INFTY; /* meaning "infinity" */
3731 else if (max >= REG_INFTY)
3732 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3734 nextchar(pRExC_state);
3737 if ((flags&SIMPLE)) {
3738 RExC_naughty += 2 + RExC_naughty / 2;
3739 reginsert(pRExC_state, CURLY, ret);
3740 Set_Node_Offset(ret, parse_start+1); /* MJD */
3741 Set_Node_Cur_Length(ret);
3744 regnode *w = reg_node(pRExC_state, WHILEM);
3747 regtail(pRExC_state, ret, w);
3748 if (!SIZE_ONLY && RExC_extralen) {
3749 reginsert(pRExC_state, LONGJMP,ret);
3750 reginsert(pRExC_state, NOTHING,ret);
3751 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3753 reginsert(pRExC_state, CURLYX,ret);
3755 Set_Node_Offset(ret, parse_start+1);
3756 Set_Node_Length(ret,
3757 op == '{' ? (RExC_parse - parse_start) : 1);
3759 if (!SIZE_ONLY && RExC_extralen)
3760 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3761 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3763 RExC_whilem_seen++, RExC_extralen += 3;
3764 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3772 if (max && max < min)
3773 vFAIL("Can't do {n,m} with n > m");
3775 ARG1_SET(ret, (U16)min);
3776 ARG2_SET(ret, (U16)max);
3788 #if 0 /* Now runtime fix should be reliable. */
3790 /* if this is reinstated, don't forget to put this back into perldiag:
3792 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3794 (F) The part of the regexp subject to either the * or + quantifier
3795 could match an empty string. The {#} shows in the regular
3796 expression about where the problem was discovered.
3800 if (!(flags&HASWIDTH) && op != '?')
3801 vFAIL("Regexp *+ operand could be empty");
3804 parse_start = RExC_parse;
3805 nextchar(pRExC_state);
3807 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3809 if (op == '*' && (flags&SIMPLE)) {
3810 reginsert(pRExC_state, STAR, ret);
3814 else if (op == '*') {
3818 else if (op == '+' && (flags&SIMPLE)) {
3819 reginsert(pRExC_state, PLUS, ret);
3823 else if (op == '+') {
3827 else if (op == '?') {
3832 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3834 "%.*s matches null string many times",
3835 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
3839 if (*RExC_parse == '?') {
3840 nextchar(pRExC_state);
3841 reginsert(pRExC_state, MINMOD, ret);
3842 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3844 if (ISMULT2(RExC_parse)) {
3846 vFAIL("Nested quantifiers");
3853 - regatom - the lowest level
3855 * Optimization: gobbles an entire sequence of ordinary characters so that
3856 * it can turn them into a single node, which is smaller to store and
3857 * faster to run. Backslashed characters are exceptions, each becoming a
3858 * separate node; the code is simpler that way and it's not worth fixing.
3860 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3862 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3865 register regnode *ret = NULL;
3867 char *parse_start = RExC_parse;
3869 *flagp = WORST; /* Tentatively. */
3872 switch (*RExC_parse) {
3874 RExC_seen_zerolen++;
3875 nextchar(pRExC_state);
3876 if (RExC_flags & PMf_MULTILINE)
3877 ret = reg_node(pRExC_state, MBOL);
3878 else if (RExC_flags & PMf_SINGLELINE)
3879 ret = reg_node(pRExC_state, SBOL);
3881 ret = reg_node(pRExC_state, BOL);
3882 Set_Node_Length(ret, 1); /* MJD */
3885 nextchar(pRExC_state);
3887 RExC_seen_zerolen++;
3888 if (RExC_flags & PMf_MULTILINE)
3889 ret = reg_node(pRExC_state, MEOL);
3890 else if (RExC_flags & PMf_SINGLELINE)
3891 ret = reg_node(pRExC_state, SEOL);
3893 ret = reg_node(pRExC_state, EOL);
3894 Set_Node_Length(ret, 1); /* MJD */
3897 nextchar(pRExC_state);
3898 if (RExC_flags & PMf_SINGLELINE)
3899 ret = reg_node(pRExC_state, SANY);
3901 ret = reg_node(pRExC_state, REG_ANY);
3902 *flagp |= HASWIDTH|SIMPLE;
3904 Set_Node_Length(ret, 1); /* MJD */
3908 char *oregcomp_parse = ++RExC_parse;
3909 ret = regclass(pRExC_state);
3910 if (*RExC_parse != ']') {
3911 RExC_parse = oregcomp_parse;
3912 vFAIL("Unmatched [");
3914 nextchar(pRExC_state);
3915 *flagp |= HASWIDTH|SIMPLE;
3916 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3920 nextchar(pRExC_state);
3921 ret = reg(pRExC_state, 1, &flags);
3923 if (flags & TRYAGAIN) {
3924 if (RExC_parse == RExC_end) {
3925 /* Make parent create an empty node if needed. */
3933 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3937 if (flags & TRYAGAIN) {
3941 vFAIL("Internal urp");
3942 /* Supposed to be caught earlier. */
3945 if (!regcurly(RExC_parse)) {
3954 vFAIL("Quantifier follows nothing");
3957 switch (*++RExC_parse) {
3959 RExC_seen_zerolen++;
3960 ret = reg_node(pRExC_state, SBOL);
3962 nextchar(pRExC_state);
3963 Set_Node_Length(ret, 2); /* MJD */
3966 ret = reg_node(pRExC_state, GPOS);
3967 RExC_seen |= REG_SEEN_GPOS;
3969 nextchar(pRExC_state);
3970 Set_Node_Length(ret, 2); /* MJD */
3973 ret = reg_node(pRExC_state, SEOL);
3975 RExC_seen_zerolen++; /* Do not optimize RE away */
3976 nextchar(pRExC_state);
3979 ret = reg_node(pRExC_state, EOS);
3981 RExC_seen_zerolen++; /* Do not optimize RE away */
3982 nextchar(pRExC_state);
3983 Set_Node_Length(ret, 2); /* MJD */
3986 ret = reg_node(pRExC_state, CANY);
3987 RExC_seen |= REG_SEEN_CANY;
3988 *flagp |= HASWIDTH|SIMPLE;
3989 nextchar(pRExC_state);
3990 Set_Node_Length(ret, 2); /* MJD */
3993 ret = reg_node(pRExC_state, CLUMP);
3995 nextchar(pRExC_state);
3996 Set_Node_Length(ret, 2); /* MJD */
3999 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4000 *flagp |= HASWIDTH|SIMPLE;
4001 nextchar(pRExC_state);
4002 Set_Node_Length(ret, 2); /* MJD */
4005 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4006 *flagp |= HASWIDTH|SIMPLE;
4007 nextchar(pRExC_state);
4008 Set_Node_Length(ret, 2); /* MJD */
4011 RExC_seen_zerolen++;
4012 RExC_seen |= REG_SEEN_LOOKBEHIND;
4013 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4015 nextchar(pRExC_state);
4016 Set_Node_Length(ret, 2); /* MJD */
4019 RExC_seen_zerolen++;
4020 RExC_seen |= REG_SEEN_LOOKBEHIND;
4021 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4023 nextchar(pRExC_state);
4024 Set_Node_Length(ret, 2); /* MJD */
4027 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4028 *flagp |= HASWIDTH|SIMPLE;
4029 nextchar(pRExC_state);
4030 Set_Node_Length(ret, 2); /* MJD */
4033 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4034 *flagp |= HASWIDTH|SIMPLE;
4035 nextchar(pRExC_state);
4036 Set_Node_Length(ret, 2); /* MJD */
4039 ret = reg_node(pRExC_state, DIGIT);
4040 *flagp |= HASWIDTH|SIMPLE;
4041 nextchar(pRExC_state);
4042 Set_Node_Length(ret, 2); /* MJD */
4045 ret = reg_node(pRExC_state, NDIGIT);
4046 *flagp |= HASWIDTH|SIMPLE;
4047 nextchar(pRExC_state);
4048 Set_Node_Length(ret, 2); /* MJD */
4053 char* oldregxend = RExC_end;
4054 char* parse_start = RExC_parse - 2;
4056 if (RExC_parse[1] == '{') {
4057 /* a lovely hack--pretend we saw [\pX] instead */
4058 RExC_end = strchr(RExC_parse, '}');
4060 U8 c = (U8)*RExC_parse;
4062 RExC_end = oldregxend;
4063 vFAIL2("Missing right brace on \\%c{}", c);
4068 RExC_end = RExC_parse + 2;
4069 if (RExC_end > oldregxend)
4070 RExC_end = oldregxend;
4074 ret = regclass(pRExC_state);
4076 RExC_end = oldregxend;
4079 Set_Node_Offset(ret, parse_start + 2);
4080 Set_Node_Cur_Length(ret);
4081 nextchar(pRExC_state);
4082 *flagp |= HASWIDTH|SIMPLE;
4095 case '1': case '2': case '3': case '4':
4096 case '5': case '6': case '7': case '8': case '9':
4098 const I32 num = atoi(RExC_parse);
4100 if (num > 9 && num >= RExC_npar)
4103 char * parse_start = RExC_parse - 1; /* MJD */
4104 while (isDIGIT(*RExC_parse))
4107 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4108 vFAIL("Reference to nonexistent group");
4110 ret = reganode(pRExC_state,
4111 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4115 /* override incorrect value set in reganode MJD */
4116 Set_Node_Offset(ret, parse_start+1);
4117 Set_Node_Cur_Length(ret); /* MJD */
4119 nextchar(pRExC_state);
4124 if (RExC_parse >= RExC_end)
4125 FAIL("Trailing \\");
4128 /* Do not generate "unrecognized" warnings here, we fall
4129 back into the quick-grab loop below */
4136 if (RExC_flags & PMf_EXTENDED) {
4137 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4138 if (RExC_parse < RExC_end)
4144 register STRLEN len;
4149 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4151 parse_start = RExC_parse - 1;
4157 ret = reg_node(pRExC_state,
4158 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4160 for (len = 0, p = RExC_parse - 1;
4161 len < 127 && p < RExC_end;
4166 if (RExC_flags & PMf_EXTENDED)
4167 p = regwhite(p, RExC_end);
4214 ender = ASCII_TO_NATIVE('\033');
4218 ender = ASCII_TO_NATIVE('\007');
4223 char* const e = strchr(p, '}');
4227 vFAIL("Missing right brace on \\x{}");
4230 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4231 | PERL_SCAN_DISALLOW_PREFIX;
4232 STRLEN numlen = e - p - 1;
4233 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4240 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4242 ender = grok_hex(p, &numlen, &flags, NULL);
4248 ender = UCHARAT(p++);
4249 ender = toCTRL(ender);
4251 case '0': case '1': case '2': case '3':case '4':
4252 case '5': case '6': case '7': case '8':case '9':
4254 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4257 ender = grok_oct(p, &numlen, &flags, NULL);
4267 FAIL("Trailing \\");
4270 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4271 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4272 goto normal_default;
4277 if (UTF8_IS_START(*p) && UTF) {
4279 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4280 &numlen, UTF8_ALLOW_DEFAULT);
4287 if (RExC_flags & PMf_EXTENDED)
4288 p = regwhite(p, RExC_end);
4290 /* Prime the casefolded buffer. */
4291 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4293 if (ISMULT2(p)) { /* Back off on ?+*. */
4300 /* Emit all the Unicode characters. */
4302 for (foldbuf = tmpbuf;
4304 foldlen -= numlen) {
4305 ender = utf8_to_uvchr(foldbuf, &numlen);
4307 reguni(pRExC_state, ender, s, &unilen);
4310 /* In EBCDIC the numlen
4311 * and unilen can differ. */
4313 if (numlen >= foldlen)
4317 break; /* "Can't happen." */
4321 reguni(pRExC_state, ender, s, &unilen);
4330 REGC((char)ender, s++);
4338 /* Emit all the Unicode characters. */
4340 for (foldbuf = tmpbuf;
4342 foldlen -= numlen) {
4343 ender = utf8_to_uvchr(foldbuf, &numlen);
4345 reguni(pRExC_state, ender, s, &unilen);
4348 /* In EBCDIC the numlen
4349 * and unilen can differ. */
4351 if (numlen >= foldlen)
4359 reguni(pRExC_state, ender, s, &unilen);
4368 REGC((char)ender, s++);
4372 Set_Node_Cur_Length(ret); /* MJD */
4373 nextchar(pRExC_state);
4375 /* len is STRLEN which is unsigned, need to copy to signed */
4378 vFAIL("Internal disaster");
4382 if (len == 1 && UNI_IS_INVARIANT(ender))
4387 RExC_size += STR_SZ(len);
4389 RExC_emit += STR_SZ(len);
4394 /* If the encoding pragma is in effect recode the text of
4395 * any EXACT-kind nodes. */
4396 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4397 STRLEN oldlen = STR_LEN(ret);
4398 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4402 if (sv_utf8_downgrade(sv, TRUE)) {
4403 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4404 const STRLEN newlen = SvCUR(sv);
4409 GET_RE_DEBUG_FLAGS_DECL;
4410 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4411 (int)oldlen, STRING(ret),
4413 Copy(s, STRING(ret), newlen, char);
4414 STR_LEN(ret) += newlen - oldlen;
4415 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4417 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4425 S_regwhite(char *p, const char *e)
4430 else if (*p == '#') {
4433 } while (p < e && *p != '\n');
4441 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4442 Character classes ([:foo:]) can also be negated ([:^foo:]).
4443 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4444 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4445 but trigger failures because they are currently unimplemented. */
4447 #define POSIXCC_DONE(c) ((c) == ':')
4448 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4449 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4452 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4455 I32 namedclass = OOB_NAMEDCLASS;
4457 if (value == '[' && RExC_parse + 1 < RExC_end &&
4458 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4459 POSIXCC(UCHARAT(RExC_parse))) {
4460 const char c = UCHARAT(RExC_parse);
4461 char* const s = RExC_parse++;
4463 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4465 if (RExC_parse == RExC_end)
4466 /* Grandfather lone [:, [=, [. */
4469 const char* t = RExC_parse++; /* skip over the c */
4470 const char *posixcc;
4474 if (UCHARAT(RExC_parse) == ']') {
4475 RExC_parse++; /* skip over the ending ] */
4478 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4479 const I32 skip = t - posixcc;
4481 /* Initially switch on the length of the name. */
4484 if (memEQ(posixcc, "word", 4)) {
4485 /* this is not POSIX, this is the Perl \w */;
4487 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4491 /* Names all of length 5. */
4492 /* alnum alpha ascii blank cntrl digit graph lower
4493 print punct space upper */
4494 /* Offset 4 gives the best switch position. */
4495 switch (posixcc[4]) {
4497 if (memEQ(posixcc, "alph", 4)) {
4500 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4504 if (memEQ(posixcc, "spac", 4)) {
4507 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4511 if (memEQ(posixcc, "grap", 4)) {
4514 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4518 if (memEQ(posixcc, "asci", 4)) {
4521 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4525 if (memEQ(posixcc, "blan", 4)) {
4528 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4532 if (memEQ(posixcc, "cntr", 4)) {
4535 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4539 if (memEQ(posixcc, "alnu", 4)) {
4542 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4546 if (memEQ(posixcc, "lowe", 4)) {
4549 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4551 if (memEQ(posixcc, "uppe", 4)) {
4554 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4558 if (memEQ(posixcc, "digi", 4)) {
4561 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4563 if (memEQ(posixcc, "prin", 4)) {
4566 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4568 if (memEQ(posixcc, "punc", 4)) {
4571 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4577 if (memEQ(posixcc, "xdigit", 6)) {
4579 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4584 if (namedclass == OOB_NAMEDCLASS)
4586 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4589 assert (posixcc[skip] == ':');
4590 assert (posixcc[skip+1] == ']');
4591 } else if (!SIZE_ONLY) {
4592 /* [[=foo=]] and [[.foo.]] are still future. */
4594 /* adjust RExC_parse so the warning shows after
4596 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4598 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4601 /* Maternal grandfather:
4602 * "[:" ending in ":" but not in ":]" */
4612 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4615 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4616 const char *s = RExC_parse;
4617 const char c = *s++;
4619 while(*s && isALNUM(*s))
4621 if (*s && c == *s && s[1] == ']') {
4622 if (ckWARN(WARN_REGEXP))
4624 "POSIX syntax [%c %c] belongs inside character classes",
4627 /* [[=foo=]] and [[.foo.]] are still future. */
4628 if (POSIXCC_NOTYET(c)) {
4629 /* adjust RExC_parse so the error shows after
4631 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4633 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4640 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4644 register UV nextvalue;
4645 register IV prevvalue = OOB_UNICODE;
4646 register IV range = 0;
4647 register regnode *ret;
4650 char *rangebegin = NULL;
4651 bool need_class = 0;
4655 bool optimize_invert = TRUE;
4656 AV* unicode_alternate = NULL;
4658 UV literal_endpoint = 0;
4661 ret = reganode(pRExC_state, ANYOF, 0);
4664 ANYOF_FLAGS(ret) = 0;
4666 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4670 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4674 RExC_size += ANYOF_SKIP;
4675 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4678 RExC_emit += ANYOF_SKIP;
4680 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4682 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4683 ANYOF_BITMAP_ZERO(ret);
4684 listsv = newSVpvs("# comment\n");
4687 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4689 if (!SIZE_ONLY && POSIXCC(nextvalue))
4690 checkposixcc(pRExC_state);
4692 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4693 if (UCHARAT(RExC_parse) == ']')
4696 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4700 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4703 rangebegin = RExC_parse;
4705 value = utf8n_to_uvchr((U8*)RExC_parse,
4706 RExC_end - RExC_parse,
4707 &numlen, UTF8_ALLOW_DEFAULT);
4708 RExC_parse += numlen;
4711 value = UCHARAT(RExC_parse++);
4712 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4713 if (value == '[' && POSIXCC(nextvalue))
4714 namedclass = regpposixcc(pRExC_state, value);
4715 else if (value == '\\') {
4717 value = utf8n_to_uvchr((U8*)RExC_parse,
4718 RExC_end - RExC_parse,
4719 &numlen, UTF8_ALLOW_DEFAULT);
4720 RExC_parse += numlen;
4723 value = UCHARAT(RExC_parse++);
4724 /* Some compilers cannot handle switching on 64-bit integer
4725 * values, therefore value cannot be an UV. Yes, this will
4726 * be a problem later if we want switch on Unicode.
4727 * A similar issue a little bit later when switching on
4728 * namedclass. --jhi */
4729 switch ((I32)value) {
4730 case 'w': namedclass = ANYOF_ALNUM; break;
4731 case 'W': namedclass = ANYOF_NALNUM; break;
4732 case 's': namedclass = ANYOF_SPACE; break;
4733 case 'S': namedclass = ANYOF_NSPACE; break;
4734 case 'd': namedclass = ANYOF_DIGIT; break;
4735 case 'D': namedclass = ANYOF_NDIGIT; break;
4738 if (RExC_parse >= RExC_end)
4739 vFAIL2("Empty \\%c{}", (U8)value);
4740 if (*RExC_parse == '{') {
4741 const U8 c = (U8)value;
4742 e = strchr(RExC_parse++, '}');
4744 vFAIL2("Missing right brace on \\%c{}", c);
4745 while (isSPACE(UCHARAT(RExC_parse)))
4747 if (e == RExC_parse)
4748 vFAIL2("Empty \\%c{}", c);
4750 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4758 if (UCHARAT(RExC_parse) == '^') {
4761 value = value == 'p' ? 'P' : 'p'; /* toggle */
4762 while (isSPACE(UCHARAT(RExC_parse))) {
4767 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
4768 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
4771 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4772 namedclass = ANYOF_MAX; /* no official name, but it's named */
4774 case 'n': value = '\n'; break;
4775 case 'r': value = '\r'; break;
4776 case 't': value = '\t'; break;
4777 case 'f': value = '\f'; break;
4778 case 'b': value = '\b'; break;
4779 case 'e': value = ASCII_TO_NATIVE('\033');break;
4780 case 'a': value = ASCII_TO_NATIVE('\007');break;
4782 if (*RExC_parse == '{') {
4783 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4784 | PERL_SCAN_DISALLOW_PREFIX;
4785 e = strchr(RExC_parse++, '}');
4787 vFAIL("Missing right brace on \\x{}");
4789 numlen = e - RExC_parse;
4790 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4794 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4796 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4797 RExC_parse += numlen;
4801 value = UCHARAT(RExC_parse++);
4802 value = toCTRL(value);
4804 case '0': case '1': case '2': case '3': case '4':
4805 case '5': case '6': case '7': case '8': case '9':
4809 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4810 RExC_parse += numlen;
4814 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4816 "Unrecognized escape \\%c in character class passed through",
4820 } /* end of \blah */
4826 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4828 if (!SIZE_ONLY && !need_class)
4829 ANYOF_CLASS_ZERO(ret);
4833 /* a bad range like a-\d, a-[:digit:] ? */
4836 if (ckWARN(WARN_REGEXP)) {
4838 RExC_parse >= rangebegin ?
4839 RExC_parse - rangebegin : 0;
4841 "False [] range \"%*.*s\"",
4844 if (prevvalue < 256) {
4845 ANYOF_BITMAP_SET(ret, prevvalue);
4846 ANYOF_BITMAP_SET(ret, '-');
4849 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4850 Perl_sv_catpvf(aTHX_ listsv,
4851 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4855 range = 0; /* this was not a true range */
4859 const char *what = NULL;
4862 if (namedclass > OOB_NAMEDCLASS)
4863 optimize_invert = FALSE;
4864 /* Possible truncation here but in some 64-bit environments
4865 * the compiler gets heartburn about switch on 64-bit values.
4866 * A similar issue a little earlier when switching on value.
4868 switch ((I32)namedclass) {
4871 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4873 for (value = 0; value < 256; value++)
4875 ANYOF_BITMAP_SET(ret, value);
4882 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4884 for (value = 0; value < 256; value++)
4885 if (!isALNUM(value))
4886 ANYOF_BITMAP_SET(ret, value);
4893 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4895 for (value = 0; value < 256; value++)
4896 if (isALNUMC(value))
4897 ANYOF_BITMAP_SET(ret, value);
4904 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4906 for (value = 0; value < 256; value++)
4907 if (!isALNUMC(value))
4908 ANYOF_BITMAP_SET(ret, value);
4915 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4917 for (value = 0; value < 256; value++)
4919 ANYOF_BITMAP_SET(ret, value);
4926 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4928 for (value = 0; value < 256; value++)
4929 if (!isALPHA(value))
4930 ANYOF_BITMAP_SET(ret, value);
4937 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4940 for (value = 0; value < 128; value++)
4941 ANYOF_BITMAP_SET(ret, value);
4943 for (value = 0; value < 256; value++) {
4945 ANYOF_BITMAP_SET(ret, value);
4954 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4957 for (value = 128; value < 256; value++)
4958 ANYOF_BITMAP_SET(ret, value);
4960 for (value = 0; value < 256; value++) {
4961 if (!isASCII(value))
4962 ANYOF_BITMAP_SET(ret, value);
4971 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4973 for (value = 0; value < 256; value++)
4975 ANYOF_BITMAP_SET(ret, value);
4982 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4984 for (value = 0; value < 256; value++)
4985 if (!isBLANK(value))
4986 ANYOF_BITMAP_SET(ret, value);
4993 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
4995 for (value = 0; value < 256; value++)
4997 ANYOF_BITMAP_SET(ret, value);
5004 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5006 for (value = 0; value < 256; value++)
5007 if (!isCNTRL(value))
5008 ANYOF_BITMAP_SET(ret, value);
5015 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5017 /* consecutive digits assumed */
5018 for (value = '0'; value <= '9'; value++)
5019 ANYOF_BITMAP_SET(ret, value);
5026 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5028 /* consecutive digits assumed */
5029 for (value = 0; value < '0'; value++)
5030 ANYOF_BITMAP_SET(ret, value);
5031 for (value = '9' + 1; value < 256; value++)
5032 ANYOF_BITMAP_SET(ret, value);
5039 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5041 for (value = 0; value < 256; value++)
5043 ANYOF_BITMAP_SET(ret, value);
5050 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5052 for (value = 0; value < 256; value++)
5053 if (!isGRAPH(value))
5054 ANYOF_BITMAP_SET(ret, value);
5061 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5063 for (value = 0; value < 256; value++)
5065 ANYOF_BITMAP_SET(ret, value);
5072 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5074 for (value = 0; value < 256; value++)
5075 if (!isLOWER(value))
5076 ANYOF_BITMAP_SET(ret, value);
5083 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5085 for (value = 0; value < 256; value++)
5087 ANYOF_BITMAP_SET(ret, value);
5094 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5096 for (value = 0; value < 256; value++)
5097 if (!isPRINT(value))
5098 ANYOF_BITMAP_SET(ret, value);
5105 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5107 for (value = 0; value < 256; value++)
5108 if (isPSXSPC(value))
5109 ANYOF_BITMAP_SET(ret, value);
5116 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5118 for (value = 0; value < 256; value++)
5119 if (!isPSXSPC(value))
5120 ANYOF_BITMAP_SET(ret, value);
5127 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5129 for (value = 0; value < 256; value++)
5131 ANYOF_BITMAP_SET(ret, value);
5138 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5140 for (value = 0; value < 256; value++)
5141 if (!isPUNCT(value))
5142 ANYOF_BITMAP_SET(ret, value);
5149 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5151 for (value = 0; value < 256; value++)
5153 ANYOF_BITMAP_SET(ret, value);
5160 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5162 for (value = 0; value < 256; value++)
5163 if (!isSPACE(value))
5164 ANYOF_BITMAP_SET(ret, value);
5171 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5173 for (value = 0; value < 256; value++)
5175 ANYOF_BITMAP_SET(ret, value);
5182 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5184 for (value = 0; value < 256; value++)
5185 if (!isUPPER(value))
5186 ANYOF_BITMAP_SET(ret, value);
5193 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5195 for (value = 0; value < 256; value++)
5196 if (isXDIGIT(value))
5197 ANYOF_BITMAP_SET(ret, value);
5204 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5206 for (value = 0; value < 256; value++)
5207 if (!isXDIGIT(value))
5208 ANYOF_BITMAP_SET(ret, value);
5214 /* this is to handle \p and \P */
5217 vFAIL("Invalid [::] class");
5221 /* Strings such as "+utf8::isWord\n" */
5222 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5225 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5228 } /* end of namedclass \blah */
5231 if (prevvalue > (IV)value) /* b-a */ {
5232 const int w = RExC_parse - rangebegin;
5233 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5234 range = 0; /* not a valid range */
5238 prevvalue = value; /* save the beginning of the range */
5239 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5240 RExC_parse[1] != ']') {
5243 /* a bad range like \w-, [:word:]- ? */
5244 if (namedclass > OOB_NAMEDCLASS) {
5245 if (ckWARN(WARN_REGEXP)) {
5247 RExC_parse >= rangebegin ?
5248 RExC_parse - rangebegin : 0;
5250 "False [] range \"%*.*s\"",
5254 ANYOF_BITMAP_SET(ret, '-');
5256 range = 1; /* yeah, it's a range! */
5257 continue; /* but do it the next time */
5261 /* now is the next time */
5265 if (prevvalue < 256) {
5266 const IV ceilvalue = value < 256 ? value : 255;
5269 /* In EBCDIC [\x89-\x91] should include
5270 * the \x8e but [i-j] should not. */
5271 if (literal_endpoint == 2 &&
5272 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5273 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5275 if (isLOWER(prevvalue)) {
5276 for (i = prevvalue; i <= ceilvalue; i++)
5278 ANYOF_BITMAP_SET(ret, i);
5280 for (i = prevvalue; i <= ceilvalue; i++)
5282 ANYOF_BITMAP_SET(ret, i);
5287 for (i = prevvalue; i <= ceilvalue; i++)
5288 ANYOF_BITMAP_SET(ret, i);
5290 if (value > 255 || UTF) {
5291 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5292 const UV natvalue = NATIVE_TO_UNI(value);
5294 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5295 if (prevnatvalue < natvalue) { /* what about > ? */
5296 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5297 prevnatvalue, natvalue);
5299 else if (prevnatvalue == natvalue) {
5300 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5302 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5304 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5306 /* If folding and foldable and a single
5307 * character, insert also the folded version
5308 * to the charclass. */
5310 if (foldlen == (STRLEN)UNISKIP(f))
5311 Perl_sv_catpvf(aTHX_ listsv,
5314 /* Any multicharacter foldings
5315 * require the following transform:
5316 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5317 * where E folds into "pq" and F folds
5318 * into "rst", all other characters
5319 * fold to single characters. We save
5320 * away these multicharacter foldings,
5321 * to be later saved as part of the
5322 * additional "s" data. */
5325 if (!unicode_alternate)
5326 unicode_alternate = newAV();
5327 sv = newSVpvn((char*)foldbuf, foldlen);
5329 av_push(unicode_alternate, sv);
5333 /* If folding and the value is one of the Greek
5334 * sigmas insert a few more sigmas to make the
5335 * folding rules of the sigmas to work right.
5336 * Note that not all the possible combinations
5337 * are handled here: some of them are handled
5338 * by the standard folding rules, and some of
5339 * them (literal or EXACTF cases) are handled
5340 * during runtime in regexec.c:S_find_byclass(). */
5341 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5342 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5343 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5344 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5345 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5347 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5348 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5349 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5354 literal_endpoint = 0;
5358 range = 0; /* this range (if it was one) is done now */
5362 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5364 RExC_size += ANYOF_CLASS_ADD_SKIP;
5366 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5369 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5371 /* If the only flag is folding (plus possibly inversion). */
5372 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5374 for (value = 0; value < 256; ++value) {
5375 if (ANYOF_BITMAP_TEST(ret, value)) {
5376 UV fold = PL_fold[value];
5379 ANYOF_BITMAP_SET(ret, fold);
5382 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5385 /* optimize inverted simple patterns (e.g. [^a-z]) */
5386 if (!SIZE_ONLY && optimize_invert &&
5387 /* If the only flag is inversion. */
5388 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5389 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5390 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5391 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5395 AV * const av = newAV();
5398 /* The 0th element stores the character class description
5399 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5400 * to initialize the appropriate swash (which gets stored in
5401 * the 1st element), and also useful for dumping the regnode.
5402 * The 2nd element stores the multicharacter foldings,
5403 * used later (regexec.c:S_reginclass()). */
5404 av_store(av, 0, listsv);
5405 av_store(av, 1, NULL);
5406 av_store(av, 2, (SV*)unicode_alternate);
5407 rv = newRV_noinc((SV*)av);
5408 n = add_data(pRExC_state, 1, "s");
5409 RExC_rx->data->data[n] = (void*)rv;
5417 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5419 char* const retval = RExC_parse++;
5422 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5423 RExC_parse[2] == '#') {
5424 while (*RExC_parse != ')') {
5425 if (RExC_parse == RExC_end)
5426 FAIL("Sequence (?#... not terminated");
5432 if (RExC_flags & PMf_EXTENDED) {
5433 if (isSPACE(*RExC_parse)) {
5437 else if (*RExC_parse == '#') {
5438 while (RExC_parse < RExC_end)
5439 if (*RExC_parse++ == '\n') break;
5448 - reg_node - emit a node
5450 STATIC regnode * /* Location. */
5451 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5454 register regnode *ptr;
5455 regnode * const ret = RExC_emit;
5458 SIZE_ALIGN(RExC_size);
5463 NODE_ALIGN_FILL(ret);
5465 FILL_ADVANCE_NODE(ptr, op);
5466 if (RExC_offsets) { /* MJD */
5467 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5468 "reg_node", __LINE__,
5470 RExC_emit - RExC_emit_start > RExC_offsets[0]
5471 ? "Overwriting end of array!\n" : "OK",
5472 RExC_emit - RExC_emit_start,
5473 RExC_parse - RExC_start,
5475 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5484 - reganode - emit a node with an argument
5486 STATIC regnode * /* Location. */
5487 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5490 register regnode *ptr;
5491 regnode * const ret = RExC_emit;
5494 SIZE_ALIGN(RExC_size);
5499 NODE_ALIGN_FILL(ret);
5501 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5502 if (RExC_offsets) { /* MJD */
5503 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5507 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5508 "Overwriting end of array!\n" : "OK",
5509 RExC_emit - RExC_emit_start,
5510 RExC_parse - RExC_start,
5512 Set_Cur_Node_Offset;
5521 - reguni - emit (if appropriate) a Unicode character
5524 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5527 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5531 - reginsert - insert an operator in front of already-emitted operand
5533 * Means relocating the operand.
5536 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5539 register regnode *src;
5540 register regnode *dst;
5541 register regnode *place;
5542 const int offset = regarglen[(U8)op];
5544 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5547 RExC_size += NODE_STEP_REGNODE + offset;
5552 RExC_emit += NODE_STEP_REGNODE + offset;
5554 while (src > opnd) {
5555 StructCopy(--src, --dst, regnode);
5556 if (RExC_offsets) { /* MJD 20010112 */
5557 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5561 dst - RExC_emit_start > RExC_offsets[0]
5562 ? "Overwriting end of array!\n" : "OK",
5563 src - RExC_emit_start,
5564 dst - RExC_emit_start,
5566 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5567 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5572 place = opnd; /* Op node, where operand used to be. */
5573 if (RExC_offsets) { /* MJD */
5574 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5578 place - RExC_emit_start > RExC_offsets[0]
5579 ? "Overwriting end of array!\n" : "OK",
5580 place - RExC_emit_start,
5581 RExC_parse - RExC_start,
5583 Set_Node_Offset(place, RExC_parse);
5584 Set_Node_Length(place, 1);
5586 src = NEXTOPER(place);
5587 FILL_ADVANCE_NODE(place, op);
5588 Zero(src, offset, regnode);
5592 - regtail - set the next-pointer at the end of a node chain of p to val.
5594 /* TODO: All three parms should be const */
5596 S_regtail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
5599 register regnode *scan;
5604 /* Find last node. */
5607 regnode * const temp = regnext(scan);
5613 if (reg_off_by_arg[OP(scan)]) {
5614 ARG_SET(scan, val - scan);
5617 NEXT_OFF(scan) = val - scan;
5622 - regoptail - regtail on operand of first argument; nop if operandless
5624 /* TODO: All three parms should be const */
5626 S_regoptail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
5629 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5630 if (p == NULL || SIZE_ONLY)
5632 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5633 regtail(pRExC_state, NEXTOPER(p), val);
5635 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5636 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5643 - regcurly - a little FSA that accepts {\d+,?\d*}
5646 S_regcurly(register const char *s)
5665 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5668 Perl_regdump(pTHX_ const regexp *r)
5672 SV * const sv = sv_newmortal();
5674 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
5676 /* Header fields of interest. */
5677 if (r->anchored_substr)
5678 PerlIO_printf(Perl_debug_log,
5679 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5681 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5682 SvPVX_const(r->anchored_substr),
5684 SvTAIL(r->anchored_substr) ? "$" : "",
5685 (IV)r->anchored_offset);
5686 else if (r->anchored_utf8)
5687 PerlIO_printf(Perl_debug_log,
5688 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5690 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5691 SvPVX_const(r->anchored_utf8),
5693 SvTAIL(r->anchored_utf8) ? "$" : "",
5694 (IV)r->anchored_offset);
5695 if (r->float_substr)
5696 PerlIO_printf(Perl_debug_log,
5697 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5699 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5700 SvPVX_const(r->float_substr),
5702 SvTAIL(r->float_substr) ? "$" : "",
5703 (IV)r->float_min_offset, (UV)r->float_max_offset);
5704 else if (r->float_utf8)
5705 PerlIO_printf(Perl_debug_log,
5706 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5708 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5709 SvPVX_const(r->float_utf8),
5711 SvTAIL(r->float_utf8) ? "$" : "",
5712 (IV)r->float_min_offset, (UV)r->float_max_offset);
5713 if (r->check_substr || r->check_utf8)
5714 PerlIO_printf(Perl_debug_log,
5715 r->check_substr == r->float_substr
5716 && r->check_utf8 == r->float_utf8
5717 ? "(checking floating" : "(checking anchored");
5718 if (r->reganch & ROPT_NOSCAN)
5719 PerlIO_printf(Perl_debug_log, " noscan");
5720 if (r->reganch & ROPT_CHECK_ALL)
5721 PerlIO_printf(Perl_debug_log, " isall");
5722 if (r->check_substr || r->check_utf8)
5723 PerlIO_printf(Perl_debug_log, ") ");
5725 if (r->regstclass) {
5726 regprop(r, sv, r->regstclass);
5727 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5729 if (r->reganch & ROPT_ANCH) {
5730 PerlIO_printf(Perl_debug_log, "anchored");
5731 if (r->reganch & ROPT_ANCH_BOL)
5732 PerlIO_printf(Perl_debug_log, "(BOL)");
5733 if (r->reganch & ROPT_ANCH_MBOL)
5734 PerlIO_printf(Perl_debug_log, "(MBOL)");
5735 if (r->reganch & ROPT_ANCH_SBOL)
5736 PerlIO_printf(Perl_debug_log, "(SBOL)");
5737 if (r->reganch & ROPT_ANCH_GPOS)
5738 PerlIO_printf(Perl_debug_log, "(GPOS)");
5739 PerlIO_putc(Perl_debug_log, ' ');
5741 if (r->reganch & ROPT_GPOS_SEEN)
5742 PerlIO_printf(Perl_debug_log, "GPOS ");
5743 if (r->reganch & ROPT_SKIP)
5744 PerlIO_printf(Perl_debug_log, "plus ");
5745 if (r->reganch & ROPT_IMPLICIT)
5746 PerlIO_printf(Perl_debug_log, "implicit ");
5747 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5748 if (r->reganch & ROPT_EVAL_SEEN)
5749 PerlIO_printf(Perl_debug_log, "with eval ");
5750 PerlIO_printf(Perl_debug_log, "\n");
5752 const U32 len = r->offsets[0];
5753 GET_RE_DEBUG_FLAGS_DECL;
5756 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5757 for (i = 1; i <= len; i++)
5758 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5759 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5760 PerlIO_printf(Perl_debug_log, "\n");
5764 PERL_UNUSED_CONTEXT;
5766 #endif /* DEBUGGING */
5770 - regprop - printable representation of opcode
5773 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
5779 sv_setpvn(sv, "", 0);
5780 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5781 /* It would be nice to FAIL() here, but this may be called from
5782 regexec.c, and it would be hard to supply pRExC_state. */
5783 Perl_croak(aTHX_ "Corrupted regexp opcode");
5784 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5786 k = PL_regkind[(U8)OP(o)];
5789 SV * const dsv = sv_2mortal(newSVpvs(""));
5790 /* Using is_utf8_string() is a crude hack but it may
5791 * be the best for now since we have no flag "this EXACTish
5792 * node was UTF-8" --jhi */
5793 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5794 const char * const s = do_utf8 ?
5795 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5796 UNI_DISPLAY_REGEX) :
5798 const int len = do_utf8 ?
5801 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5805 } else if (k == TRIE) {
5807 /* print the details od the trie in dumpuntil instead, as
5808 * prog->data isn't available here */
5809 } else if (k == CURLY) {
5810 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5811 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5812 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5814 else if (k == WHILEM && o->flags) /* Ordinal/of */
5815 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5816 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5817 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5818 else if (k == LOGICAL)
5819 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5820 else if (k == ANYOF) {
5821 int i, rangestart = -1;
5822 const U8 flags = ANYOF_FLAGS(o);
5824 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5825 static const char * const anyofs[] = {
5858 if (flags & ANYOF_LOCALE)
5859 sv_catpvs(sv, "{loc}");
5860 if (flags & ANYOF_FOLD)
5861 sv_catpvs(sv, "{i}");
5862 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5863 if (flags & ANYOF_INVERT)
5865 for (i = 0; i <= 256; i++) {
5866 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5867 if (rangestart == -1)
5869 } else if (rangestart != -1) {
5870 if (i <= rangestart + 3)
5871 for (; rangestart < i; rangestart++)
5872 put_byte(sv, rangestart);
5874 put_byte(sv, rangestart);
5876 put_byte(sv, i - 1);
5882 if (o->flags & ANYOF_CLASS)
5883 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5884 if (ANYOF_CLASS_TEST(o,i))
5885 sv_catpv(sv, anyofs[i]);
5887 if (flags & ANYOF_UNICODE)
5888 sv_catpvs(sv, "{unicode}");
5889 else if (flags & ANYOF_UNICODE_ALL)
5890 sv_catpvs(sv, "{unicode_all}");
5894 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
5898 U8 s[UTF8_MAXBYTES_CASE+1];
5900 for (i = 0; i <= 256; i++) { /* just the first 256 */
5901 uvchr_to_utf8(s, i);
5903 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5904 if (rangestart == -1)
5906 } else if (rangestart != -1) {
5907 if (i <= rangestart + 3)
5908 for (; rangestart < i; rangestart++) {
5909 const U8 * const e = uvchr_to_utf8(s,rangestart);
5911 for(p = s; p < e; p++)
5915 const U8 *e = uvchr_to_utf8(s,rangestart);
5917 for (p = s; p < e; p++)
5920 e = uvchr_to_utf8(s, i-1);
5921 for (p = s; p < e; p++)
5928 sv_catpvs(sv, "..."); /* et cetera */
5932 char *s = savesvpv(lv);
5933 char * const origs = s;
5935 while(*s && *s != '\n') s++;
5938 const char * const t = ++s;
5956 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5958 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5959 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5961 PERL_UNUSED_CONTEXT;
5962 PERL_UNUSED_ARG(sv);
5964 #endif /* DEBUGGING */
5968 Perl_re_intuit_string(pTHX_ regexp *prog)
5969 { /* Assume that RE_INTUIT is set */
5971 GET_RE_DEBUG_FLAGS_DECL;
5972 PERL_UNUSED_CONTEXT;
5976 const char * const s = SvPV_nolen_const(prog->check_substr
5977 ? prog->check_substr : prog->check_utf8);
5979 if (!PL_colorset) reginitcolors();
5980 PerlIO_printf(Perl_debug_log,
5981 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5983 prog->check_substr ? "" : "utf8 ",
5984 PL_colors[5],PL_colors[0],
5987 (strlen(s) > 60 ? "..." : ""));
5990 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5994 Perl_pregfree(pTHX_ struct regexp *r)
5998 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5999 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6003 if (!r || (--r->refcnt > 0))
6005 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6006 const char * const s = (r->reganch & ROPT_UTF8)
6007 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6008 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6009 const int len = SvCUR(dsv);
6012 PerlIO_printf(Perl_debug_log,
6013 "%sFreeing REx:%s %s%*.*s%s%s\n",
6014 PL_colors[4],PL_colors[5],PL_colors[0],
6017 len > 60 ? "..." : "");
6020 /* gcov results gave these as non-null 100% of the time, so there's no
6021 optimisation in checking them before calling Safefree */
6022 Safefree(r->precomp);
6023 Safefree(r->offsets); /* 20010421 MJD */
6024 RX_MATCH_COPY_FREE(r);
6025 #ifdef PERL_OLD_COPY_ON_WRITE
6027 SvREFCNT_dec(r->saved_copy);
6030 if (r->anchored_substr)
6031 SvREFCNT_dec(r->anchored_substr);
6032 if (r->anchored_utf8)
6033 SvREFCNT_dec(r->anchored_utf8);
6034 if (r->float_substr)
6035 SvREFCNT_dec(r->float_substr);
6037 SvREFCNT_dec(r->float_utf8);
6038 Safefree(r->substrs);
6041 int n = r->data->count;
6042 PAD* new_comppad = NULL;
6047 /* If you add a ->what type here, update the comment in regcomp.h */
6048 switch (r->data->what[n]) {
6050 SvREFCNT_dec((SV*)r->data->data[n]);
6053 Safefree(r->data->data[n]);
6056 new_comppad = (AV*)r->data->data[n];
6059 if (new_comppad == NULL)
6060 Perl_croak(aTHX_ "panic: pregfree comppad");
6061 PAD_SAVE_LOCAL(old_comppad,
6062 /* Watch out for global destruction's random ordering. */
6063 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6066 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6069 op_free((OP_4tree*)r->data->data[n]);
6071 PAD_RESTORE_LOCAL(old_comppad);
6072 SvREFCNT_dec((SV*)new_comppad);
6079 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6082 refcount = --trie->refcount;
6085 Safefree(trie->charmap);
6086 if (trie->widecharmap)
6087 SvREFCNT_dec((SV*)trie->widecharmap);
6088 Safefree(trie->states);
6089 Safefree(trie->trans);
6092 SvREFCNT_dec((SV*)trie->words);
6093 if (trie->revcharmap)
6094 SvREFCNT_dec((SV*)trie->revcharmap);
6096 Safefree(r->data->data[n]); /* do this last!!!! */
6101 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6104 Safefree(r->data->what);
6107 Safefree(r->startp);
6113 - regnext - dig the "next" pointer out of a node
6116 Perl_regnext(pTHX_ register regnode *p)
6119 register I32 offset;
6121 if (p == &PL_regdummy)
6124 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6132 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6135 STRLEN l1 = strlen(pat1);
6136 STRLEN l2 = strlen(pat2);
6139 const char *message;
6145 Copy(pat1, buf, l1 , char);
6146 Copy(pat2, buf + l1, l2 , char);
6147 buf[l1 + l2] = '\n';
6148 buf[l1 + l2 + 1] = '\0';
6150 /* ANSI variant takes additional second argument */
6151 va_start(args, pat2);
6155 msv = vmess(buf, &args);
6157 message = SvPV_const(msv,l1);
6160 Copy(message, buf, l1 , char);
6161 buf[l1-1] = '\0'; /* Overwrite \n */
6162 Perl_croak(aTHX_ "%s", buf);
6165 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6168 Perl_save_re_context(pTHX)
6172 struct re_save_state *state;
6174 SAVEVPTR(PL_curcop);
6175 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6177 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6178 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6179 SSPUSHINT(SAVEt_RE_STATE);
6181 Copy(&PL_reg_state, state, 1, struct re_save_state);
6183 PL_reg_start_tmp = 0;
6184 PL_reg_start_tmpl = 0;
6185 PL_reg_oldsaved = NULL;
6186 PL_reg_oldsavedlen = 0;
6188 PL_reg_leftiter = 0;
6189 PL_reg_poscache = NULL;
6190 PL_reg_poscache_size = 0;
6191 #ifdef PERL_OLD_COPY_ON_WRITE
6195 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6197 const REGEXP * const rx = PM_GETRE(PL_curpm);
6200 for (i = 1; i <= rx->nparens; i++) {
6201 char digits[TYPE_CHARS(long)];
6202 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6203 GV *const *const gvp
6204 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6207 GV * const gv = *gvp;
6208 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6217 clear_re(pTHX_ void *r)
6220 ReREFCNT_dec((regexp *)r);
6226 S_put_byte(pTHX_ SV *sv, int c)
6228 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6229 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6230 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6231 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6233 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6237 STATIC const regnode *
6238 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6239 const regnode *last, SV* sv, I32 l)
6242 register U8 op = EXACT; /* Arbitrary non-END op. */
6243 register const regnode *next;
6245 while (op != END && (!last || node < last)) {
6246 /* While that wasn't END last time... */
6252 next = regnext((regnode *)node);
6254 if (OP(node) == OPTIMIZED)
6256 regprop(r, sv, node);
6257 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6258 (int)(2*l + 1), "", SvPVX_const(sv));
6259 if (next == NULL) /* Next ptr. */
6260 PerlIO_printf(Perl_debug_log, "(0)");
6262 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6263 (void)PerlIO_putc(Perl_debug_log, '\n');
6265 if (PL_regkind[(U8)op] == BRANCHJ) {
6266 register const regnode *nnode = (OP(next) == LONGJMP
6267 ? regnext((regnode *)next)
6269 if (last && nnode > last)
6271 node = dumpuntil(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6273 else if (PL_regkind[(U8)op] == BRANCH) {
6274 node = dumpuntil(r, start, NEXTOPER(node), next, sv, l + 1);
6276 else if ( PL_regkind[(U8)op] == TRIE ) {
6277 const I32 n = ARG(node);
6278 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6279 const I32 arry_len = av_len(trie->words)+1;
6281 PerlIO_printf(Perl_debug_log,
6282 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6286 (int)trie->charcount,
6287 trie->uniquecharcount,
6288 (IV)trie->laststate-1,
6289 node->flags ? " EVAL mode" : "");
6291 for (word_idx=0; word_idx < arry_len; word_idx++) {
6292 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6294 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6297 SvPV_nolen_const(*elem_ptr),
6302 PerlIO_printf(Perl_debug_log, "(0)\n");
6304 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6310 node = NEXTOPER(node);
6311 node += regarglen[(U8)op];
6314 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6315 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6316 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6318 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6319 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6322 else if ( op == PLUS || op == STAR) {
6323 node = dumpuntil(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6325 else if (op == ANYOF) {
6326 /* arglen 1 + class block */
6327 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6328 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6329 node = NEXTOPER(node);
6331 else if (PL_regkind[(U8)op] == EXACT) {
6332 /* Literal string, where present. */
6333 node += NODE_SZ_STR(node) - 1;
6334 node = NEXTOPER(node);
6337 node = NEXTOPER(node);
6338 node += regarglen[(U8)op];
6340 if (op == CURLYX || op == OPEN)
6342 else if (op == WHILEM)
6348 #endif /* DEBUGGING */
6352 * c-indentation-style: bsd
6354 * indent-tabs-mode: t
6357 * ex: set ts=8 sts=4 sw=4 noet: