5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
287 #define Simple_vFAIL(m) STMT_START { \
288 const IV offset = RExC_parse - RExC_precomp; \
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
296 #define vFAIL(m) STMT_START { \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
303 * Like Simple_vFAIL(), but accepts two arguments.
305 #define Simple_vFAIL2(m,a1) STMT_START { \
306 const IV offset = RExC_parse - RExC_precomp; \
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
314 #define vFAIL2(m,a1) STMT_START { \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
322 * Like Simple_vFAIL(), but accepts three arguments.
324 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
325 const IV offset = RExC_parse - RExC_precomp; \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
333 #define vFAIL3(m,a1,a2) STMT_START { \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
340 * Like Simple_vFAIL(), but accepts four arguments.
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
343 const IV offset = RExC_parse - RExC_precomp; \
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 #define vWARN(loc,m) STMT_START { \
349 const IV offset = loc - RExC_precomp; \
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
354 #define vWARNdep(loc,m) STMT_START { \
355 const IV offset = loc - RExC_precomp; \
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
362 #define vWARN2(loc, m, a1) STMT_START { \
363 const IV offset = loc - RExC_precomp; \
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
368 #define vWARN3(loc, m, a1, a2) STMT_START { \
369 const IV offset = loc - RExC_precomp; \
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
375 const IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
381 const IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
392 /* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
402 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
409 RExC_offsets[2*(node)-1] = (byte); \
414 #define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
418 #define Set_Node_Length_To_R(node,len) STMT_START { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
421 __LINE__, (int)(node), (int)(len))); \
423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
425 RExC_offsets[2*(node)] = (len); \
430 #define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
440 static void clear_re(pTHX_ void *r);
442 /* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
447 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453 SvSetMagicSV(*data->longest, data->last_found);
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
460 data->flags &= ~SF_FIX_BEFORE_EOL;
463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
464 data->offset_float_max = (l
465 ? data->last_start_max
466 : data->pos_min + data->pos_delta);
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
469 if (data->flags & SF_BEFORE_EOL)
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
473 data->flags &= ~SF_FL_BEFORE_EOL;
476 SvCUR_set(data->last_found, 0);
478 SV * const sv = data->last_found;
479 if (SvUTF8(sv) && SvMAGICAL(sv)) {
480 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
486 data->flags &= ~SF_BEFORE_EOL;
489 /* Can match anything (initialization) */
491 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
493 ANYOF_CLASS_ZERO(cl);
494 ANYOF_BITMAP_SETALL(cl);
495 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
497 cl->flags |= ANYOF_LOCALE;
500 /* Can match anything (initialization) */
502 S_cl_is_anything(const struct regnode_charclass_class *cl)
506 for (value = 0; value <= ANYOF_MAX; value += 2)
507 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
509 if (!(cl->flags & ANYOF_UNICODE_ALL))
511 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 /* Can match anything (initialization) */
518 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
520 Zero(cl, 1, struct regnode_charclass_class);
522 cl_anything(pRExC_state, cl);
526 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
528 Zero(cl, 1, struct regnode_charclass_class);
530 cl_anything(pRExC_state, cl);
532 cl->flags |= ANYOF_LOCALE;
535 /* 'And' a given class with another one. Can create false positives */
536 /* We assume that cl is not inverted */
538 S_cl_and(struct regnode_charclass_class *cl,
539 const struct regnode_charclass_class *and_with)
541 if (!(and_with->flags & ANYOF_CLASS)
542 && !(cl->flags & ANYOF_CLASS)
543 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
544 && !(and_with->flags & ANYOF_FOLD)
545 && !(cl->flags & ANYOF_FOLD)) {
548 if (and_with->flags & ANYOF_INVERT)
549 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
550 cl->bitmap[i] &= ~and_with->bitmap[i];
552 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
553 cl->bitmap[i] &= and_with->bitmap[i];
554 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
555 if (!(and_with->flags & ANYOF_EOS))
556 cl->flags &= ~ANYOF_EOS;
558 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
559 !(and_with->flags & ANYOF_INVERT)) {
560 cl->flags &= ~ANYOF_UNICODE_ALL;
561 cl->flags |= ANYOF_UNICODE;
562 ARG_SET(cl, ARG(and_with));
564 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
565 !(and_with->flags & ANYOF_INVERT))
566 cl->flags &= ~ANYOF_UNICODE_ALL;
567 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
568 !(and_with->flags & ANYOF_INVERT))
569 cl->flags &= ~ANYOF_UNICODE;
572 /* 'OR' a given class with another one. Can create false positives */
573 /* We assume that cl is not inverted */
575 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
577 if (or_with->flags & ANYOF_INVERT) {
579 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
580 * <= (B1 | !B2) | (CL1 | !CL2)
581 * which is wasteful if CL2 is small, but we ignore CL2:
582 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
583 * XXXX Can we handle case-fold? Unclear:
584 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
585 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
587 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
588 && !(or_with->flags & ANYOF_FOLD)
589 && !(cl->flags & ANYOF_FOLD) ) {
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] |= ~or_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise */
596 cl_anything(pRExC_state, cl);
599 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
600 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
601 && (!(or_with->flags & ANYOF_FOLD)
602 || (cl->flags & ANYOF_FOLD)) ) {
605 /* OR char bitmap and class bitmap separately */
606 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
607 cl->bitmap[i] |= or_with->bitmap[i];
608 if (or_with->flags & ANYOF_CLASS) {
609 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
610 cl->classflags[i] |= or_with->classflags[i];
611 cl->flags |= ANYOF_CLASS;
614 else { /* XXXX: logic is complicated, leave it along for a moment. */
615 cl_anything(pRExC_state, cl);
618 if (or_with->flags & ANYOF_EOS)
619 cl->flags |= ANYOF_EOS;
621 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
622 ARG(cl) != ARG(or_with)) {
623 cl->flags |= ANYOF_UNICODE_ALL;
624 cl->flags &= ~ANYOF_UNICODE;
626 if (or_with->flags & ANYOF_UNICODE_ALL) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
634 make_trie(startbranch,first,last,tail,flags)
635 startbranch: the first branch in the whole branch sequence
636 first : start branch of sequence of branch-exact nodes.
637 May be the same as startbranch
638 last : Thing following the last branch.
639 May be the same as tail.
640 tail : item following the branch sequence
641 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
643 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
645 A trie is an N'ary tree where the branches are determined by digital
646 decomposition of the key. IE, at the root node you look up the 1st character and
647 follow that branch repeat until you find the end of the branches. Nodes can be
648 marked as "accepting" meaning they represent a complete word. Eg:
652 would convert into the following structure. Numbers represent states, letters
653 following numbers represent valid transitions on the letter from that state, if
654 the number is in square brackets it represents an accepting state, otherwise it
655 will be in parenthesis.
657 +-h->+-e->[3]-+-r->(8)-+-s->[9]
661 (1) +-i->(6)-+-s->[7]
663 +-s->(3)-+-h->(4)-+-e->[5]
665 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
667 This shows that when matching against the string 'hers' we will begin at state 1
668 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
669 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
670 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
671 single traverse. We store a mapping from accepting to state to which word was
672 matched, and then when we have multiple possibilities we try to complete the
673 rest of the regex in the order in which they occured in the alternation.
675 The only prior NFA like behaviour that would be changed by the TRIE support is
676 the silent ignoring of duplicate alternations which are of the form:
678 / (DUPE|DUPE) X? (?{ ... }) Y /x
680 Thus EVAL blocks follwing a trie may be called a different number of times with
681 and without the optimisation. With the optimisations dupes will be silently
682 ignored. This inconsistant behaviour of EVAL type nodes is well established as
683 the following demonstrates:
685 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
687 which prints out 'word' three times, but
689 'words'=~/(word|word|word)(?{ print $1 })S/
691 which doesnt print it out at all. This is due to other optimisations kicking in.
693 Example of what happens on a structural level:
695 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
697 1: CURLYM[1] {1,32767}(18)
708 This would be optimizable with startbranch=5, first=5, last=16, tail=16
709 and should turn into:
711 1: CURLYM[1] {1,32767}(18)
713 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
721 Cases where tail != last would be like /(?foo|bar)baz/:
731 which would be optimizable with startbranch=1, first=1, last=7, tail=8
732 and would end up looking like:
735 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
744 #define TRIE_DEBUG_CHAR \
745 DEBUG_TRIE_COMPILE_r({ \
748 tmp = newSVpvs( "" ); \
749 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
751 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
753 av_push( trie->revcharmap, tmp ); \
756 #define TRIE_READ_CHAR STMT_START { \
759 if ( foldlen > 0 ) { \
760 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
765 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
766 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
767 foldlen -= UNISKIP( uvc ); \
768 scan = foldbuf + UNISKIP( uvc ); \
771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
780 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
785 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
786 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
787 TRIE_LIST_LEN( state ) *= 2; \
788 Renew( trie->states[ state ].trans.list, \
789 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
792 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
793 TRIE_LIST_CUR( state )++; \
796 #define TRIE_LIST_NEW(state) STMT_START { \
797 Newxz( trie->states[ state ].trans.list, \
798 4, reg_trie_trans_le ); \
799 TRIE_LIST_CUR( state ) = 1; \
800 TRIE_LIST_LEN( state ) = 4; \
804 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
807 /* first pass, loop through and scan words */
810 const U32 uniflags = UTF8_ALLOW_DEFAULT;
815 /* we just use folder as a flag in utf8 */
816 const U8 * const folder = ( flags == EXACTF
824 const U32 data_slot = add_data( pRExC_state, 1, "t" );
827 GET_RE_DEBUG_FLAGS_DECL;
829 Newxz( trie, 1, reg_trie_data );
831 RExC_rx->data->data[ data_slot ] = (void*)trie;
832 Newxz( trie->charmap, 256, U16 );
834 trie->words = newAV();
835 trie->revcharmap = newAV();
839 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
840 if (!SvIOK(re_trie_maxbuff)) {
841 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
844 /* -- First loop and Setup --
846 We first traverse the branches and scan each word to determine if it
847 contains widechars, and how many unique chars there are, this is
848 important as we have to build a table with at least as many columns as we
851 We use an array of integers to represent the character codes 0..255
852 (trie->charmap) and we use a an HV* to store unicode characters. We use the
853 native representation of the character value as the key and IV's for the
856 *TODO* If we keep track of how many times each character is used we can
857 remap the columns so that the table compression later on is more
858 efficient in terms of memory by ensuring most common value is in the
859 middle and the least common are on the outside. IMO this would be better
860 than a most to least common mapping as theres a decent chance the most
861 common letter will share a node with the least common, meaning the node
862 will not be compressable. With a middle is most common approach the worst
863 case is when we have the least common nodes twice.
868 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
869 regnode * const noper = NEXTOPER( cur );
870 const U8 *uc = (U8*)STRING( noper );
871 const U8 * const e = uc + STR_LEN( noper );
873 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
874 const U8 *scan = (U8*)NULL;
876 for ( ; uc < e ; uc += len ) {
880 if ( !trie->charmap[ uvc ] ) {
881 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
883 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
888 if ( !trie->widecharmap )
889 trie->widecharmap = newHV();
891 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
894 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
896 if ( !SvTRUE( *svpp ) ) {
897 sv_setiv( *svpp, ++trie->uniquecharcount );
903 } /* end first pass */
904 DEBUG_TRIE_COMPILE_r(
905 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
906 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
907 (int)trie->charcount, trie->uniquecharcount )
912 We now know what we are dealing with in terms of unique chars and
913 string sizes so we can calculate how much memory a naive
914 representation using a flat table will take. If it's over a reasonable
915 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
916 conservative but potentially much slower representation using an array
919 At the end we convert both representations into the same compressed
920 form that will be used in regexec.c for matching with. The latter
921 is a form that cannot be used to construct with but has memory
922 properties similar to the list form and access properties similar
923 to the table form making it both suitable for fast searches and
924 small enough that its feasable to store for the duration of a program.
926 See the comment in the code where the compressed table is produced
927 inplace from the flat tabe representation for an explanation of how
928 the compression works.
933 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
935 Second Pass -- Array Of Lists Representation
937 Each state will be represented by a list of charid:state records
938 (reg_trie_trans_le) the first such element holds the CUR and LEN
939 points of the allocated array. (See defines above).
941 We build the initial structure using the lists, and then convert
942 it into the compressed table form which allows faster lookups
943 (but cant be modified once converted).
949 STRLEN transcount = 1;
951 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
955 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
957 regnode * const noper = NEXTOPER( cur );
958 U8 *uc = (U8*)STRING( noper );
959 const U8 * const e = uc + STR_LEN( noper );
960 U32 state = 1; /* required init */
961 U16 charid = 0; /* sanity init */
962 U8 *scan = (U8*)NULL; /* sanity init */
963 STRLEN foldlen = 0; /* required init */
964 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
966 for ( ; uc < e ; uc += len ) {
971 charid = trie->charmap[ uvc ];
973 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
977 charid=(U16)SvIV( *svpp );
986 if ( !trie->states[ state ].trans.list ) {
987 TRIE_LIST_NEW( state );
989 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
990 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
991 newstate = TRIE_LIST_ITEM( state, check ).newstate;
996 newstate = next_alloc++;
997 TRIE_LIST_PUSH( state, charid, newstate );
1002 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1004 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1007 if ( !trie->states[ state ].wordnum ) {
1008 /* we havent inserted this word into the structure yet. */
1009 trie->states[ state ].wordnum = ++curword;
1012 /* store the word for dumping */
1013 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1014 if ( UTF ) SvUTF8_on( tmp );
1015 av_push( trie->words, tmp );
1019 /*EMPTY*/; /* It's a dupe. So ignore it. */
1022 } /* end second pass */
1024 trie->laststate = next_alloc;
1025 Renew( trie->states, next_alloc, reg_trie_state );
1027 DEBUG_TRIE_COMPILE_MORE_r({
1030 /* print out the table precompression. */
1032 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1033 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1035 for( state=1 ; state < next_alloc ; state ++ ) {
1038 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1039 if ( ! trie->states[ state ].wordnum ) {
1040 PerlIO_printf( Perl_debug_log, "%5s| ","");
1042 PerlIO_printf( Perl_debug_log, "W%04x| ",
1043 trie->states[ state ].wordnum
1046 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1047 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1048 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1049 SvPV_nolen_const( *tmp ),
1050 TRIE_LIST_ITEM(state,charid).forid,
1051 (UV)TRIE_LIST_ITEM(state,charid).newstate
1056 PerlIO_printf( Perl_debug_log, "\n\n" );
1059 Newxz( trie->trans, transcount ,reg_trie_trans );
1066 for( state=1 ; state < next_alloc ; state ++ ) {
1070 DEBUG_TRIE_COMPILE_MORE_r(
1071 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1075 if (trie->states[state].trans.list) {
1076 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1080 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1081 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1082 if ( forid < minid ) {
1084 } else if ( forid > maxid ) {
1088 if ( transcount < tp + maxid - minid + 1) {
1090 Renew( trie->trans, transcount, reg_trie_trans );
1091 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1093 base = trie->uniquecharcount + tp - minid;
1094 if ( maxid == minid ) {
1096 for ( ; zp < tp ; zp++ ) {
1097 if ( ! trie->trans[ zp ].next ) {
1098 base = trie->uniquecharcount + zp - minid;
1099 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1100 trie->trans[ zp ].check = state;
1106 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1107 trie->trans[ tp ].check = state;
1112 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1113 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1114 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1115 trie->trans[ tid ].check = state;
1117 tp += ( maxid - minid + 1 );
1119 Safefree(trie->states[ state ].trans.list);
1122 DEBUG_TRIE_COMPILE_MORE_r(
1123 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1126 trie->states[ state ].trans.base=base;
1128 trie->lasttrans = tp + 1;
1132 Second Pass -- Flat Table Representation.
1134 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1135 We know that we will need Charcount+1 trans at most to store the data
1136 (one row per char at worst case) So we preallocate both structures
1137 assuming worst case.
1139 We then construct the trie using only the .next slots of the entry
1142 We use the .check field of the first entry of the node temporarily to
1143 make compression both faster and easier by keeping track of how many non
1144 zero fields are in the node.
1146 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1149 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1150 number representing the first entry of the node, and state as a
1151 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1152 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1153 are 2 entrys per node. eg:
1161 The table is internally in the right hand, idx form. However as we also
1162 have to deal with the states array which is indexed by nodenum we have to
1163 use TRIE_NODENUM() to convert.
1167 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1169 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1170 next_alloc = trie->uniquecharcount + 1;
1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1174 regnode * const noper = NEXTOPER( cur );
1175 const U8 *uc = (U8*)STRING( noper );
1176 const U8 * const e = uc + STR_LEN( noper );
1178 U32 state = 1; /* required init */
1180 U16 charid = 0; /* sanity init */
1181 U32 accept_state = 0; /* sanity init */
1182 U8 *scan = (U8*)NULL; /* sanity init */
1184 STRLEN foldlen = 0; /* required init */
1185 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1188 for ( ; uc < e ; uc += len ) {
1193 charid = trie->charmap[ uvc ];
1195 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1196 charid = svpp ? (U16)SvIV(*svpp) : 0;
1200 if ( !trie->trans[ state + charid ].next ) {
1201 trie->trans[ state + charid ].next = next_alloc;
1202 trie->trans[ state ].check++;
1203 next_alloc += trie->uniquecharcount;
1205 state = trie->trans[ state + charid ].next;
1207 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1209 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1212 accept_state = TRIE_NODENUM( state );
1213 if ( !trie->states[ accept_state ].wordnum ) {
1214 /* we havent inserted this word into the structure yet. */
1215 trie->states[ accept_state ].wordnum = ++curword;
1218 /* store the word for dumping */
1219 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1220 if ( UTF ) SvUTF8_on( tmp );
1221 av_push( trie->words, tmp );
1225 /*EMPTY*/; /* Its a dupe. So ignore it. */
1228 } /* end second pass */
1230 DEBUG_TRIE_COMPILE_MORE_r({
1232 print out the table precompression so that we can do a visual check
1233 that they are identical.
1237 PerlIO_printf( Perl_debug_log, "\nChar : " );
1239 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1240 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1242 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1246 PerlIO_printf( Perl_debug_log, "\nState+-" );
1248 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1249 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1252 PerlIO_printf( Perl_debug_log, "\n" );
1254 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1256 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1258 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1259 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1260 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1262 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1263 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1265 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1266 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1269 PerlIO_printf( Perl_debug_log, "\n\n" );
1273 * Inplace compress the table.*
1275 For sparse data sets the table constructed by the trie algorithm will
1276 be mostly 0/FAIL transitions or to put it another way mostly empty.
1277 (Note that leaf nodes will not contain any transitions.)
1279 This algorithm compresses the tables by eliminating most such
1280 transitions, at the cost of a modest bit of extra work during lookup:
1282 - Each states[] entry contains a .base field which indicates the
1283 index in the state[] array wheres its transition data is stored.
1285 - If .base is 0 there are no valid transitions from that node.
1287 - If .base is nonzero then charid is added to it to find an entry in
1290 -If trans[states[state].base+charid].check!=state then the
1291 transition is taken to be a 0/Fail transition. Thus if there are fail
1292 transitions at the front of the node then the .base offset will point
1293 somewhere inside the previous nodes data (or maybe even into a node
1294 even earlier), but the .check field determines if the transition is
1297 The following process inplace converts the table to the compressed
1298 table: We first do not compress the root node 1,and mark its all its
1299 .check pointers as 1 and set its .base pointer as 1 as well. This
1300 allows to do a DFA construction from the compressed table later, and
1301 ensures that any .base pointers we calculate later are greater than
1304 - We set 'pos' to indicate the first entry of the second node.
1306 - We then iterate over the columns of the node, finding the first and
1307 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1308 and set the .check pointers accordingly, and advance pos
1309 appropriately and repreat for the next node. Note that when we copy
1310 the next pointers we have to convert them from the original
1311 NODEIDX form to NODENUM form as the former is not valid post
1314 - If a node has no transitions used we mark its base as 0 and do not
1315 advance the pos pointer.
1317 - If a node only has one transition we use a second pointer into the
1318 structure to fill in allocated fail transitions from other states.
1319 This pointer is independent of the main pointer and scans forward
1320 looking for null transitions that are allocated to a state. When it
1321 finds one it writes the single transition into the "hole". If the
1322 pointer doesnt find one the single transition is appeneded as normal.
1324 - Once compressed we can Renew/realloc the structures to release the
1327 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1328 specifically Fig 3.47 and the associated pseudocode.
1332 const U32 laststate = TRIE_NODENUM( next_alloc );
1335 trie->laststate = laststate;
1337 for ( state = 1 ; state < laststate ; state++ ) {
1339 const U32 stateidx = TRIE_NODEIDX( state );
1340 const U32 o_used = trie->trans[ stateidx ].check;
1341 U32 used = trie->trans[ stateidx ].check;
1342 trie->trans[ stateidx ].check = 0;
1344 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1345 if ( flag || trie->trans[ stateidx + charid ].next ) {
1346 if ( trie->trans[ stateidx + charid ].next ) {
1348 for ( ; zp < pos ; zp++ ) {
1349 if ( ! trie->trans[ zp ].next ) {
1353 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1354 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1355 trie->trans[ zp ].check = state;
1356 if ( ++zp > pos ) pos = zp;
1363 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1365 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1366 trie->trans[ pos ].check = state;
1371 trie->lasttrans = pos + 1;
1372 Renew( trie->states, laststate + 1, reg_trie_state);
1373 DEBUG_TRIE_COMPILE_MORE_r(
1374 PerlIO_printf( Perl_debug_log,
1375 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1376 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1379 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1382 } /* end table compress */
1384 /* resize the trans array to remove unused space */
1385 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1387 DEBUG_TRIE_COMPILE_r({
1390 Now we print it out again, in a slightly different form as there is additional
1391 info we want to be able to see when its compressed. They are close enough for
1392 visual comparison though.
1394 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1396 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1397 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1399 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1402 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1404 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1405 PerlIO_printf( Perl_debug_log, "-----");
1406 PerlIO_printf( Perl_debug_log, "\n");
1408 for( state = 1 ; state < trie->laststate ; state++ ) {
1409 const U32 base = trie->states[ state ].trans.base;
1411 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1413 if ( trie->states[ state ].wordnum ) {
1414 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1416 PerlIO_printf( Perl_debug_log, "%6s", "" );
1419 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1424 while( ( base + ofs < trie->uniquecharcount ) ||
1425 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1426 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1429 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1431 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1432 if ( ( base + ofs >= trie->uniquecharcount ) &&
1433 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1434 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1436 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1437 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1439 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1443 PerlIO_printf( Perl_debug_log, "]");
1446 PerlIO_printf( Perl_debug_log, "\n" );
1451 /* now finally we "stitch in" the new TRIE node
1452 This means we convert either the first branch or the first Exact,
1453 depending on whether the thing following (in 'last') is a branch
1454 or not and whther first is the startbranch (ie is it a sub part of
1455 the alternation or is it the whole thing.)
1456 Assuming its a sub part we conver the EXACT otherwise we convert
1457 the whole branch sequence, including the first.
1464 if ( first == startbranch && OP( last ) != BRANCH ) {
1467 convert = NEXTOPER( first );
1468 NEXT_OFF( first ) = (U16)(last - first);
1471 OP( convert ) = TRIE + (U8)( flags - EXACT );
1472 NEXT_OFF( convert ) = (U16)(tail - convert);
1473 ARG_SET( convert, data_slot );
1475 /* tells us if we need to handle accept buffers specially */
1476 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1479 /* needed for dumping*/
1481 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1482 /* We now need to mark all of the space originally used by the
1483 branches as optimized away. This keeps the dumpuntil from
1484 throwing a wobbly as it doesnt use regnext() to traverse the
1487 while( optimize < last ) {
1488 OP( optimize ) = OPTIMIZED;
1492 } /* end node insert */
1499 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1500 * These need to be revisited when a newer toolchain becomes available.
1502 #if defined(__sparc64__) && defined(__GNUC__)
1503 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1504 # undef SPARC64_GCC_WORKAROUND
1505 # define SPARC64_GCC_WORKAROUND 1
1509 /* REx optimizer. Converts nodes into quickier variants "in place".
1510 Finds fixed substrings. */
1512 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1513 to the position after last scanned or to NULL. */
1517 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1518 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1519 /* scanp: Start here (read-write). */
1520 /* deltap: Write maxlen-minlen here. */
1521 /* last: Stop before this one. */
1524 I32 min = 0, pars = 0, code;
1525 regnode *scan = *scanp, *next;
1527 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1528 int is_inf_internal = 0; /* The studied chunk is infinite */
1529 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1530 scan_data_t data_fake;
1531 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1532 SV *re_trie_maxbuff = NULL;
1534 GET_RE_DEBUG_FLAGS_DECL;
1536 while (scan && OP(scan) != END && scan < last) {
1537 /* Peephole optimizer: */
1539 SV * const mysv=sv_newmortal();
1540 regprop(RExC_rx, mysv, scan);
1541 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1542 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1545 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1546 /* Merge several consecutive EXACTish nodes into one. */
1547 regnode *n = regnext(scan);
1550 regnode *stop = scan;
1553 next = scan + NODE_SZ_STR(scan);
1554 /* Skip NOTHING, merge EXACT*. */
1556 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1557 (stringok && (OP(n) == OP(scan))))
1559 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1560 if (OP(n) == TAIL || n > next)
1562 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1563 NEXT_OFF(scan) += NEXT_OFF(n);
1564 next = n + NODE_STEP_REGNODE;
1571 else if (stringok) {
1572 const int oldl = STR_LEN(scan);
1573 regnode * const nnext = regnext(n);
1575 if (oldl + STR_LEN(n) > U8_MAX)
1577 NEXT_OFF(scan) += NEXT_OFF(n);
1578 STR_LEN(scan) += STR_LEN(n);
1579 next = n + NODE_SZ_STR(n);
1580 /* Now we can overwrite *n : */
1581 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1589 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1591 Two problematic code points in Unicode casefolding of EXACT nodes:
1593 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1594 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1600 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1601 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1603 This means that in case-insensitive matching (or "loose matching",
1604 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1605 length of the above casefolded versions) can match a target string
1606 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1607 This would rather mess up the minimum length computation.
1609 What we'll do is to look for the tail four bytes, and then peek
1610 at the preceding two bytes to see whether we need to decrease
1611 the minimum length by four (six minus two).
1613 Thanks to the design of UTF-8, there cannot be false matches:
1614 A sequence of valid UTF-8 bytes cannot be a subsequence of
1615 another valid sequence of UTF-8 bytes.
1618 char * const s0 = STRING(scan), *s, *t;
1619 char * const s1 = s0 + STR_LEN(scan) - 1;
1620 char * const s2 = s1 - 4;
1621 const char t0[] = "\xcc\x88\xcc\x81";
1622 const char * const t1 = t0 + 3;
1625 s < s2 && (t = ninstr(s, s1, t0, t1));
1627 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1628 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1635 n = scan + NODE_SZ_STR(scan);
1637 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1648 /* Follow the next-chain of the current node and optimize
1649 away all the NOTHINGs from it. */
1650 if (OP(scan) != CURLYX) {
1651 const int max = (reg_off_by_arg[OP(scan)]
1653 /* I32 may be smaller than U16 on CRAYs! */
1654 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1655 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1659 /* Skip NOTHING and LONGJMP. */
1660 while ((n = regnext(n))
1661 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1662 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1663 && off + noff < max)
1665 if (reg_off_by_arg[OP(scan)])
1668 NEXT_OFF(scan) = off;
1671 /* The principal pseudo-switch. Cannot be a switch, since we
1672 look into several different things. */
1673 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1674 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1675 next = regnext(scan);
1677 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1679 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1680 I32 max1 = 0, min1 = I32_MAX, num = 0;
1681 struct regnode_charclass_class accum;
1682 regnode * const startbranch=scan;
1684 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1685 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1686 if (flags & SCF_DO_STCLASS)
1687 cl_init_zero(pRExC_state, &accum);
1689 while (OP(scan) == code) {
1690 I32 deltanext, minnext, f = 0, fake;
1691 struct regnode_charclass_class this_class;
1694 data_fake.flags = 0;
1696 data_fake.whilem_c = data->whilem_c;
1697 data_fake.last_closep = data->last_closep;
1700 data_fake.last_closep = &fake;
1701 next = regnext(scan);
1702 scan = NEXTOPER(scan);
1704 scan = NEXTOPER(scan);
1705 if (flags & SCF_DO_STCLASS) {
1706 cl_init(pRExC_state, &this_class);
1707 data_fake.start_class = &this_class;
1708 f = SCF_DO_STCLASS_AND;
1710 if (flags & SCF_WHILEM_VISITED_POS)
1711 f |= SCF_WHILEM_VISITED_POS;
1713 /* we suppose the run is continuous, last=next...*/
1714 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1715 next, &data_fake, f,depth+1);
1718 if (max1 < minnext + deltanext)
1719 max1 = minnext + deltanext;
1720 if (deltanext == I32_MAX)
1721 is_inf = is_inf_internal = 1;
1723 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1725 if (data && (data_fake.flags & SF_HAS_EVAL))
1726 data->flags |= SF_HAS_EVAL;
1728 data->whilem_c = data_fake.whilem_c;
1729 if (flags & SCF_DO_STCLASS)
1730 cl_or(pRExC_state, &accum, &this_class);
1731 if (code == SUSPEND)
1734 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1736 if (flags & SCF_DO_SUBSTR) {
1737 data->pos_min += min1;
1738 data->pos_delta += max1 - min1;
1739 if (max1 != min1 || is_inf)
1740 data->longest = &(data->longest_float);
1743 delta += max1 - min1;
1744 if (flags & SCF_DO_STCLASS_OR) {
1745 cl_or(pRExC_state, data->start_class, &accum);
1747 cl_and(data->start_class, &and_with);
1748 flags &= ~SCF_DO_STCLASS;
1751 else if (flags & SCF_DO_STCLASS_AND) {
1753 cl_and(data->start_class, &accum);
1754 flags &= ~SCF_DO_STCLASS;
1757 /* Switch to OR mode: cache the old value of
1758 * data->start_class */
1759 StructCopy(data->start_class, &and_with,
1760 struct regnode_charclass_class);
1761 flags &= ~SCF_DO_STCLASS_AND;
1762 StructCopy(&accum, data->start_class,
1763 struct regnode_charclass_class);
1764 flags |= SCF_DO_STCLASS_OR;
1765 data->start_class->flags |= ANYOF_EOS;
1771 Assuming this was/is a branch we are dealing with: 'scan' now
1772 points at the item that follows the branch sequence, whatever
1773 it is. We now start at the beginning of the sequence and look
1779 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1781 If we can find such a subseqence we need to turn the first
1782 element into a trie and then add the subsequent branch exact
1783 strings to the trie.
1787 1. patterns where the whole set of branch can be converted to a trie,
1789 2. patterns where only a subset of the alternations can be
1790 converted to a trie.
1792 In case 1 we can replace the whole set with a single regop
1793 for the trie. In case 2 we need to keep the start and end
1796 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1797 becomes BRANCH TRIE; BRANCH X;
1799 Hypthetically when we know the regex isnt anchored we can
1800 turn a case 1 into a DFA and let it rip... Every time it finds a match
1801 it would just call its tail, no WHILEM/CURLY needed.
1805 if (!re_trie_maxbuff) {
1806 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1807 if (!SvIOK(re_trie_maxbuff))
1808 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1810 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1812 regnode *first = (regnode *)NULL;
1813 regnode *last = (regnode *)NULL;
1814 regnode *tail = scan;
1819 SV * const mysv = sv_newmortal(); /* for dumping */
1821 /* var tail is used because there may be a TAIL
1822 regop in the way. Ie, the exacts will point to the
1823 thing following the TAIL, but the last branch will
1824 point at the TAIL. So we advance tail. If we
1825 have nested (?:) we may have to move through several
1829 while ( OP( tail ) == TAIL ) {
1830 /* this is the TAIL generated by (?:) */
1831 tail = regnext( tail );
1835 regprop(RExC_rx, mysv, tail );
1836 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1837 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1838 (RExC_seen_evals) ? "[EVAL]" : ""
1843 step through the branches, cur represents each
1844 branch, noper is the first thing to be matched
1845 as part of that branch and noper_next is the
1846 regnext() of that node. if noper is an EXACT
1847 and noper_next is the same as scan (our current
1848 position in the regex) then the EXACT branch is
1849 a possible optimization target. Once we have
1850 two or more consequetive such branches we can
1851 create a trie of the EXACT's contents and stich
1852 it in place. If the sequence represents all of
1853 the branches we eliminate the whole thing and
1854 replace it with a single TRIE. If it is a
1855 subsequence then we need to stitch it in. This
1856 means the first branch has to remain, and needs
1857 to be repointed at the item on the branch chain
1858 following the last branch optimized. This could
1859 be either a BRANCH, in which case the
1860 subsequence is internal, or it could be the
1861 item following the branch sequence in which
1862 case the subsequence is at the end.
1866 /* dont use tail as the end marker for this traverse */
1867 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1868 regnode * const noper = NEXTOPER( cur );
1869 regnode * const noper_next = regnext( noper );
1872 regprop(RExC_rx, mysv, cur);
1873 PerlIO_printf( Perl_debug_log, "%*s%s",
1874 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1876 regprop(RExC_rx, mysv, noper);
1877 PerlIO_printf( Perl_debug_log, " -> %s",
1878 SvPV_nolen_const(mysv));
1881 regprop(RExC_rx, mysv, noper_next );
1882 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1883 SvPV_nolen_const(mysv));
1885 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1888 if ( ( first ? OP( noper ) == optype
1889 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1890 && noper_next == tail && count<U16_MAX)
1895 optype = OP( noper );
1899 regprop(RExC_rx, mysv, first);
1900 PerlIO_printf( Perl_debug_log, "%*s%s",
1901 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1902 regprop(RExC_rx, mysv, NEXTOPER(first) );
1903 PerlIO_printf( Perl_debug_log, " -> %s\n",
1904 SvPV_nolen_const( mysv ) );
1909 regprop(RExC_rx, mysv, cur);
1910 PerlIO_printf( Perl_debug_log, "%*s%s",
1911 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1912 regprop(RExC_rx, mysv, noper );
1913 PerlIO_printf( Perl_debug_log, " -> %s\n",
1914 SvPV_nolen_const( mysv ) );
1920 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1921 (int)depth * 2 + 2, "E:", "**END**" );
1923 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1925 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1926 && noper_next == tail )
1930 optype = OP( noper );
1940 regprop(RExC_rx, mysv, cur);
1941 PerlIO_printf( Perl_debug_log,
1942 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1943 " ", SvPV_nolen_const( mysv ), first, last, cur);
1948 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1949 (int)depth * 2 + 2, "E:", "==END==" );
1951 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1956 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1957 scan = NEXTOPER(NEXTOPER(scan));
1958 } else /* single branch is optimized. */
1959 scan = NEXTOPER(scan);
1962 else if (OP(scan) == EXACT) {
1963 I32 l = STR_LEN(scan);
1966 const U8 * const s = (U8*)STRING(scan);
1967 l = utf8_length(s, s + l);
1968 uc = utf8_to_uvchr(s, NULL);
1970 uc = *((U8*)STRING(scan));
1973 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1974 /* The code below prefers earlier match for fixed
1975 offset, later match for variable offset. */
1976 if (data->last_end == -1) { /* Update the start info. */
1977 data->last_start_min = data->pos_min;
1978 data->last_start_max = is_inf
1979 ? I32_MAX : data->pos_min + data->pos_delta;
1981 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
1983 SvUTF8_on(data->last_found);
1985 SV * const sv = data->last_found;
1986 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1987 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1988 if (mg && mg->mg_len >= 0)
1989 mg->mg_len += utf8_length((U8*)STRING(scan),
1990 (U8*)STRING(scan)+STR_LEN(scan));
1992 data->last_end = data->pos_min + l;
1993 data->pos_min += l; /* As in the first entry. */
1994 data->flags &= ~SF_BEFORE_EOL;
1996 if (flags & SCF_DO_STCLASS_AND) {
1997 /* Check whether it is compatible with what we know already! */
2001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2003 && (!(data->start_class->flags & ANYOF_FOLD)
2004 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2007 ANYOF_CLASS_ZERO(data->start_class);
2008 ANYOF_BITMAP_ZERO(data->start_class);
2010 ANYOF_BITMAP_SET(data->start_class, uc);
2011 data->start_class->flags &= ~ANYOF_EOS;
2013 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2015 else if (flags & SCF_DO_STCLASS_OR) {
2016 /* false positive possible if the class is case-folded */
2018 ANYOF_BITMAP_SET(data->start_class, uc);
2020 data->start_class->flags |= ANYOF_UNICODE_ALL;
2021 data->start_class->flags &= ~ANYOF_EOS;
2022 cl_and(data->start_class, &and_with);
2024 flags &= ~SCF_DO_STCLASS;
2026 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2027 I32 l = STR_LEN(scan);
2028 UV uc = *((U8*)STRING(scan));
2030 /* Search for fixed substrings supports EXACT only. */
2031 if (flags & SCF_DO_SUBSTR) {
2033 scan_commit(pRExC_state, data);
2036 const U8 * const s = (U8 *)STRING(scan);
2037 l = utf8_length(s, s + l);
2038 uc = utf8_to_uvchr(s, NULL);
2041 if (flags & SCF_DO_SUBSTR)
2043 if (flags & SCF_DO_STCLASS_AND) {
2044 /* Check whether it is compatible with what we know already! */
2048 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2049 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2050 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2052 ANYOF_CLASS_ZERO(data->start_class);
2053 ANYOF_BITMAP_ZERO(data->start_class);
2055 ANYOF_BITMAP_SET(data->start_class, uc);
2056 data->start_class->flags &= ~ANYOF_EOS;
2057 data->start_class->flags |= ANYOF_FOLD;
2058 if (OP(scan) == EXACTFL)
2059 data->start_class->flags |= ANYOF_LOCALE;
2062 else if (flags & SCF_DO_STCLASS_OR) {
2063 if (data->start_class->flags & ANYOF_FOLD) {
2064 /* false positive possible if the class is case-folded.
2065 Assume that the locale settings are the same... */
2067 ANYOF_BITMAP_SET(data->start_class, uc);
2068 data->start_class->flags &= ~ANYOF_EOS;
2070 cl_and(data->start_class, &and_with);
2072 flags &= ~SCF_DO_STCLASS;
2074 else if (strchr((const char*)PL_varies,OP(scan))) {
2075 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2076 I32 f = flags, pos_before = 0;
2077 regnode * const oscan = scan;
2078 struct regnode_charclass_class this_class;
2079 struct regnode_charclass_class *oclass = NULL;
2080 I32 next_is_eval = 0;
2082 switch (PL_regkind[(U8)OP(scan)]) {
2083 case WHILEM: /* End of (?:...)* . */
2084 scan = NEXTOPER(scan);
2087 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2088 next = NEXTOPER(scan);
2089 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2091 maxcount = REG_INFTY;
2092 next = regnext(scan);
2093 scan = NEXTOPER(scan);
2097 if (flags & SCF_DO_SUBSTR)
2102 if (flags & SCF_DO_STCLASS) {
2104 maxcount = REG_INFTY;
2105 next = regnext(scan);
2106 scan = NEXTOPER(scan);
2109 is_inf = is_inf_internal = 1;
2110 scan = regnext(scan);
2111 if (flags & SCF_DO_SUBSTR) {
2112 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2113 data->longest = &(data->longest_float);
2115 goto optimize_curly_tail;
2117 mincount = ARG1(scan);
2118 maxcount = ARG2(scan);
2119 next = regnext(scan);
2120 if (OP(scan) == CURLYX) {
2121 I32 lp = (data ? *(data->last_closep) : 0);
2122 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2124 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2125 next_is_eval = (OP(scan) == EVAL);
2127 if (flags & SCF_DO_SUBSTR) {
2128 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2129 pos_before = data->pos_min;
2133 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2135 data->flags |= SF_IS_INF;
2137 if (flags & SCF_DO_STCLASS) {
2138 cl_init(pRExC_state, &this_class);
2139 oclass = data->start_class;
2140 data->start_class = &this_class;
2141 f |= SCF_DO_STCLASS_AND;
2142 f &= ~SCF_DO_STCLASS_OR;
2144 /* These are the cases when once a subexpression
2145 fails at a particular position, it cannot succeed
2146 even after backtracking at the enclosing scope.
2148 XXXX what if minimal match and we are at the
2149 initial run of {n,m}? */
2150 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2151 f &= ~SCF_WHILEM_VISITED_POS;
2153 /* This will finish on WHILEM, setting scan, or on NULL: */
2154 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2156 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2158 if (flags & SCF_DO_STCLASS)
2159 data->start_class = oclass;
2160 if (mincount == 0 || minnext == 0) {
2161 if (flags & SCF_DO_STCLASS_OR) {
2162 cl_or(pRExC_state, data->start_class, &this_class);
2164 else if (flags & SCF_DO_STCLASS_AND) {
2165 /* Switch to OR mode: cache the old value of
2166 * data->start_class */
2167 StructCopy(data->start_class, &and_with,
2168 struct regnode_charclass_class);
2169 flags &= ~SCF_DO_STCLASS_AND;
2170 StructCopy(&this_class, data->start_class,
2171 struct regnode_charclass_class);
2172 flags |= SCF_DO_STCLASS_OR;
2173 data->start_class->flags |= ANYOF_EOS;
2175 } else { /* Non-zero len */
2176 if (flags & SCF_DO_STCLASS_OR) {
2177 cl_or(pRExC_state, data->start_class, &this_class);
2178 cl_and(data->start_class, &and_with);
2180 else if (flags & SCF_DO_STCLASS_AND)
2181 cl_and(data->start_class, &this_class);
2182 flags &= ~SCF_DO_STCLASS;
2184 if (!scan) /* It was not CURLYX, but CURLY. */
2186 if ( /* ? quantifier ok, except for (?{ ... }) */
2187 (next_is_eval || !(mincount == 0 && maxcount == 1))
2188 && (minnext == 0) && (deltanext == 0)
2189 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2190 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2191 && ckWARN(WARN_REGEXP))
2194 "Quantifier unexpected on zero-length expression");
2197 min += minnext * mincount;
2198 is_inf_internal |= ((maxcount == REG_INFTY
2199 && (minnext + deltanext) > 0)
2200 || deltanext == I32_MAX);
2201 is_inf |= is_inf_internal;
2202 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2204 /* Try powerful optimization CURLYX => CURLYN. */
2205 if ( OP(oscan) == CURLYX && data
2206 && data->flags & SF_IN_PAR
2207 && !(data->flags & SF_HAS_EVAL)
2208 && !deltanext && minnext == 1 ) {
2209 /* Try to optimize to CURLYN. */
2210 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2211 regnode * const nxt1 = nxt;
2218 if (!strchr((const char*)PL_simple,OP(nxt))
2219 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2220 && STR_LEN(nxt) == 1))
2226 if (OP(nxt) != CLOSE)
2228 /* Now we know that nxt2 is the only contents: */
2229 oscan->flags = (U8)ARG(nxt);
2231 OP(nxt1) = NOTHING; /* was OPEN. */
2233 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2234 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2235 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2236 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2237 OP(nxt + 1) = OPTIMIZED; /* was count. */
2238 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2243 /* Try optimization CURLYX => CURLYM. */
2244 if ( OP(oscan) == CURLYX && data
2245 && !(data->flags & SF_HAS_PAR)
2246 && !(data->flags & SF_HAS_EVAL)
2247 && !deltanext /* atom is fixed width */
2248 && minnext != 0 /* CURLYM can't handle zero width */
2250 /* XXXX How to optimize if data == 0? */
2251 /* Optimize to a simpler form. */
2252 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2256 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2257 && (OP(nxt2) != WHILEM))
2259 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2260 /* Need to optimize away parenths. */
2261 if (data->flags & SF_IN_PAR) {
2262 /* Set the parenth number. */
2263 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2265 if (OP(nxt) != CLOSE)
2266 FAIL("Panic opt close");
2267 oscan->flags = (U8)ARG(nxt);
2268 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2269 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2271 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2272 OP(nxt + 1) = OPTIMIZED; /* was count. */
2273 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2274 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2277 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2278 regnode *nnxt = regnext(nxt1);
2281 if (reg_off_by_arg[OP(nxt1)])
2282 ARG_SET(nxt1, nxt2 - nxt1);
2283 else if (nxt2 - nxt1 < U16_MAX)
2284 NEXT_OFF(nxt1) = nxt2 - nxt1;
2286 OP(nxt) = NOTHING; /* Cannot beautify */
2291 /* Optimize again: */
2292 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2298 else if ((OP(oscan) == CURLYX)
2299 && (flags & SCF_WHILEM_VISITED_POS)
2300 /* See the comment on a similar expression above.
2301 However, this time it not a subexpression
2302 we care about, but the expression itself. */
2303 && (maxcount == REG_INFTY)
2304 && data && ++data->whilem_c < 16) {
2305 /* This stays as CURLYX, we can put the count/of pair. */
2306 /* Find WHILEM (as in regexec.c) */
2307 regnode *nxt = oscan + NEXT_OFF(oscan);
2309 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2311 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2312 | (RExC_whilem_seen << 4)); /* On WHILEM */
2314 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2316 if (flags & SCF_DO_SUBSTR) {
2317 SV *last_str = NULL;
2318 int counted = mincount != 0;
2320 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2321 #if defined(SPARC64_GCC_WORKAROUND)
2324 const char *s = NULL;
2327 if (pos_before >= data->last_start_min)
2330 b = data->last_start_min;
2333 s = SvPV_const(data->last_found, l);
2334 old = b - data->last_start_min;
2337 I32 b = pos_before >= data->last_start_min
2338 ? pos_before : data->last_start_min;
2340 const char * const s = SvPV_const(data->last_found, l);
2341 I32 old = b - data->last_start_min;
2345 old = utf8_hop((U8*)s, old) - (U8*)s;
2348 /* Get the added string: */
2349 last_str = newSVpvn(s + old, l);
2351 SvUTF8_on(last_str);
2352 if (deltanext == 0 && pos_before == b) {
2353 /* What was added is a constant string */
2355 SvGROW(last_str, (mincount * l) + 1);
2356 repeatcpy(SvPVX(last_str) + l,
2357 SvPVX_const(last_str), l, mincount - 1);
2358 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2359 /* Add additional parts. */
2360 SvCUR_set(data->last_found,
2361 SvCUR(data->last_found) - l);
2362 sv_catsv(data->last_found, last_str);
2364 SV * sv = data->last_found;
2366 SvUTF8(sv) && SvMAGICAL(sv) ?
2367 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2368 if (mg && mg->mg_len >= 0)
2369 mg->mg_len += CHR_SVLEN(last_str);
2371 data->last_end += l * (mincount - 1);
2374 /* start offset must point into the last copy */
2375 data->last_start_min += minnext * (mincount - 1);
2376 data->last_start_max += is_inf ? I32_MAX
2377 : (maxcount - 1) * (minnext + data->pos_delta);
2380 /* It is counted once already... */
2381 data->pos_min += minnext * (mincount - counted);
2382 data->pos_delta += - counted * deltanext +
2383 (minnext + deltanext) * maxcount - minnext * mincount;
2384 if (mincount != maxcount) {
2385 /* Cannot extend fixed substrings found inside
2387 scan_commit(pRExC_state,data);
2388 if (mincount && last_str) {
2389 SV * const sv = data->last_found;
2390 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2391 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2395 sv_setsv(sv, last_str);
2396 data->last_end = data->pos_min;
2397 data->last_start_min =
2398 data->pos_min - CHR_SVLEN(last_str);
2399 data->last_start_max = is_inf
2401 : data->pos_min + data->pos_delta
2402 - CHR_SVLEN(last_str);
2404 data->longest = &(data->longest_float);
2406 SvREFCNT_dec(last_str);
2408 if (data && (fl & SF_HAS_EVAL))
2409 data->flags |= SF_HAS_EVAL;
2410 optimize_curly_tail:
2411 if (OP(oscan) != CURLYX) {
2412 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2414 NEXT_OFF(oscan) += NEXT_OFF(next);
2417 default: /* REF and CLUMP only? */
2418 if (flags & SCF_DO_SUBSTR) {
2419 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2420 data->longest = &(data->longest_float);
2422 is_inf = is_inf_internal = 1;
2423 if (flags & SCF_DO_STCLASS_OR)
2424 cl_anything(pRExC_state, data->start_class);
2425 flags &= ~SCF_DO_STCLASS;
2429 else if (strchr((const char*)PL_simple,OP(scan))) {
2432 if (flags & SCF_DO_SUBSTR) {
2433 scan_commit(pRExC_state,data);
2437 if (flags & SCF_DO_STCLASS) {
2438 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2440 /* Some of the logic below assumes that switching
2441 locale on will only add false positives. */
2442 switch (PL_regkind[(U8)OP(scan)]) {
2446 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2447 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2448 cl_anything(pRExC_state, data->start_class);
2451 if (OP(scan) == SANY)
2453 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2454 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2455 || (data->start_class->flags & ANYOF_CLASS));
2456 cl_anything(pRExC_state, data->start_class);
2458 if (flags & SCF_DO_STCLASS_AND || !value)
2459 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2462 if (flags & SCF_DO_STCLASS_AND)
2463 cl_and(data->start_class,
2464 (struct regnode_charclass_class*)scan);
2466 cl_or(pRExC_state, data->start_class,
2467 (struct regnode_charclass_class*)scan);
2470 if (flags & SCF_DO_STCLASS_AND) {
2471 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2472 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2473 for (value = 0; value < 256; value++)
2474 if (!isALNUM(value))
2475 ANYOF_BITMAP_CLEAR(data->start_class, value);
2479 if (data->start_class->flags & ANYOF_LOCALE)
2480 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2482 for (value = 0; value < 256; value++)
2484 ANYOF_BITMAP_SET(data->start_class, value);
2489 if (flags & SCF_DO_STCLASS_AND) {
2490 if (data->start_class->flags & ANYOF_LOCALE)
2491 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2494 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2495 data->start_class->flags |= ANYOF_LOCALE;
2499 if (flags & SCF_DO_STCLASS_AND) {
2500 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2501 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2502 for (value = 0; value < 256; value++)
2504 ANYOF_BITMAP_CLEAR(data->start_class, value);
2508 if (data->start_class->flags & ANYOF_LOCALE)
2509 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2511 for (value = 0; value < 256; value++)
2512 if (!isALNUM(value))
2513 ANYOF_BITMAP_SET(data->start_class, value);
2518 if (flags & SCF_DO_STCLASS_AND) {
2519 if (data->start_class->flags & ANYOF_LOCALE)
2520 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2523 data->start_class->flags |= ANYOF_LOCALE;
2524 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2528 if (flags & SCF_DO_STCLASS_AND) {
2529 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2530 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2531 for (value = 0; value < 256; value++)
2532 if (!isSPACE(value))
2533 ANYOF_BITMAP_CLEAR(data->start_class, value);
2537 if (data->start_class->flags & ANYOF_LOCALE)
2538 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2540 for (value = 0; value < 256; value++)
2542 ANYOF_BITMAP_SET(data->start_class, value);
2547 if (flags & SCF_DO_STCLASS_AND) {
2548 if (data->start_class->flags & ANYOF_LOCALE)
2549 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2552 data->start_class->flags |= ANYOF_LOCALE;
2553 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2557 if (flags & SCF_DO_STCLASS_AND) {
2558 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2559 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2560 for (value = 0; value < 256; value++)
2562 ANYOF_BITMAP_CLEAR(data->start_class, value);
2566 if (data->start_class->flags & ANYOF_LOCALE)
2567 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2569 for (value = 0; value < 256; value++)
2570 if (!isSPACE(value))
2571 ANYOF_BITMAP_SET(data->start_class, value);
2576 if (flags & SCF_DO_STCLASS_AND) {
2577 if (data->start_class->flags & ANYOF_LOCALE) {
2578 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2579 for (value = 0; value < 256; value++)
2580 if (!isSPACE(value))
2581 ANYOF_BITMAP_CLEAR(data->start_class, value);
2585 data->start_class->flags |= ANYOF_LOCALE;
2586 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2590 if (flags & SCF_DO_STCLASS_AND) {
2591 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2592 for (value = 0; value < 256; value++)
2593 if (!isDIGIT(value))
2594 ANYOF_BITMAP_CLEAR(data->start_class, value);
2597 if (data->start_class->flags & ANYOF_LOCALE)
2598 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2600 for (value = 0; value < 256; value++)
2602 ANYOF_BITMAP_SET(data->start_class, value);
2607 if (flags & SCF_DO_STCLASS_AND) {
2608 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2609 for (value = 0; value < 256; value++)
2611 ANYOF_BITMAP_CLEAR(data->start_class, value);
2614 if (data->start_class->flags & ANYOF_LOCALE)
2615 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2617 for (value = 0; value < 256; value++)
2618 if (!isDIGIT(value))
2619 ANYOF_BITMAP_SET(data->start_class, value);
2624 if (flags & SCF_DO_STCLASS_OR)
2625 cl_and(data->start_class, &and_with);
2626 flags &= ~SCF_DO_STCLASS;
2629 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2630 data->flags |= (OP(scan) == MEOL
2634 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2635 /* Lookbehind, or need to calculate parens/evals/stclass: */
2636 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2637 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2638 /* Lookahead/lookbehind */
2639 I32 deltanext, minnext, fake = 0;
2641 struct regnode_charclass_class intrnl;
2644 data_fake.flags = 0;
2646 data_fake.whilem_c = data->whilem_c;
2647 data_fake.last_closep = data->last_closep;
2650 data_fake.last_closep = &fake;
2651 if ( flags & SCF_DO_STCLASS && !scan->flags
2652 && OP(scan) == IFMATCH ) { /* Lookahead */
2653 cl_init(pRExC_state, &intrnl);
2654 data_fake.start_class = &intrnl;
2655 f |= SCF_DO_STCLASS_AND;
2657 if (flags & SCF_WHILEM_VISITED_POS)
2658 f |= SCF_WHILEM_VISITED_POS;
2659 next = regnext(scan);
2660 nscan = NEXTOPER(NEXTOPER(scan));
2661 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2664 vFAIL("Variable length lookbehind not implemented");
2666 else if (minnext > U8_MAX) {
2667 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2669 scan->flags = (U8)minnext;
2671 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2673 if (data && (data_fake.flags & SF_HAS_EVAL))
2674 data->flags |= SF_HAS_EVAL;
2676 data->whilem_c = data_fake.whilem_c;
2677 if (f & SCF_DO_STCLASS_AND) {
2678 const int was = (data->start_class->flags & ANYOF_EOS);
2680 cl_and(data->start_class, &intrnl);
2682 data->start_class->flags |= ANYOF_EOS;
2685 else if (OP(scan) == OPEN) {
2688 else if (OP(scan) == CLOSE) {
2689 if ((I32)ARG(scan) == is_par) {
2690 next = regnext(scan);
2692 if ( next && (OP(next) != WHILEM) && next < last)
2693 is_par = 0; /* Disable optimization */
2696 *(data->last_closep) = ARG(scan);
2698 else if (OP(scan) == EVAL) {
2700 data->flags |= SF_HAS_EVAL;
2702 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2703 if (flags & SCF_DO_SUBSTR) {
2704 scan_commit(pRExC_state,data);
2705 data->longest = &(data->longest_float);
2707 is_inf = is_inf_internal = 1;
2708 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2709 cl_anything(pRExC_state, data->start_class);
2710 flags &= ~SCF_DO_STCLASS;
2712 /* Else: zero-length, ignore. */
2713 scan = regnext(scan);
2718 *deltap = is_inf_internal ? I32_MAX : delta;
2719 if (flags & SCF_DO_SUBSTR && is_inf)
2720 data->pos_delta = I32_MAX - data->pos_min;
2721 if (is_par > U8_MAX)
2723 if (is_par && pars==1 && data) {
2724 data->flags |= SF_IN_PAR;
2725 data->flags &= ~SF_HAS_PAR;
2727 else if (pars && data) {
2728 data->flags |= SF_HAS_PAR;
2729 data->flags &= ~SF_IN_PAR;
2731 if (flags & SCF_DO_STCLASS_OR)
2732 cl_and(data->start_class, &and_with);
2737 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2739 if (RExC_rx->data) {
2740 Renewc(RExC_rx->data,
2741 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2742 char, struct reg_data);
2743 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2744 RExC_rx->data->count += n;
2747 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2748 char, struct reg_data);
2749 Newx(RExC_rx->data->what, n, U8);
2750 RExC_rx->data->count = n;
2752 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2753 return RExC_rx->data->count - n;
2757 Perl_reginitcolors(pTHX)
2760 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2762 char *t = savepv(s);
2766 t = strchr(t, '\t');
2772 PL_colors[i] = t = (char *)"";
2777 PL_colors[i++] = (char *)"";
2784 - pregcomp - compile a regular expression into internal code
2786 * We can't allocate space until we know how big the compiled form will be,
2787 * but we can't compile it (and thus know how big it is) until we've got a
2788 * place to put the code. So we cheat: we compile it twice, once with code
2789 * generation turned off and size counting turned on, and once "for real".
2790 * This also means that we don't allocate space until we are sure that the
2791 * thing really will compile successfully, and we never have to move the
2792 * code and thus invalidate pointers into it. (Note that it has to be in
2793 * one piece because free() must be able to free it all.) [NB: not true in perl]
2795 * Beware that the optimization-preparation code in here knows about some
2796 * of the structure of the compiled regexp. [I'll say.]
2799 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2810 RExC_state_t RExC_state;
2811 RExC_state_t *pRExC_state = &RExC_state;
2813 GET_RE_DEBUG_FLAGS_DECL;
2816 FAIL("NULL regexp argument");
2818 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2821 DEBUG_r(if (!PL_colorset) reginitcolors());
2823 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2824 PL_colors[4],PL_colors[5],PL_colors[0],
2825 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2827 RExC_flags = pm->op_pmflags;
2831 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2832 RExC_seen_evals = 0;
2835 /* First pass: determine size, legality. */
2842 RExC_emit = &PL_regdummy;
2843 RExC_whilem_seen = 0;
2844 #if 0 /* REGC() is (currently) a NOP at the first pass.
2845 * Clever compilers notice this and complain. --jhi */
2846 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2848 if (reg(pRExC_state, 0, &flags) == NULL) {
2849 RExC_precomp = NULL;
2852 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2854 /* Small enough for pointer-storage convention?
2855 If extralen==0, this means that we will not need long jumps. */
2856 if (RExC_size >= 0x10000L && RExC_extralen)
2857 RExC_size += RExC_extralen;
2860 if (RExC_whilem_seen > 15)
2861 RExC_whilem_seen = 15;
2863 /* Allocate space and initialize. */
2864 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2867 FAIL("Regexp out of space");
2870 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2871 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2874 r->prelen = xend - exp;
2875 r->precomp = savepvn(RExC_precomp, r->prelen);
2877 #ifdef PERL_OLD_COPY_ON_WRITE
2878 r->saved_copy = NULL;
2880 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2881 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2882 r->lastparen = 0; /* mg.c reads this. */
2884 r->substrs = 0; /* Useful during FAIL. */
2885 r->startp = 0; /* Useful during FAIL. */
2886 r->endp = 0; /* Useful during FAIL. */
2888 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2890 r->offsets[0] = RExC_size;
2892 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2893 "%s %"UVuf" bytes for offset annotations.\n",
2894 r->offsets ? "Got" : "Couldn't get",
2895 (UV)((2*RExC_size+1) * sizeof(U32))));
2899 /* Second pass: emit code. */
2900 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2905 RExC_emit_start = r->program;
2906 RExC_emit = r->program;
2907 /* Store the count of eval-groups for security checks: */
2908 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2909 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2911 if (reg(pRExC_state, 0, &flags) == NULL)
2915 /* Dig out information for optimizations. */
2916 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2917 pm->op_pmflags = RExC_flags;
2919 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2920 r->regstclass = NULL;
2921 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2922 r->reganch |= ROPT_NAUGHTY;
2923 scan = r->program + 1; /* First BRANCH. */
2925 /* XXXX To minimize changes to RE engine we always allocate
2926 3-units-long substrs field. */
2927 Newxz(r->substrs, 1, struct reg_substr_data);
2929 StructCopy(&zero_scan_data, &data, scan_data_t);
2930 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2931 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2933 STRLEN longest_float_length, longest_fixed_length;
2934 struct regnode_charclass_class ch_class;
2939 /* Skip introductions and multiplicators >= 1. */
2940 while ((OP(first) == OPEN && (sawopen = 1)) ||
2941 /* An OR of *one* alternative - should not happen now. */
2942 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2943 (OP(first) == PLUS) ||
2944 (OP(first) == MINMOD) ||
2945 /* An {n,m} with n>0 */
2946 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2947 if (OP(first) == PLUS)
2950 first += regarglen[(U8)OP(first)];
2951 first = NEXTOPER(first);
2954 /* Starting-point info. */
2956 if (PL_regkind[(U8)OP(first)] == EXACT) {
2957 if (OP(first) == EXACT)
2958 /*EMPTY*/; /* Empty, get anchored substr later. */
2959 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2960 r->regstclass = first;
2962 else if (strchr((const char*)PL_simple,OP(first)))
2963 r->regstclass = first;
2964 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2965 PL_regkind[(U8)OP(first)] == NBOUND)
2966 r->regstclass = first;
2967 else if (PL_regkind[(U8)OP(first)] == BOL) {
2968 r->reganch |= (OP(first) == MBOL
2970 : (OP(first) == SBOL
2973 first = NEXTOPER(first);
2976 else if (OP(first) == GPOS) {
2977 r->reganch |= ROPT_ANCH_GPOS;
2978 first = NEXTOPER(first);
2981 else if (!sawopen && (OP(first) == STAR &&
2982 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2983 !(r->reganch & ROPT_ANCH) )
2985 /* turn .* into ^.* with an implied $*=1 */
2987 (OP(NEXTOPER(first)) == REG_ANY)
2990 r->reganch |= type | ROPT_IMPLICIT;
2991 first = NEXTOPER(first);
2994 if (sawplus && (!sawopen || !RExC_sawback)
2995 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
2996 /* x+ must match at the 1st pos of run of x's */
2997 r->reganch |= ROPT_SKIP;
2999 /* Scan is after the zeroth branch, first is atomic matcher. */
3000 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3001 (IV)(first - scan + 1)));
3003 * If there's something expensive in the r.e., find the
3004 * longest literal string that must appear and make it the
3005 * regmust. Resolve ties in favor of later strings, since
3006 * the regstart check works with the beginning of the r.e.
3007 * and avoiding duplication strengthens checking. Not a
3008 * strong reason, but sufficient in the absence of others.
3009 * [Now we resolve ties in favor of the earlier string if
3010 * it happens that c_offset_min has been invalidated, since the
3011 * earlier string may buy us something the later one won't.]
3015 data.longest_fixed = newSVpvs("");
3016 data.longest_float = newSVpvs("");
3017 data.last_found = newSVpvs("");
3018 data.longest = &(data.longest_fixed);
3020 if (!r->regstclass) {
3021 cl_init(pRExC_state, &ch_class);
3022 data.start_class = &ch_class;
3023 stclass_flag = SCF_DO_STCLASS_AND;
3024 } else /* XXXX Check for BOUND? */
3026 data.last_closep = &last_close;
3028 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3029 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3030 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3031 && data.last_start_min == 0 && data.last_end > 0
3032 && !RExC_seen_zerolen
3033 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3034 r->reganch |= ROPT_CHECK_ALL;
3035 scan_commit(pRExC_state, &data);
3036 SvREFCNT_dec(data.last_found);
3038 longest_float_length = CHR_SVLEN(data.longest_float);
3039 if (longest_float_length
3040 || (data.flags & SF_FL_BEFORE_EOL
3041 && (!(data.flags & SF_FL_BEFORE_MEOL)
3042 || (RExC_flags & PMf_MULTILINE)))) {
3045 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3046 && data.offset_fixed == data.offset_float_min
3047 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3048 goto remove_float; /* As in (a)+. */
3050 if (SvUTF8(data.longest_float)) {
3051 r->float_utf8 = data.longest_float;
3052 r->float_substr = NULL;
3054 r->float_substr = data.longest_float;
3055 r->float_utf8 = NULL;
3057 r->float_min_offset = data.offset_float_min;
3058 r->float_max_offset = data.offset_float_max;
3059 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3060 && (!(data.flags & SF_FL_BEFORE_MEOL)
3061 || (RExC_flags & PMf_MULTILINE)));
3062 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3066 r->float_substr = r->float_utf8 = NULL;
3067 SvREFCNT_dec(data.longest_float);
3068 longest_float_length = 0;
3071 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3072 if (longest_fixed_length
3073 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3074 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3075 || (RExC_flags & PMf_MULTILINE)))) {
3078 if (SvUTF8(data.longest_fixed)) {
3079 r->anchored_utf8 = data.longest_fixed;
3080 r->anchored_substr = NULL;
3082 r->anchored_substr = data.longest_fixed;
3083 r->anchored_utf8 = NULL;
3085 r->anchored_offset = data.offset_fixed;
3086 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3087 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3088 || (RExC_flags & PMf_MULTILINE)));
3089 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3092 r->anchored_substr = r->anchored_utf8 = NULL;
3093 SvREFCNT_dec(data.longest_fixed);
3094 longest_fixed_length = 0;
3097 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3098 r->regstclass = NULL;
3099 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3101 && !(data.start_class->flags & ANYOF_EOS)
3102 && !cl_is_anything(data.start_class))
3104 const I32 n = add_data(pRExC_state, 1, "f");
3106 Newx(RExC_rx->data->data[n], 1,
3107 struct regnode_charclass_class);
3108 StructCopy(data.start_class,
3109 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3110 struct regnode_charclass_class);
3111 r->regstclass = (regnode*)RExC_rx->data->data[n];
3112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3113 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3114 regprop(r, sv, (regnode*)data.start_class);
3115 PerlIO_printf(Perl_debug_log,
3116 "synthetic stclass \"%s\".\n",
3117 SvPVX_const(sv));});
3120 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3121 if (longest_fixed_length > longest_float_length) {
3122 r->check_substr = r->anchored_substr;
3123 r->check_utf8 = r->anchored_utf8;
3124 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3125 if (r->reganch & ROPT_ANCH_SINGLE)
3126 r->reganch |= ROPT_NOSCAN;
3129 r->check_substr = r->float_substr;
3130 r->check_utf8 = r->float_utf8;
3131 r->check_offset_min = data.offset_float_min;
3132 r->check_offset_max = data.offset_float_max;
3134 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3135 This should be changed ASAP! */
3136 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3137 r->reganch |= RE_USE_INTUIT;
3138 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3139 r->reganch |= RE_INTUIT_TAIL;
3143 /* Several toplevels. Best we can is to set minlen. */
3145 struct regnode_charclass_class ch_class;
3148 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3149 scan = r->program + 1;
3150 cl_init(pRExC_state, &ch_class);
3151 data.start_class = &ch_class;
3152 data.last_closep = &last_close;
3153 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3154 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3155 = r->float_substr = r->float_utf8 = NULL;
3156 if (!(data.start_class->flags & ANYOF_EOS)
3157 && !cl_is_anything(data.start_class))
3159 const I32 n = add_data(pRExC_state, 1, "f");
3161 Newx(RExC_rx->data->data[n], 1,
3162 struct regnode_charclass_class);
3163 StructCopy(data.start_class,
3164 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3165 struct regnode_charclass_class);
3166 r->regstclass = (regnode*)RExC_rx->data->data[n];
3167 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3168 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3169 regprop(r, sv, (regnode*)data.start_class);
3170 PerlIO_printf(Perl_debug_log,
3171 "synthetic stclass \"%s\".\n",
3172 SvPVX_const(sv));});
3177 if (RExC_seen & REG_SEEN_GPOS)
3178 r->reganch |= ROPT_GPOS_SEEN;
3179 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3180 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3181 if (RExC_seen & REG_SEEN_EVAL)
3182 r->reganch |= ROPT_EVAL_SEEN;
3183 if (RExC_seen & REG_SEEN_CANY)
3184 r->reganch |= ROPT_CANY_SEEN;
3185 Newxz(r->startp, RExC_npar, I32);
3186 Newxz(r->endp, RExC_npar, I32);
3187 DEBUG_COMPILE_r(regdump(r));
3192 - reg - regular expression, i.e. main body or parenthesized thing
3194 * Caller must absorb opening parenthesis.
3196 * Combining parenthesis handling with the base level of regular expression
3197 * is a trifle forced, but the need to tie the tails of the branches to what
3198 * follows makes it hard to avoid.
3201 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3202 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3205 register regnode *ret; /* Will be the head of the group. */
3206 register regnode *br;
3207 register regnode *lastbr;
3208 register regnode *ender = NULL;
3209 register I32 parno = 0;
3211 const I32 oregflags = RExC_flags;
3212 bool have_branch = 0;
3215 /* for (?g), (?gc), and (?o) warnings; warning
3216 about (?c) will warn about (?g) -- japhy */
3218 #define WASTED_O 0x01
3219 #define WASTED_G 0x02
3220 #define WASTED_C 0x04
3221 #define WASTED_GC (0x02|0x04)
3222 I32 wastedflags = 0x00;
3224 char * parse_start = RExC_parse; /* MJD */
3225 char * const oregcomp_parse = RExC_parse;
3227 *flagp = 0; /* Tentatively. */
3230 /* Make an OPEN node, if parenthesized. */
3232 if (*RExC_parse == '?') { /* (?...) */
3233 U32 posflags = 0, negflags = 0;
3234 U32 *flagsp = &posflags;
3235 bool is_logical = 0;
3236 const char * const seqstart = RExC_parse;
3239 paren = *RExC_parse++;
3240 ret = NULL; /* For look-ahead/behind. */
3242 case '<': /* (?<...) */
3243 RExC_seen |= REG_SEEN_LOOKBEHIND;
3244 if (*RExC_parse == '!')
3246 if (*RExC_parse != '=' && *RExC_parse != '!')
3249 case '=': /* (?=...) */
3250 case '!': /* (?!...) */
3251 RExC_seen_zerolen++;
3252 case ':': /* (?:...) */
3253 case '>': /* (?>...) */
3255 case '$': /* (?$...) */
3256 case '@': /* (?@...) */
3257 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3259 case '#': /* (?#...) */
3260 while (*RExC_parse && *RExC_parse != ')')
3262 if (*RExC_parse != ')')
3263 FAIL("Sequence (?#... not terminated");
3264 nextchar(pRExC_state);
3267 case 'p': /* (?p...) */
3268 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3269 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3271 case '?': /* (??...) */
3273 if (*RExC_parse != '{')
3275 paren = *RExC_parse++;
3277 case '{': /* (?{...}) */
3279 I32 count = 1, n = 0;
3281 char *s = RExC_parse;
3283 RExC_seen_zerolen++;
3284 RExC_seen |= REG_SEEN_EVAL;
3285 while (count && (c = *RExC_parse)) {
3296 if (*RExC_parse != ')') {
3298 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3302 OP_4tree *sop, *rop;
3303 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3306 Perl_save_re_context(aTHX);
3307 rop = sv_compile_2op(sv, &sop, "re", &pad);
3308 sop->op_private |= OPpREFCOUNTED;
3309 /* re_dup will OpREFCNT_inc */
3310 OpREFCNT_set(sop, 1);
3313 n = add_data(pRExC_state, 3, "nop");
3314 RExC_rx->data->data[n] = (void*)rop;
3315 RExC_rx->data->data[n+1] = (void*)sop;
3316 RExC_rx->data->data[n+2] = (void*)pad;
3319 else { /* First pass */
3320 if (PL_reginterp_cnt < ++RExC_seen_evals
3322 /* No compiled RE interpolated, has runtime
3323 components ===> unsafe. */
3324 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3325 if (PL_tainting && PL_tainted)
3326 FAIL("Eval-group in insecure regular expression");
3327 if (IN_PERL_COMPILETIME)
3331 nextchar(pRExC_state);
3333 ret = reg_node(pRExC_state, LOGICAL);
3336 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3337 /* deal with the length of this later - MJD */
3340 ret = reganode(pRExC_state, EVAL, n);
3341 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3342 Set_Node_Offset(ret, parse_start);
3345 case '(': /* (?(?{...})...) and (?(?=...)...) */
3347 if (RExC_parse[0] == '?') { /* (?(?...)) */
3348 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3349 || RExC_parse[1] == '<'
3350 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3353 ret = reg_node(pRExC_state, LOGICAL);
3356 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3360 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3363 parno = atoi(RExC_parse++);
3365 while (isDIGIT(*RExC_parse))
3367 ret = reganode(pRExC_state, GROUPP, parno);
3369 if ((c = *nextchar(pRExC_state)) != ')')
3370 vFAIL("Switch condition not recognized");
3372 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3373 br = regbranch(pRExC_state, &flags, 1);
3375 br = reganode(pRExC_state, LONGJMP, 0);
3377 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3378 c = *nextchar(pRExC_state);
3382 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3383 regbranch(pRExC_state, &flags, 1);
3384 regtail(pRExC_state, ret, lastbr);
3387 c = *nextchar(pRExC_state);
3392 vFAIL("Switch (?(condition)... contains too many branches");
3393 ender = reg_node(pRExC_state, TAIL);
3394 regtail(pRExC_state, br, ender);
3396 regtail(pRExC_state, lastbr, ender);
3397 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3400 regtail(pRExC_state, ret, ender);
3404 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3408 RExC_parse--; /* for vFAIL to print correctly */
3409 vFAIL("Sequence (? incomplete");
3413 parse_flags: /* (?i) */
3414 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3415 /* (?g), (?gc) and (?o) are useless here
3416 and must be globally applied -- japhy */
3418 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3419 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3420 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3421 if (! (wastedflags & wflagbit) ) {
3422 wastedflags |= wflagbit;
3425 "Useless (%s%c) - %suse /%c modifier",
3426 flagsp == &negflags ? "?-" : "?",
3428 flagsp == &negflags ? "don't " : "",
3434 else if (*RExC_parse == 'c') {
3435 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3436 if (! (wastedflags & WASTED_C) ) {
3437 wastedflags |= WASTED_GC;
3440 "Useless (%sc) - %suse /gc modifier",
3441 flagsp == &negflags ? "?-" : "?",
3442 flagsp == &negflags ? "don't " : ""
3447 else { pmflag(flagsp, *RExC_parse); }
3451 if (*RExC_parse == '-') {
3453 wastedflags = 0; /* reset so (?g-c) warns twice */
3457 RExC_flags |= posflags;
3458 RExC_flags &= ~negflags;
3459 if (*RExC_parse == ':') {
3465 if (*RExC_parse != ')') {
3467 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3469 nextchar(pRExC_state);
3477 ret = reganode(pRExC_state, OPEN, parno);
3478 Set_Node_Length(ret, 1); /* MJD */
3479 Set_Node_Offset(ret, RExC_parse); /* MJD */
3486 /* Pick up the branches, linking them together. */
3487 parse_start = RExC_parse; /* MJD */
3488 br = regbranch(pRExC_state, &flags, 1);
3489 /* branch_len = (paren != 0); */
3493 if (*RExC_parse == '|') {
3494 if (!SIZE_ONLY && RExC_extralen) {
3495 reginsert(pRExC_state, BRANCHJ, br);
3498 reginsert(pRExC_state, BRANCH, br);
3499 Set_Node_Length(br, paren != 0);
3500 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3504 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3506 else if (paren == ':') {
3507 *flagp |= flags&SIMPLE;
3509 if (is_open) { /* Starts with OPEN. */
3510 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3512 else if (paren != '?') /* Not Conditional */
3514 *flagp |= flags & (SPSTART | HASWIDTH);
3516 while (*RExC_parse == '|') {
3517 if (!SIZE_ONLY && RExC_extralen) {
3518 ender = reganode(pRExC_state, LONGJMP,0);
3519 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3522 RExC_extralen += 2; /* Account for LONGJMP. */
3523 nextchar(pRExC_state);
3524 br = regbranch(pRExC_state, &flags, 0);
3528 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3532 *flagp |= flags&SPSTART;
3535 if (have_branch || paren != ':') {
3536 /* Make a closing node, and hook it on the end. */
3539 ender = reg_node(pRExC_state, TAIL);
3542 ender = reganode(pRExC_state, CLOSE, parno);
3543 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3544 Set_Node_Length(ender,1); /* MJD */
3550 *flagp &= ~HASWIDTH;
3553 ender = reg_node(pRExC_state, SUCCEED);
3556 ender = reg_node(pRExC_state, END);
3559 regtail(pRExC_state, lastbr, ender);
3562 /* Hook the tails of the branches to the closing node. */
3563 for (br = ret; br != NULL; br = regnext(br)) {
3564 regoptail(pRExC_state, br, ender);
3571 static const char parens[] = "=!<,>";
3573 if (paren && (p = strchr(parens, paren))) {
3574 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3575 int flag = (p - parens) > 1;
3578 node = SUSPEND, flag = 0;
3579 reginsert(pRExC_state, node,ret);
3580 Set_Node_Cur_Length(ret);
3581 Set_Node_Offset(ret, parse_start + 1);
3583 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3587 /* Check for proper termination. */
3589 RExC_flags = oregflags;
3590 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3591 RExC_parse = oregcomp_parse;
3592 vFAIL("Unmatched (");
3595 else if (!paren && RExC_parse < RExC_end) {
3596 if (*RExC_parse == ')') {
3598 vFAIL("Unmatched )");
3601 FAIL("Junk on end of regexp"); /* "Can't happen". */
3609 - regbranch - one alternative of an | operator
3611 * Implements the concatenation operator.
3614 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3617 register regnode *ret;
3618 register regnode *chain = NULL;
3619 register regnode *latest;
3620 I32 flags = 0, c = 0;
3625 if (!SIZE_ONLY && RExC_extralen)
3626 ret = reganode(pRExC_state, BRANCHJ,0);
3628 ret = reg_node(pRExC_state, BRANCH);
3629 Set_Node_Length(ret, 1);
3633 if (!first && SIZE_ONLY)
3634 RExC_extralen += 1; /* BRANCHJ */
3636 *flagp = WORST; /* Tentatively. */
3639 nextchar(pRExC_state);
3640 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3642 latest = regpiece(pRExC_state, &flags);
3643 if (latest == NULL) {
3644 if (flags & TRYAGAIN)
3648 else if (ret == NULL)
3650 *flagp |= flags&HASWIDTH;
3651 if (chain == NULL) /* First piece. */
3652 *flagp |= flags&SPSTART;
3655 regtail(pRExC_state, chain, latest);
3660 if (chain == NULL) { /* Loop ran zero times. */
3661 chain = reg_node(pRExC_state, NOTHING);
3666 *flagp |= flags&SIMPLE;
3673 - regpiece - something followed by possible [*+?]
3675 * Note that the branching code sequences used for ? and the general cases
3676 * of * and + are somewhat optimized: they use the same NOTHING node as
3677 * both the endmarker for their branch list and the body of the last branch.
3678 * It might seem that this node could be dispensed with entirely, but the
3679 * endmarker role is not redundant.
3682 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3685 register regnode *ret;
3687 register char *next;
3689 const char * const origparse = RExC_parse;
3692 I32 max = REG_INFTY;
3695 ret = regatom(pRExC_state, &flags);
3697 if (flags & TRYAGAIN)
3704 if (op == '{' && regcurly(RExC_parse)) {
3705 parse_start = RExC_parse; /* MJD */
3706 next = RExC_parse + 1;
3708 while (isDIGIT(*next) || *next == ',') {
3717 if (*next == '}') { /* got one */
3721 min = atoi(RExC_parse);
3725 maxpos = RExC_parse;
3727 if (!max && *maxpos != '0')
3728 max = REG_INFTY; /* meaning "infinity" */
3729 else if (max >= REG_INFTY)
3730 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3732 nextchar(pRExC_state);
3735 if ((flags&SIMPLE)) {
3736 RExC_naughty += 2 + RExC_naughty / 2;
3737 reginsert(pRExC_state, CURLY, ret);
3738 Set_Node_Offset(ret, parse_start+1); /* MJD */
3739 Set_Node_Cur_Length(ret);
3742 regnode *w = reg_node(pRExC_state, WHILEM);
3745 regtail(pRExC_state, ret, w);
3746 if (!SIZE_ONLY && RExC_extralen) {
3747 reginsert(pRExC_state, LONGJMP,ret);
3748 reginsert(pRExC_state, NOTHING,ret);
3749 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3751 reginsert(pRExC_state, CURLYX,ret);
3753 Set_Node_Offset(ret, parse_start+1);
3754 Set_Node_Length(ret,
3755 op == '{' ? (RExC_parse - parse_start) : 1);
3757 if (!SIZE_ONLY && RExC_extralen)
3758 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3759 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3761 RExC_whilem_seen++, RExC_extralen += 3;
3762 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3770 if (max && max < min)
3771 vFAIL("Can't do {n,m} with n > m");
3773 ARG1_SET(ret, (U16)min);
3774 ARG2_SET(ret, (U16)max);
3786 #if 0 /* Now runtime fix should be reliable. */
3788 /* if this is reinstated, don't forget to put this back into perldiag:
3790 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3792 (F) The part of the regexp subject to either the * or + quantifier
3793 could match an empty string. The {#} shows in the regular
3794 expression about where the problem was discovered.
3798 if (!(flags&HASWIDTH) && op != '?')
3799 vFAIL("Regexp *+ operand could be empty");
3802 parse_start = RExC_parse;
3803 nextchar(pRExC_state);
3805 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3807 if (op == '*' && (flags&SIMPLE)) {
3808 reginsert(pRExC_state, STAR, ret);
3812 else if (op == '*') {
3816 else if (op == '+' && (flags&SIMPLE)) {
3817 reginsert(pRExC_state, PLUS, ret);
3821 else if (op == '+') {
3825 else if (op == '?') {
3830 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3832 "%.*s matches null string many times",
3833 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
3837 if (*RExC_parse == '?') {
3838 nextchar(pRExC_state);
3839 reginsert(pRExC_state, MINMOD, ret);
3840 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3842 if (ISMULT2(RExC_parse)) {
3844 vFAIL("Nested quantifiers");
3851 - regatom - the lowest level
3853 * Optimization: gobbles an entire sequence of ordinary characters so that
3854 * it can turn them into a single node, which is smaller to store and
3855 * faster to run. Backslashed characters are exceptions, each becoming a
3856 * separate node; the code is simpler that way and it's not worth fixing.
3858 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3860 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3863 register regnode *ret = NULL;
3865 char *parse_start = RExC_parse;
3867 *flagp = WORST; /* Tentatively. */
3870 switch (*RExC_parse) {
3872 RExC_seen_zerolen++;
3873 nextchar(pRExC_state);
3874 if (RExC_flags & PMf_MULTILINE)
3875 ret = reg_node(pRExC_state, MBOL);
3876 else if (RExC_flags & PMf_SINGLELINE)
3877 ret = reg_node(pRExC_state, SBOL);
3879 ret = reg_node(pRExC_state, BOL);
3880 Set_Node_Length(ret, 1); /* MJD */
3883 nextchar(pRExC_state);
3885 RExC_seen_zerolen++;
3886 if (RExC_flags & PMf_MULTILINE)
3887 ret = reg_node(pRExC_state, MEOL);
3888 else if (RExC_flags & PMf_SINGLELINE)
3889 ret = reg_node(pRExC_state, SEOL);
3891 ret = reg_node(pRExC_state, EOL);
3892 Set_Node_Length(ret, 1); /* MJD */
3895 nextchar(pRExC_state);
3896 if (RExC_flags & PMf_SINGLELINE)
3897 ret = reg_node(pRExC_state, SANY);
3899 ret = reg_node(pRExC_state, REG_ANY);
3900 *flagp |= HASWIDTH|SIMPLE;
3902 Set_Node_Length(ret, 1); /* MJD */
3906 char *oregcomp_parse = ++RExC_parse;
3907 ret = regclass(pRExC_state);
3908 if (*RExC_parse != ']') {
3909 RExC_parse = oregcomp_parse;
3910 vFAIL("Unmatched [");
3912 nextchar(pRExC_state);
3913 *flagp |= HASWIDTH|SIMPLE;
3914 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3918 nextchar(pRExC_state);
3919 ret = reg(pRExC_state, 1, &flags);
3921 if (flags & TRYAGAIN) {
3922 if (RExC_parse == RExC_end) {
3923 /* Make parent create an empty node if needed. */
3931 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3935 if (flags & TRYAGAIN) {
3939 vFAIL("Internal urp");
3940 /* Supposed to be caught earlier. */
3943 if (!regcurly(RExC_parse)) {
3952 vFAIL("Quantifier follows nothing");
3955 switch (*++RExC_parse) {
3957 RExC_seen_zerolen++;
3958 ret = reg_node(pRExC_state, SBOL);
3960 nextchar(pRExC_state);
3961 Set_Node_Length(ret, 2); /* MJD */
3964 ret = reg_node(pRExC_state, GPOS);
3965 RExC_seen |= REG_SEEN_GPOS;
3967 nextchar(pRExC_state);
3968 Set_Node_Length(ret, 2); /* MJD */
3971 ret = reg_node(pRExC_state, SEOL);
3973 RExC_seen_zerolen++; /* Do not optimize RE away */
3974 nextchar(pRExC_state);
3977 ret = reg_node(pRExC_state, EOS);
3979 RExC_seen_zerolen++; /* Do not optimize RE away */
3980 nextchar(pRExC_state);
3981 Set_Node_Length(ret, 2); /* MJD */
3984 ret = reg_node(pRExC_state, CANY);
3985 RExC_seen |= REG_SEEN_CANY;
3986 *flagp |= HASWIDTH|SIMPLE;
3987 nextchar(pRExC_state);
3988 Set_Node_Length(ret, 2); /* MJD */
3991 ret = reg_node(pRExC_state, CLUMP);
3993 nextchar(pRExC_state);
3994 Set_Node_Length(ret, 2); /* MJD */
3997 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
3998 *flagp |= HASWIDTH|SIMPLE;
3999 nextchar(pRExC_state);
4000 Set_Node_Length(ret, 2); /* MJD */
4003 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4004 *flagp |= HASWIDTH|SIMPLE;
4005 nextchar(pRExC_state);
4006 Set_Node_Length(ret, 2); /* MJD */
4009 RExC_seen_zerolen++;
4010 RExC_seen |= REG_SEEN_LOOKBEHIND;
4011 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4013 nextchar(pRExC_state);
4014 Set_Node_Length(ret, 2); /* MJD */
4017 RExC_seen_zerolen++;
4018 RExC_seen |= REG_SEEN_LOOKBEHIND;
4019 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4021 nextchar(pRExC_state);
4022 Set_Node_Length(ret, 2); /* MJD */
4025 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4026 *flagp |= HASWIDTH|SIMPLE;
4027 nextchar(pRExC_state);
4028 Set_Node_Length(ret, 2); /* MJD */
4031 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4032 *flagp |= HASWIDTH|SIMPLE;
4033 nextchar(pRExC_state);
4034 Set_Node_Length(ret, 2); /* MJD */
4037 ret = reg_node(pRExC_state, DIGIT);
4038 *flagp |= HASWIDTH|SIMPLE;
4039 nextchar(pRExC_state);
4040 Set_Node_Length(ret, 2); /* MJD */
4043 ret = reg_node(pRExC_state, NDIGIT);
4044 *flagp |= HASWIDTH|SIMPLE;
4045 nextchar(pRExC_state);
4046 Set_Node_Length(ret, 2); /* MJD */
4051 char* oldregxend = RExC_end;
4052 char* parse_start = RExC_parse - 2;
4054 if (RExC_parse[1] == '{') {
4055 /* a lovely hack--pretend we saw [\pX] instead */
4056 RExC_end = strchr(RExC_parse, '}');
4058 U8 c = (U8)*RExC_parse;
4060 RExC_end = oldregxend;
4061 vFAIL2("Missing right brace on \\%c{}", c);
4066 RExC_end = RExC_parse + 2;
4067 if (RExC_end > oldregxend)
4068 RExC_end = oldregxend;
4072 ret = regclass(pRExC_state);
4074 RExC_end = oldregxend;
4077 Set_Node_Offset(ret, parse_start + 2);
4078 Set_Node_Cur_Length(ret);
4079 nextchar(pRExC_state);
4080 *flagp |= HASWIDTH|SIMPLE;
4093 case '1': case '2': case '3': case '4':
4094 case '5': case '6': case '7': case '8': case '9':
4096 const I32 num = atoi(RExC_parse);
4098 if (num > 9 && num >= RExC_npar)
4101 char * parse_start = RExC_parse - 1; /* MJD */
4102 while (isDIGIT(*RExC_parse))
4105 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4106 vFAIL("Reference to nonexistent group");
4108 ret = reganode(pRExC_state,
4109 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4113 /* override incorrect value set in reganode MJD */
4114 Set_Node_Offset(ret, parse_start+1);
4115 Set_Node_Cur_Length(ret); /* MJD */
4117 nextchar(pRExC_state);
4122 if (RExC_parse >= RExC_end)
4123 FAIL("Trailing \\");
4126 /* Do not generate "unrecognized" warnings here, we fall
4127 back into the quick-grab loop below */
4134 if (RExC_flags & PMf_EXTENDED) {
4135 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4136 if (RExC_parse < RExC_end)
4142 register STRLEN len;
4147 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4149 parse_start = RExC_parse - 1;
4155 ret = reg_node(pRExC_state,
4156 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4158 for (len = 0, p = RExC_parse - 1;
4159 len < 127 && p < RExC_end;
4164 if (RExC_flags & PMf_EXTENDED)
4165 p = regwhite(p, RExC_end);
4212 ender = ASCII_TO_NATIVE('\033');
4216 ender = ASCII_TO_NATIVE('\007');
4221 char* const e = strchr(p, '}');
4225 vFAIL("Missing right brace on \\x{}");
4228 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4229 | PERL_SCAN_DISALLOW_PREFIX;
4230 STRLEN numlen = e - p - 1;
4231 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4238 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4240 ender = grok_hex(p, &numlen, &flags, NULL);
4246 ender = UCHARAT(p++);
4247 ender = toCTRL(ender);
4249 case '0': case '1': case '2': case '3':case '4':
4250 case '5': case '6': case '7': case '8':case '9':
4252 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4255 ender = grok_oct(p, &numlen, &flags, NULL);
4265 FAIL("Trailing \\");
4268 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4269 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4270 goto normal_default;
4275 if (UTF8_IS_START(*p) && UTF) {
4277 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4278 &numlen, UTF8_ALLOW_DEFAULT);
4285 if (RExC_flags & PMf_EXTENDED)
4286 p = regwhite(p, RExC_end);
4288 /* Prime the casefolded buffer. */
4289 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4291 if (ISMULT2(p)) { /* Back off on ?+*. */
4298 /* Emit all the Unicode characters. */
4300 for (foldbuf = tmpbuf;
4302 foldlen -= numlen) {
4303 ender = utf8_to_uvchr(foldbuf, &numlen);
4305 reguni(pRExC_state, ender, s, &unilen);
4308 /* In EBCDIC the numlen
4309 * and unilen can differ. */
4311 if (numlen >= foldlen)
4315 break; /* "Can't happen." */
4319 reguni(pRExC_state, ender, s, &unilen);
4328 REGC((char)ender, s++);
4336 /* Emit all the Unicode characters. */
4338 for (foldbuf = tmpbuf;
4340 foldlen -= numlen) {
4341 ender = utf8_to_uvchr(foldbuf, &numlen);
4343 reguni(pRExC_state, ender, s, &unilen);
4346 /* In EBCDIC the numlen
4347 * and unilen can differ. */
4349 if (numlen >= foldlen)
4357 reguni(pRExC_state, ender, s, &unilen);
4366 REGC((char)ender, s++);
4370 Set_Node_Cur_Length(ret); /* MJD */
4371 nextchar(pRExC_state);
4373 /* len is STRLEN which is unsigned, need to copy to signed */
4376 vFAIL("Internal disaster");
4380 if (len == 1 && UNI_IS_INVARIANT(ender))
4385 RExC_size += STR_SZ(len);
4387 RExC_emit += STR_SZ(len);
4392 /* If the encoding pragma is in effect recode the text of
4393 * any EXACT-kind nodes. */
4394 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4395 STRLEN oldlen = STR_LEN(ret);
4396 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4400 if (sv_utf8_downgrade(sv, TRUE)) {
4401 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4402 const STRLEN newlen = SvCUR(sv);
4407 GET_RE_DEBUG_FLAGS_DECL;
4408 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4409 (int)oldlen, STRING(ret),
4411 Copy(s, STRING(ret), newlen, char);
4412 STR_LEN(ret) += newlen - oldlen;
4413 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4415 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4423 S_regwhite(char *p, const char *e)
4428 else if (*p == '#') {
4431 } while (p < e && *p != '\n');
4439 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4440 Character classes ([:foo:]) can also be negated ([:^foo:]).
4441 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4442 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4443 but trigger failures because they are currently unimplemented. */
4445 #define POSIXCC_DONE(c) ((c) == ':')
4446 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4447 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4450 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4453 I32 namedclass = OOB_NAMEDCLASS;
4455 if (value == '[' && RExC_parse + 1 < RExC_end &&
4456 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4457 POSIXCC(UCHARAT(RExC_parse))) {
4458 const char c = UCHARAT(RExC_parse);
4459 char* const s = RExC_parse++;
4461 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4463 if (RExC_parse == RExC_end)
4464 /* Grandfather lone [:, [=, [. */
4467 const char* t = RExC_parse++; /* skip over the c */
4468 const char *posixcc;
4472 if (UCHARAT(RExC_parse) == ']') {
4473 RExC_parse++; /* skip over the ending ] */
4476 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4477 const I32 skip = t - posixcc;
4479 /* Initially switch on the length of the name. */
4482 if (memEQ(posixcc, "word", 4)) {
4483 /* this is not POSIX, this is the Perl \w */;
4485 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4489 /* Names all of length 5. */
4490 /* alnum alpha ascii blank cntrl digit graph lower
4491 print punct space upper */
4492 /* Offset 4 gives the best switch position. */
4493 switch (posixcc[4]) {
4495 if (memEQ(posixcc, "alph", 4)) {
4498 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4502 if (memEQ(posixcc, "spac", 4)) {
4505 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4509 if (memEQ(posixcc, "grap", 4)) {
4512 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4516 if (memEQ(posixcc, "asci", 4)) {
4519 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4523 if (memEQ(posixcc, "blan", 4)) {
4526 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4530 if (memEQ(posixcc, "cntr", 4)) {
4533 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4537 if (memEQ(posixcc, "alnu", 4)) {
4540 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4544 if (memEQ(posixcc, "lowe", 4)) {
4547 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4549 if (memEQ(posixcc, "uppe", 4)) {
4552 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4556 if (memEQ(posixcc, "digi", 4)) {
4559 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4561 if (memEQ(posixcc, "prin", 4)) {
4564 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4566 if (memEQ(posixcc, "punc", 4)) {
4569 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4575 if (memEQ(posixcc, "xdigit", 6)) {
4577 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4582 if (namedclass == OOB_NAMEDCLASS)
4584 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4587 assert (posixcc[skip] == ':');
4588 assert (posixcc[skip+1] == ']');
4589 } else if (!SIZE_ONLY) {
4590 /* [[=foo=]] and [[.foo.]] are still future. */
4592 /* adjust RExC_parse so the warning shows after
4594 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4596 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4599 /* Maternal grandfather:
4600 * "[:" ending in ":" but not in ":]" */
4610 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4613 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4614 const char *s = RExC_parse;
4615 const char c = *s++;
4617 while(*s && isALNUM(*s))
4619 if (*s && c == *s && s[1] == ']') {
4620 if (ckWARN(WARN_REGEXP))
4622 "POSIX syntax [%c %c] belongs inside character classes",
4625 /* [[=foo=]] and [[.foo.]] are still future. */
4626 if (POSIXCC_NOTYET(c)) {
4627 /* adjust RExC_parse so the error shows after
4629 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4631 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4638 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4642 register UV nextvalue;
4643 register IV prevvalue = OOB_UNICODE;
4644 register IV range = 0;
4645 register regnode *ret;
4648 char *rangebegin = NULL;
4649 bool need_class = 0;
4653 bool optimize_invert = TRUE;
4654 AV* unicode_alternate = NULL;
4656 UV literal_endpoint = 0;
4659 ret = reganode(pRExC_state, ANYOF, 0);
4662 ANYOF_FLAGS(ret) = 0;
4664 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4668 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4672 RExC_size += ANYOF_SKIP;
4673 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4676 RExC_emit += ANYOF_SKIP;
4678 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4680 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4681 ANYOF_BITMAP_ZERO(ret);
4682 listsv = newSVpvs("# comment\n");
4685 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4687 if (!SIZE_ONLY && POSIXCC(nextvalue))
4688 checkposixcc(pRExC_state);
4690 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4691 if (UCHARAT(RExC_parse) == ']')
4694 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4698 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4701 rangebegin = RExC_parse;
4703 value = utf8n_to_uvchr((U8*)RExC_parse,
4704 RExC_end - RExC_parse,
4705 &numlen, UTF8_ALLOW_DEFAULT);
4706 RExC_parse += numlen;
4709 value = UCHARAT(RExC_parse++);
4710 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4711 if (value == '[' && POSIXCC(nextvalue))
4712 namedclass = regpposixcc(pRExC_state, value);
4713 else if (value == '\\') {
4715 value = utf8n_to_uvchr((U8*)RExC_parse,
4716 RExC_end - RExC_parse,
4717 &numlen, UTF8_ALLOW_DEFAULT);
4718 RExC_parse += numlen;
4721 value = UCHARAT(RExC_parse++);
4722 /* Some compilers cannot handle switching on 64-bit integer
4723 * values, therefore value cannot be an UV. Yes, this will
4724 * be a problem later if we want switch on Unicode.
4725 * A similar issue a little bit later when switching on
4726 * namedclass. --jhi */
4727 switch ((I32)value) {
4728 case 'w': namedclass = ANYOF_ALNUM; break;
4729 case 'W': namedclass = ANYOF_NALNUM; break;
4730 case 's': namedclass = ANYOF_SPACE; break;
4731 case 'S': namedclass = ANYOF_NSPACE; break;
4732 case 'd': namedclass = ANYOF_DIGIT; break;
4733 case 'D': namedclass = ANYOF_NDIGIT; break;
4736 if (RExC_parse >= RExC_end)
4737 vFAIL2("Empty \\%c{}", (U8)value);
4738 if (*RExC_parse == '{') {
4739 const U8 c = (U8)value;
4740 e = strchr(RExC_parse++, '}');
4742 vFAIL2("Missing right brace on \\%c{}", c);
4743 while (isSPACE(UCHARAT(RExC_parse)))
4745 if (e == RExC_parse)
4746 vFAIL2("Empty \\%c{}", c);
4748 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4756 if (UCHARAT(RExC_parse) == '^') {
4759 value = value == 'p' ? 'P' : 'p'; /* toggle */
4760 while (isSPACE(UCHARAT(RExC_parse))) {
4765 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
4766 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
4769 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4770 namedclass = ANYOF_MAX; /* no official name, but it's named */
4772 case 'n': value = '\n'; break;
4773 case 'r': value = '\r'; break;
4774 case 't': value = '\t'; break;
4775 case 'f': value = '\f'; break;
4776 case 'b': value = '\b'; break;
4777 case 'e': value = ASCII_TO_NATIVE('\033');break;
4778 case 'a': value = ASCII_TO_NATIVE('\007');break;
4780 if (*RExC_parse == '{') {
4781 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4782 | PERL_SCAN_DISALLOW_PREFIX;
4783 e = strchr(RExC_parse++, '}');
4785 vFAIL("Missing right brace on \\x{}");
4787 numlen = e - RExC_parse;
4788 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4792 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4794 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4795 RExC_parse += numlen;
4799 value = UCHARAT(RExC_parse++);
4800 value = toCTRL(value);
4802 case '0': case '1': case '2': case '3': case '4':
4803 case '5': case '6': case '7': case '8': case '9':
4807 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4808 RExC_parse += numlen;
4812 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4814 "Unrecognized escape \\%c in character class passed through",
4818 } /* end of \blah */
4824 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4826 if (!SIZE_ONLY && !need_class)
4827 ANYOF_CLASS_ZERO(ret);
4831 /* a bad range like a-\d, a-[:digit:] ? */
4834 if (ckWARN(WARN_REGEXP)) {
4836 RExC_parse >= rangebegin ?
4837 RExC_parse - rangebegin : 0;
4839 "False [] range \"%*.*s\"",
4842 if (prevvalue < 256) {
4843 ANYOF_BITMAP_SET(ret, prevvalue);
4844 ANYOF_BITMAP_SET(ret, '-');
4847 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4848 Perl_sv_catpvf(aTHX_ listsv,
4849 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4853 range = 0; /* this was not a true range */
4857 const char *what = NULL;
4860 if (namedclass > OOB_NAMEDCLASS)
4861 optimize_invert = FALSE;
4862 /* Possible truncation here but in some 64-bit environments
4863 * the compiler gets heartburn about switch on 64-bit values.
4864 * A similar issue a little earlier when switching on value.
4866 switch ((I32)namedclass) {
4869 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4871 for (value = 0; value < 256; value++)
4873 ANYOF_BITMAP_SET(ret, value);
4880 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4882 for (value = 0; value < 256; value++)
4883 if (!isALNUM(value))
4884 ANYOF_BITMAP_SET(ret, value);
4891 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4893 for (value = 0; value < 256; value++)
4894 if (isALNUMC(value))
4895 ANYOF_BITMAP_SET(ret, value);
4902 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4904 for (value = 0; value < 256; value++)
4905 if (!isALNUMC(value))
4906 ANYOF_BITMAP_SET(ret, value);
4913 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4915 for (value = 0; value < 256; value++)
4917 ANYOF_BITMAP_SET(ret, value);
4924 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4926 for (value = 0; value < 256; value++)
4927 if (!isALPHA(value))
4928 ANYOF_BITMAP_SET(ret, value);
4935 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4938 for (value = 0; value < 128; value++)
4939 ANYOF_BITMAP_SET(ret, value);
4941 for (value = 0; value < 256; value++) {
4943 ANYOF_BITMAP_SET(ret, value);
4952 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4955 for (value = 128; value < 256; value++)
4956 ANYOF_BITMAP_SET(ret, value);
4958 for (value = 0; value < 256; value++) {
4959 if (!isASCII(value))
4960 ANYOF_BITMAP_SET(ret, value);
4969 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4971 for (value = 0; value < 256; value++)
4973 ANYOF_BITMAP_SET(ret, value);
4980 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4982 for (value = 0; value < 256; value++)
4983 if (!isBLANK(value))
4984 ANYOF_BITMAP_SET(ret, value);
4991 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
4993 for (value = 0; value < 256; value++)
4995 ANYOF_BITMAP_SET(ret, value);
5002 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5004 for (value = 0; value < 256; value++)
5005 if (!isCNTRL(value))
5006 ANYOF_BITMAP_SET(ret, value);
5013 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5015 /* consecutive digits assumed */
5016 for (value = '0'; value <= '9'; value++)
5017 ANYOF_BITMAP_SET(ret, value);
5024 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5026 /* consecutive digits assumed */
5027 for (value = 0; value < '0'; value++)
5028 ANYOF_BITMAP_SET(ret, value);
5029 for (value = '9' + 1; value < 256; value++)
5030 ANYOF_BITMAP_SET(ret, value);
5037 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5039 for (value = 0; value < 256; value++)
5041 ANYOF_BITMAP_SET(ret, value);
5048 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5050 for (value = 0; value < 256; value++)
5051 if (!isGRAPH(value))
5052 ANYOF_BITMAP_SET(ret, value);
5059 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5061 for (value = 0; value < 256; value++)
5063 ANYOF_BITMAP_SET(ret, value);
5070 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5072 for (value = 0; value < 256; value++)
5073 if (!isLOWER(value))
5074 ANYOF_BITMAP_SET(ret, value);
5081 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5083 for (value = 0; value < 256; value++)
5085 ANYOF_BITMAP_SET(ret, value);
5092 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5094 for (value = 0; value < 256; value++)
5095 if (!isPRINT(value))
5096 ANYOF_BITMAP_SET(ret, value);
5103 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5105 for (value = 0; value < 256; value++)
5106 if (isPSXSPC(value))
5107 ANYOF_BITMAP_SET(ret, value);
5114 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5116 for (value = 0; value < 256; value++)
5117 if (!isPSXSPC(value))
5118 ANYOF_BITMAP_SET(ret, value);
5125 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5127 for (value = 0; value < 256; value++)
5129 ANYOF_BITMAP_SET(ret, value);
5136 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5138 for (value = 0; value < 256; value++)
5139 if (!isPUNCT(value))
5140 ANYOF_BITMAP_SET(ret, value);
5147 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5149 for (value = 0; value < 256; value++)
5151 ANYOF_BITMAP_SET(ret, value);
5158 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5160 for (value = 0; value < 256; value++)
5161 if (!isSPACE(value))
5162 ANYOF_BITMAP_SET(ret, value);
5169 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5171 for (value = 0; value < 256; value++)
5173 ANYOF_BITMAP_SET(ret, value);
5180 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5182 for (value = 0; value < 256; value++)
5183 if (!isUPPER(value))
5184 ANYOF_BITMAP_SET(ret, value);
5191 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5193 for (value = 0; value < 256; value++)
5194 if (isXDIGIT(value))
5195 ANYOF_BITMAP_SET(ret, value);
5202 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5204 for (value = 0; value < 256; value++)
5205 if (!isXDIGIT(value))
5206 ANYOF_BITMAP_SET(ret, value);
5212 /* this is to handle \p and \P */
5215 vFAIL("Invalid [::] class");
5219 /* Strings such as "+utf8::isWord\n" */
5220 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5223 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5226 } /* end of namedclass \blah */
5229 if (prevvalue > (IV)value) /* b-a */ {
5230 const int w = RExC_parse - rangebegin;
5231 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5232 range = 0; /* not a valid range */
5236 prevvalue = value; /* save the beginning of the range */
5237 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5238 RExC_parse[1] != ']') {
5241 /* a bad range like \w-, [:word:]- ? */
5242 if (namedclass > OOB_NAMEDCLASS) {
5243 if (ckWARN(WARN_REGEXP)) {
5245 RExC_parse >= rangebegin ?
5246 RExC_parse - rangebegin : 0;
5248 "False [] range \"%*.*s\"",
5252 ANYOF_BITMAP_SET(ret, '-');
5254 range = 1; /* yeah, it's a range! */
5255 continue; /* but do it the next time */
5259 /* now is the next time */
5263 if (prevvalue < 256) {
5264 const IV ceilvalue = value < 256 ? value : 255;
5267 /* In EBCDIC [\x89-\x91] should include
5268 * the \x8e but [i-j] should not. */
5269 if (literal_endpoint == 2 &&
5270 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5271 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5273 if (isLOWER(prevvalue)) {
5274 for (i = prevvalue; i <= ceilvalue; i++)
5276 ANYOF_BITMAP_SET(ret, i);
5278 for (i = prevvalue; i <= ceilvalue; i++)
5280 ANYOF_BITMAP_SET(ret, i);
5285 for (i = prevvalue; i <= ceilvalue; i++)
5286 ANYOF_BITMAP_SET(ret, i);
5288 if (value > 255 || UTF) {
5289 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5290 const UV natvalue = NATIVE_TO_UNI(value);
5292 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5293 if (prevnatvalue < natvalue) { /* what about > ? */
5294 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5295 prevnatvalue, natvalue);
5297 else if (prevnatvalue == natvalue) {
5298 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5300 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5302 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5304 /* If folding and foldable and a single
5305 * character, insert also the folded version
5306 * to the charclass. */
5308 if (foldlen == (STRLEN)UNISKIP(f))
5309 Perl_sv_catpvf(aTHX_ listsv,
5312 /* Any multicharacter foldings
5313 * require the following transform:
5314 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5315 * where E folds into "pq" and F folds
5316 * into "rst", all other characters
5317 * fold to single characters. We save
5318 * away these multicharacter foldings,
5319 * to be later saved as part of the
5320 * additional "s" data. */
5323 if (!unicode_alternate)
5324 unicode_alternate = newAV();
5325 sv = newSVpvn((char*)foldbuf, foldlen);
5327 av_push(unicode_alternate, sv);
5331 /* If folding and the value is one of the Greek
5332 * sigmas insert a few more sigmas to make the
5333 * folding rules of the sigmas to work right.
5334 * Note that not all the possible combinations
5335 * are handled here: some of them are handled
5336 * by the standard folding rules, and some of
5337 * them (literal or EXACTF cases) are handled
5338 * during runtime in regexec.c:S_find_byclass(). */
5339 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5340 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5341 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5342 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5343 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5345 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5346 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5347 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5352 literal_endpoint = 0;
5356 range = 0; /* this range (if it was one) is done now */
5360 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5362 RExC_size += ANYOF_CLASS_ADD_SKIP;
5364 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5367 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5369 /* If the only flag is folding (plus possibly inversion). */
5370 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5372 for (value = 0; value < 256; ++value) {
5373 if (ANYOF_BITMAP_TEST(ret, value)) {
5374 UV fold = PL_fold[value];
5377 ANYOF_BITMAP_SET(ret, fold);
5380 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5383 /* optimize inverted simple patterns (e.g. [^a-z]) */
5384 if (!SIZE_ONLY && optimize_invert &&
5385 /* If the only flag is inversion. */
5386 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5387 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5388 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5389 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5393 AV * const av = newAV();
5396 /* The 0th element stores the character class description
5397 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5398 * to initialize the appropriate swash (which gets stored in
5399 * the 1st element), and also useful for dumping the regnode.
5400 * The 2nd element stores the multicharacter foldings,
5401 * used later (regexec.c:S_reginclass()). */
5402 av_store(av, 0, listsv);
5403 av_store(av, 1, NULL);
5404 av_store(av, 2, (SV*)unicode_alternate);
5405 rv = newRV_noinc((SV*)av);
5406 n = add_data(pRExC_state, 1, "s");
5407 RExC_rx->data->data[n] = (void*)rv;
5415 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5417 char* const retval = RExC_parse++;
5420 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5421 RExC_parse[2] == '#') {
5422 while (*RExC_parse != ')') {
5423 if (RExC_parse == RExC_end)
5424 FAIL("Sequence (?#... not terminated");
5430 if (RExC_flags & PMf_EXTENDED) {
5431 if (isSPACE(*RExC_parse)) {
5435 else if (*RExC_parse == '#') {
5436 while (RExC_parse < RExC_end)
5437 if (*RExC_parse++ == '\n') break;
5446 - reg_node - emit a node
5448 STATIC regnode * /* Location. */
5449 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5452 register regnode *ptr;
5453 regnode * const ret = RExC_emit;
5456 SIZE_ALIGN(RExC_size);
5461 NODE_ALIGN_FILL(ret);
5463 FILL_ADVANCE_NODE(ptr, op);
5464 if (RExC_offsets) { /* MJD */
5465 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5466 "reg_node", __LINE__,
5468 RExC_emit - RExC_emit_start > RExC_offsets[0]
5469 ? "Overwriting end of array!\n" : "OK",
5470 RExC_emit - RExC_emit_start,
5471 RExC_parse - RExC_start,
5473 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5482 - reganode - emit a node with an argument
5484 STATIC regnode * /* Location. */
5485 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5488 register regnode *ptr;
5489 regnode * const ret = RExC_emit;
5492 SIZE_ALIGN(RExC_size);
5497 NODE_ALIGN_FILL(ret);
5499 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5500 if (RExC_offsets) { /* MJD */
5501 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5505 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5506 "Overwriting end of array!\n" : "OK",
5507 RExC_emit - RExC_emit_start,
5508 RExC_parse - RExC_start,
5510 Set_Cur_Node_Offset;
5519 - reguni - emit (if appropriate) a Unicode character
5522 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5525 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5529 - reginsert - insert an operator in front of already-emitted operand
5531 * Means relocating the operand.
5534 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5537 register regnode *src;
5538 register regnode *dst;
5539 register regnode *place;
5540 const int offset = regarglen[(U8)op];
5542 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5545 RExC_size += NODE_STEP_REGNODE + offset;
5550 RExC_emit += NODE_STEP_REGNODE + offset;
5552 while (src > opnd) {
5553 StructCopy(--src, --dst, regnode);
5554 if (RExC_offsets) { /* MJD 20010112 */
5555 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5559 dst - RExC_emit_start > RExC_offsets[0]
5560 ? "Overwriting end of array!\n" : "OK",
5561 src - RExC_emit_start,
5562 dst - RExC_emit_start,
5564 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5565 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5570 place = opnd; /* Op node, where operand used to be. */
5571 if (RExC_offsets) { /* MJD */
5572 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5576 place - RExC_emit_start > RExC_offsets[0]
5577 ? "Overwriting end of array!\n" : "OK",
5578 place - RExC_emit_start,
5579 RExC_parse - RExC_start,
5581 Set_Node_Offset(place, RExC_parse);
5582 Set_Node_Length(place, 1);
5584 src = NEXTOPER(place);
5585 FILL_ADVANCE_NODE(place, op);
5586 Zero(src, offset, regnode);
5590 - regtail - set the next-pointer at the end of a node chain of p to val.
5592 /* TODO: All three parms should be const */
5594 S_regtail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
5597 register regnode *scan;
5602 /* Find last node. */
5605 regnode * const temp = regnext(scan);
5611 if (reg_off_by_arg[OP(scan)]) {
5612 ARG_SET(scan, val - scan);
5615 NEXT_OFF(scan) = val - scan;
5620 - regoptail - regtail on operand of first argument; nop if operandless
5622 /* TODO: All three parms should be const */
5624 S_regoptail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
5627 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5628 if (p == NULL || SIZE_ONLY)
5630 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5631 regtail(pRExC_state, NEXTOPER(p), val);
5633 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5634 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5641 - regcurly - a little FSA that accepts {\d+,?\d*}
5644 S_regcurly(register const char *s)
5663 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5666 Perl_regdump(pTHX_ const regexp *r)
5670 SV * const sv = sv_newmortal();
5672 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
5674 /* Header fields of interest. */
5675 if (r->anchored_substr)
5676 PerlIO_printf(Perl_debug_log,
5677 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5679 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5680 SvPVX_const(r->anchored_substr),
5682 SvTAIL(r->anchored_substr) ? "$" : "",
5683 (IV)r->anchored_offset);
5684 else if (r->anchored_utf8)
5685 PerlIO_printf(Perl_debug_log,
5686 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5688 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5689 SvPVX_const(r->anchored_utf8),
5691 SvTAIL(r->anchored_utf8) ? "$" : "",
5692 (IV)r->anchored_offset);
5693 if (r->float_substr)
5694 PerlIO_printf(Perl_debug_log,
5695 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5697 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5698 SvPVX_const(r->float_substr),
5700 SvTAIL(r->float_substr) ? "$" : "",
5701 (IV)r->float_min_offset, (UV)r->float_max_offset);
5702 else if (r->float_utf8)
5703 PerlIO_printf(Perl_debug_log,
5704 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5706 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5707 SvPVX_const(r->float_utf8),
5709 SvTAIL(r->float_utf8) ? "$" : "",
5710 (IV)r->float_min_offset, (UV)r->float_max_offset);
5711 if (r->check_substr || r->check_utf8)
5712 PerlIO_printf(Perl_debug_log,
5713 r->check_substr == r->float_substr
5714 && r->check_utf8 == r->float_utf8
5715 ? "(checking floating" : "(checking anchored");
5716 if (r->reganch & ROPT_NOSCAN)
5717 PerlIO_printf(Perl_debug_log, " noscan");
5718 if (r->reganch & ROPT_CHECK_ALL)
5719 PerlIO_printf(Perl_debug_log, " isall");
5720 if (r->check_substr || r->check_utf8)
5721 PerlIO_printf(Perl_debug_log, ") ");
5723 if (r->regstclass) {
5724 regprop(r, sv, r->regstclass);
5725 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5727 if (r->reganch & ROPT_ANCH) {
5728 PerlIO_printf(Perl_debug_log, "anchored");
5729 if (r->reganch & ROPT_ANCH_BOL)
5730 PerlIO_printf(Perl_debug_log, "(BOL)");
5731 if (r->reganch & ROPT_ANCH_MBOL)
5732 PerlIO_printf(Perl_debug_log, "(MBOL)");
5733 if (r->reganch & ROPT_ANCH_SBOL)
5734 PerlIO_printf(Perl_debug_log, "(SBOL)");
5735 if (r->reganch & ROPT_ANCH_GPOS)
5736 PerlIO_printf(Perl_debug_log, "(GPOS)");
5737 PerlIO_putc(Perl_debug_log, ' ');
5739 if (r->reganch & ROPT_GPOS_SEEN)
5740 PerlIO_printf(Perl_debug_log, "GPOS ");
5741 if (r->reganch & ROPT_SKIP)
5742 PerlIO_printf(Perl_debug_log, "plus ");
5743 if (r->reganch & ROPT_IMPLICIT)
5744 PerlIO_printf(Perl_debug_log, "implicit ");
5745 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5746 if (r->reganch & ROPT_EVAL_SEEN)
5747 PerlIO_printf(Perl_debug_log, "with eval ");
5748 PerlIO_printf(Perl_debug_log, "\n");
5750 const U32 len = r->offsets[0];
5751 GET_RE_DEBUG_FLAGS_DECL;
5754 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5755 for (i = 1; i <= len; i++)
5756 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5757 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5758 PerlIO_printf(Perl_debug_log, "\n");
5762 PERL_UNUSED_CONTEXT;
5764 #endif /* DEBUGGING */
5768 - regprop - printable representation of opcode
5771 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
5777 sv_setpvn(sv, "", 0);
5778 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5779 /* It would be nice to FAIL() here, but this may be called from
5780 regexec.c, and it would be hard to supply pRExC_state. */
5781 Perl_croak(aTHX_ "Corrupted regexp opcode");
5782 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5784 k = PL_regkind[(U8)OP(o)];
5787 SV * const dsv = sv_2mortal(newSVpvs(""));
5788 /* Using is_utf8_string() is a crude hack but it may
5789 * be the best for now since we have no flag "this EXACTish
5790 * node was UTF-8" --jhi */
5791 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5792 const char * const s = do_utf8 ?
5793 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5794 UNI_DISPLAY_REGEX) :
5796 const int len = do_utf8 ?
5799 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5803 } else if (k == TRIE) {
5805 /* print the details od the trie in dumpuntil instead, as
5806 * prog->data isn't available here */
5807 } else if (k == CURLY) {
5808 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5809 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5810 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5812 else if (k == WHILEM && o->flags) /* Ordinal/of */
5813 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5814 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5815 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5816 else if (k == LOGICAL)
5817 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5818 else if (k == ANYOF) {
5819 int i, rangestart = -1;
5820 const U8 flags = ANYOF_FLAGS(o);
5822 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5823 static const char * const anyofs[] = {
5856 if (flags & ANYOF_LOCALE)
5857 sv_catpvs(sv, "{loc}");
5858 if (flags & ANYOF_FOLD)
5859 sv_catpvs(sv, "{i}");
5860 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5861 if (flags & ANYOF_INVERT)
5863 for (i = 0; i <= 256; i++) {
5864 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5865 if (rangestart == -1)
5867 } else if (rangestart != -1) {
5868 if (i <= rangestart + 3)
5869 for (; rangestart < i; rangestart++)
5870 put_byte(sv, rangestart);
5872 put_byte(sv, rangestart);
5874 put_byte(sv, i - 1);
5880 if (o->flags & ANYOF_CLASS)
5881 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5882 if (ANYOF_CLASS_TEST(o,i))
5883 sv_catpv(sv, anyofs[i]);
5885 if (flags & ANYOF_UNICODE)
5886 sv_catpvs(sv, "{unicode}");
5887 else if (flags & ANYOF_UNICODE_ALL)
5888 sv_catpvs(sv, "{unicode_all}");
5892 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
5896 U8 s[UTF8_MAXBYTES_CASE+1];
5898 for (i = 0; i <= 256; i++) { /* just the first 256 */
5899 uvchr_to_utf8(s, i);
5901 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5902 if (rangestart == -1)
5904 } else if (rangestart != -1) {
5905 if (i <= rangestart + 3)
5906 for (; rangestart < i; rangestart++) {
5907 const U8 * const e = uvchr_to_utf8(s,rangestart);
5909 for(p = s; p < e; p++)
5913 const U8 *e = uvchr_to_utf8(s,rangestart);
5915 for (p = s; p < e; p++)
5918 e = uvchr_to_utf8(s, i-1);
5919 for (p = s; p < e; p++)
5926 sv_catpvs(sv, "..."); /* et cetera */
5930 char *s = savesvpv(lv);
5931 char * const origs = s;
5933 while(*s && *s != '\n') s++;
5936 const char * const t = ++s;
5954 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5956 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5957 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5959 PERL_UNUSED_CONTEXT;
5960 PERL_UNUSED_ARG(sv);
5962 #endif /* DEBUGGING */
5966 Perl_re_intuit_string(pTHX_ regexp *prog)
5967 { /* Assume that RE_INTUIT is set */
5969 GET_RE_DEBUG_FLAGS_DECL;
5970 PERL_UNUSED_CONTEXT;
5974 const char * const s = SvPV_nolen_const(prog->check_substr
5975 ? prog->check_substr : prog->check_utf8);
5977 if (!PL_colorset) reginitcolors();
5978 PerlIO_printf(Perl_debug_log,
5979 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5981 prog->check_substr ? "" : "utf8 ",
5982 PL_colors[5],PL_colors[0],
5985 (strlen(s) > 60 ? "..." : ""));
5988 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5992 Perl_pregfree(pTHX_ struct regexp *r)
5996 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5998 GET_RE_DEBUG_FLAGS_DECL;
6000 if (!r || (--r->refcnt > 0))
6002 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6003 const char * const s = (r->reganch & ROPT_UTF8)
6004 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6005 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6006 const int len = SvCUR(dsv);
6009 PerlIO_printf(Perl_debug_log,
6010 "%sFreeing REx:%s %s%*.*s%s%s\n",
6011 PL_colors[4],PL_colors[5],PL_colors[0],
6014 len > 60 ? "..." : "");
6017 /* gcov results gave these as non-null 100% of the time, so there's no
6018 optimisation in checking them before calling Safefree */
6019 Safefree(r->precomp);
6020 Safefree(r->offsets); /* 20010421 MJD */
6021 RX_MATCH_COPY_FREE(r);
6022 #ifdef PERL_OLD_COPY_ON_WRITE
6024 SvREFCNT_dec(r->saved_copy);
6027 if (r->anchored_substr)
6028 SvREFCNT_dec(r->anchored_substr);
6029 if (r->anchored_utf8)
6030 SvREFCNT_dec(r->anchored_utf8);
6031 if (r->float_substr)
6032 SvREFCNT_dec(r->float_substr);
6034 SvREFCNT_dec(r->float_utf8);
6035 Safefree(r->substrs);
6038 int n = r->data->count;
6039 PAD* new_comppad = NULL;
6044 /* If you add a ->what type here, update the comment in regcomp.h */
6045 switch (r->data->what[n]) {
6047 SvREFCNT_dec((SV*)r->data->data[n]);
6050 Safefree(r->data->data[n]);
6053 new_comppad = (AV*)r->data->data[n];
6056 if (new_comppad == NULL)
6057 Perl_croak(aTHX_ "panic: pregfree comppad");
6058 PAD_SAVE_LOCAL(old_comppad,
6059 /* Watch out for global destruction's random ordering. */
6060 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6063 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6066 op_free((OP_4tree*)r->data->data[n]);
6068 PAD_RESTORE_LOCAL(old_comppad);
6069 SvREFCNT_dec((SV*)new_comppad);
6076 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6079 refcount = --trie->refcount;
6082 Safefree(trie->charmap);
6083 if (trie->widecharmap)
6084 SvREFCNT_dec((SV*)trie->widecharmap);
6085 Safefree(trie->states);
6086 Safefree(trie->trans);
6089 SvREFCNT_dec((SV*)trie->words);
6090 if (trie->revcharmap)
6091 SvREFCNT_dec((SV*)trie->revcharmap);
6093 Safefree(r->data->data[n]); /* do this last!!!! */
6098 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6101 Safefree(r->data->what);
6104 Safefree(r->startp);
6110 - regnext - dig the "next" pointer out of a node
6113 Perl_regnext(pTHX_ register regnode *p)
6116 register I32 offset;
6118 if (p == &PL_regdummy)
6121 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6129 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6132 STRLEN l1 = strlen(pat1);
6133 STRLEN l2 = strlen(pat2);
6136 const char *message;
6142 Copy(pat1, buf, l1 , char);
6143 Copy(pat2, buf + l1, l2 , char);
6144 buf[l1 + l2] = '\n';
6145 buf[l1 + l2 + 1] = '\0';
6147 /* ANSI variant takes additional second argument */
6148 va_start(args, pat2);
6152 msv = vmess(buf, &args);
6154 message = SvPV_const(msv,l1);
6157 Copy(message, buf, l1 , char);
6158 buf[l1-1] = '\0'; /* Overwrite \n */
6159 Perl_croak(aTHX_ "%s", buf);
6162 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6165 Perl_save_re_context(pTHX)
6169 struct re_save_state *state;
6171 SAVEVPTR(PL_curcop);
6172 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6174 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6175 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6176 SSPUSHINT(SAVEt_RE_STATE);
6178 Copy(&PL_reg_state, state, 1, struct re_save_state);
6180 PL_reg_start_tmp = 0;
6181 PL_reg_start_tmpl = 0;
6182 PL_reg_oldsaved = NULL;
6183 PL_reg_oldsavedlen = 0;
6185 PL_reg_leftiter = 0;
6186 PL_reg_poscache = NULL;
6187 PL_reg_poscache_size = 0;
6188 #ifdef PERL_OLD_COPY_ON_WRITE
6192 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6194 const REGEXP * const rx = PM_GETRE(PL_curpm);
6197 for (i = 1; i <= rx->nparens; i++) {
6198 char digits[TYPE_CHARS(long)];
6199 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6200 GV *const *const gvp
6201 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6204 GV * const gv = *gvp;
6205 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6214 clear_re(pTHX_ void *r)
6217 ReREFCNT_dec((regexp *)r);
6223 S_put_byte(pTHX_ SV *sv, int c)
6225 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6226 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6227 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6228 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6230 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6234 STATIC const regnode *
6235 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6236 const regnode *last, SV* sv, I32 l)
6239 register U8 op = EXACT; /* Arbitrary non-END op. */
6240 register const regnode *next;
6242 while (op != END && (!last || node < last)) {
6243 /* While that wasn't END last time... */
6249 next = regnext((regnode *)node);
6251 if (OP(node) == OPTIMIZED)
6253 regprop(r, sv, node);
6254 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6255 (int)(2*l + 1), "", SvPVX_const(sv));
6256 if (next == NULL) /* Next ptr. */
6257 PerlIO_printf(Perl_debug_log, "(0)");
6259 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6260 (void)PerlIO_putc(Perl_debug_log, '\n');
6262 if (PL_regkind[(U8)op] == BRANCHJ) {
6263 register const regnode *nnode = (OP(next) == LONGJMP
6264 ? regnext((regnode *)next)
6266 if (last && nnode > last)
6268 node = dumpuntil(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6270 else if (PL_regkind[(U8)op] == BRANCH) {
6271 node = dumpuntil(r, start, NEXTOPER(node), next, sv, l + 1);
6273 else if ( PL_regkind[(U8)op] == TRIE ) {
6274 const I32 n = ARG(node);
6275 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6276 const I32 arry_len = av_len(trie->words)+1;
6278 PerlIO_printf(Perl_debug_log,
6279 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6283 (int)trie->charcount,
6284 trie->uniquecharcount,
6285 (IV)trie->laststate-1,
6286 node->flags ? " EVAL mode" : "");
6288 for (word_idx=0; word_idx < arry_len; word_idx++) {
6289 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6291 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6294 SvPV_nolen_const(*elem_ptr),
6299 PerlIO_printf(Perl_debug_log, "(0)\n");
6301 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6307 node = NEXTOPER(node);
6308 node += regarglen[(U8)op];
6311 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6312 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6313 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6315 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6316 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6319 else if ( op == PLUS || op == STAR) {
6320 node = dumpuntil(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6322 else if (op == ANYOF) {
6323 /* arglen 1 + class block */
6324 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6325 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6326 node = NEXTOPER(node);
6328 else if (PL_regkind[(U8)op] == EXACT) {
6329 /* Literal string, where present. */
6330 node += NODE_SZ_STR(node) - 1;
6331 node = NEXTOPER(node);
6334 node = NEXTOPER(node);
6335 node += regarglen[(U8)op];
6337 if (op == CURLYX || op == OPEN)
6339 else if (op == WHILEM)
6345 #endif /* DEBUGGING */
6349 * c-indentation-style: bsd
6351 * indent-tabs-mode: t
6354 * ex: set ts=8 sts=4 sw=4 noet: