5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
287 #define Simple_vFAIL(m) STMT_START { \
288 const IV offset = RExC_parse - RExC_precomp; \
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
296 #define vFAIL(m) STMT_START { \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
303 * Like Simple_vFAIL(), but accepts two arguments.
305 #define Simple_vFAIL2(m,a1) STMT_START { \
306 const IV offset = RExC_parse - RExC_precomp; \
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
314 #define vFAIL2(m,a1) STMT_START { \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
322 * Like Simple_vFAIL(), but accepts three arguments.
324 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
325 const IV offset = RExC_parse - RExC_precomp; \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
333 #define vFAIL3(m,a1,a2) STMT_START { \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
340 * Like Simple_vFAIL(), but accepts four arguments.
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
343 const IV offset = RExC_parse - RExC_precomp; \
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 #define vWARN(loc,m) STMT_START { \
349 const IV offset = loc - RExC_precomp; \
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
354 #define vWARNdep(loc,m) STMT_START { \
355 const IV offset = loc - RExC_precomp; \
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
362 #define vWARN2(loc, m, a1) STMT_START { \
363 const IV offset = loc - RExC_precomp; \
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
368 #define vWARN3(loc, m, a1, a2) STMT_START { \
369 const IV offset = loc - RExC_precomp; \
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
375 const IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
381 const IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
392 /* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
402 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
409 RExC_offsets[2*(node)-1] = (byte); \
414 #define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
418 #define Set_Node_Length_To_R(node,len) STMT_START { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
421 __LINE__, (int)(node), (int)(len))); \
423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
425 RExC_offsets[2*(node)] = (len); \
430 #define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
440 static void clear_re(pTHX_ void *r);
442 /* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
447 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453 SvSetMagicSV(*data->longest, data->last_found);
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
460 data->flags &= ~SF_FIX_BEFORE_EOL;
463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
464 data->offset_float_max = (l
465 ? data->last_start_max
466 : data->pos_min + data->pos_delta);
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
469 if (data->flags & SF_BEFORE_EOL)
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
473 data->flags &= ~SF_FL_BEFORE_EOL;
476 SvCUR_set(data->last_found, 0);
478 SV * const sv = data->last_found;
480 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
481 if (mg && mg->mg_len > 0)
485 data->flags &= ~SF_BEFORE_EOL;
488 /* Can match anything (initialization) */
490 S_cl_anything(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
492 ANYOF_CLASS_ZERO(cl);
493 ANYOF_BITMAP_SETALL(cl);
494 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
496 cl->flags |= ANYOF_LOCALE;
499 /* Can match anything (initialization) */
501 S_cl_is_anything(const struct regnode_charclass_class *cl)
505 for (value = 0; value <= ANYOF_MAX; value += 2)
506 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
508 if (!(cl->flags & ANYOF_UNICODE_ALL))
510 if (!ANYOF_BITMAP_TESTALLSET(cl))
515 /* Can match anything (initialization) */
517 S_cl_init(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
519 Zero(cl, 1, struct regnode_charclass_class);
521 cl_anything(pRExC_state, cl);
525 S_cl_init_zero(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
527 Zero(cl, 1, struct regnode_charclass_class);
529 cl_anything(pRExC_state, cl);
531 cl->flags |= ANYOF_LOCALE;
534 /* 'And' a given class with another one. Can create false positives */
535 /* We assume that cl is not inverted */
537 S_cl_and(struct regnode_charclass_class *cl,
538 const struct regnode_charclass_class *and_with)
540 if (!(and_with->flags & ANYOF_CLASS)
541 && !(cl->flags & ANYOF_CLASS)
542 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
543 && !(and_with->flags & ANYOF_FOLD)
544 && !(cl->flags & ANYOF_FOLD)) {
547 if (and_with->flags & ANYOF_INVERT)
548 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
549 cl->bitmap[i] &= ~and_with->bitmap[i];
551 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
552 cl->bitmap[i] &= and_with->bitmap[i];
553 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
554 if (!(and_with->flags & ANYOF_EOS))
555 cl->flags &= ~ANYOF_EOS;
557 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
558 !(and_with->flags & ANYOF_INVERT)) {
559 cl->flags &= ~ANYOF_UNICODE_ALL;
560 cl->flags |= ANYOF_UNICODE;
561 ARG_SET(cl, ARG(and_with));
563 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
564 !(and_with->flags & ANYOF_INVERT))
565 cl->flags &= ~ANYOF_UNICODE_ALL;
566 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
567 !(and_with->flags & ANYOF_INVERT))
568 cl->flags &= ~ANYOF_UNICODE;
571 /* 'OR' a given class with another one. Can create false positives */
572 /* We assume that cl is not inverted */
574 S_cl_or(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
576 if (or_with->flags & ANYOF_INVERT) {
578 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
579 * <= (B1 | !B2) | (CL1 | !CL2)
580 * which is wasteful if CL2 is small, but we ignore CL2:
581 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
582 * XXXX Can we handle case-fold? Unclear:
583 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
584 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
586 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
587 && !(or_with->flags & ANYOF_FOLD)
588 && !(cl->flags & ANYOF_FOLD) ) {
591 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
592 cl->bitmap[i] |= ~or_with->bitmap[i];
593 } /* XXXX: logic is complicated otherwise */
595 cl_anything(pRExC_state, cl);
598 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
599 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
600 && (!(or_with->flags & ANYOF_FOLD)
601 || (cl->flags & ANYOF_FOLD)) ) {
604 /* OR char bitmap and class bitmap separately */
605 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
606 cl->bitmap[i] |= or_with->bitmap[i];
607 if (or_with->flags & ANYOF_CLASS) {
608 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
609 cl->classflags[i] |= or_with->classflags[i];
610 cl->flags |= ANYOF_CLASS;
613 else { /* XXXX: logic is complicated, leave it along for a moment. */
614 cl_anything(pRExC_state, cl);
617 if (or_with->flags & ANYOF_EOS)
618 cl->flags |= ANYOF_EOS;
620 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
621 ARG(cl) != ARG(or_with)) {
622 cl->flags |= ANYOF_UNICODE_ALL;
623 cl->flags &= ~ANYOF_UNICODE;
625 if (or_with->flags & ANYOF_UNICODE_ALL) {
626 cl->flags |= ANYOF_UNICODE_ALL;
627 cl->flags &= ~ANYOF_UNICODE;
633 make_trie(startbranch,first,last,tail,flags)
634 startbranch: the first branch in the whole branch sequence
635 first : start branch of sequence of branch-exact nodes.
636 May be the same as startbranch
637 last : Thing following the last branch.
638 May be the same as tail.
639 tail : item following the branch sequence
640 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
642 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
644 A trie is an N'ary tree where the branches are determined by digital
645 decomposition of the key. IE, at the root node you look up the 1st character and
646 follow that branch repeat until you find the end of the branches. Nodes can be
647 marked as "accepting" meaning they represent a complete word. Eg:
651 would convert into the following structure. Numbers represent states, letters
652 following numbers represent valid transitions on the letter from that state, if
653 the number is in square brackets it represents an accepting state, otherwise it
654 will be in parenthesis.
656 +-h->+-e->[3]-+-r->(8)-+-s->[9]
660 (1) +-i->(6)-+-s->[7]
662 +-s->(3)-+-h->(4)-+-e->[5]
664 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
666 This shows that when matching against the string 'hers' we will begin at state 1
667 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
668 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
669 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
670 single traverse. We store a mapping from accepting to state to which word was
671 matched, and then when we have multiple possibilities we try to complete the
672 rest of the regex in the order in which they occured in the alternation.
674 The only prior NFA like behaviour that would be changed by the TRIE support is
675 the silent ignoring of duplicate alternations which are of the form:
677 / (DUPE|DUPE) X? (?{ ... }) Y /x
679 Thus EVAL blocks follwing a trie may be called a different number of times with
680 and without the optimisation. With the optimisations dupes will be silently
681 ignored. This inconsistant behaviour of EVAL type nodes is well established as
682 the following demonstrates:
684 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
686 which prints out 'word' three times, but
688 'words'=~/(word|word|word)(?{ print $1 })S/
690 which doesnt print it out at all. This is due to other optimisations kicking in.
692 Example of what happens on a structural level:
694 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
696 1: CURLYM[1] {1,32767}(18)
707 This would be optimizable with startbranch=5, first=5, last=16, tail=16
708 and should turn into:
710 1: CURLYM[1] {1,32767}(18)
712 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
720 Cases where tail != last would be like /(?foo|bar)baz/:
730 which would be optimizable with startbranch=1, first=1, last=7, tail=8
731 and would end up looking like:
734 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
743 #define TRIE_DEBUG_CHAR \
744 DEBUG_TRIE_COMPILE_r({ \
747 tmp = newSVpvs( "" ); \
748 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
750 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
752 av_push( trie->revcharmap, tmp ); \
755 #define TRIE_READ_CHAR STMT_START { \
758 if ( foldlen > 0 ) { \
759 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
764 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
765 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
766 foldlen -= UNISKIP( uvc ); \
767 scan = foldbuf + UNISKIP( uvc ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
779 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
780 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
781 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
782 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
785 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
786 TRIE_LIST_LEN( state ) *= 2; \
787 Renew( trie->states[ state ].trans.list, \
788 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
790 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
792 TRIE_LIST_CUR( state )++; \
795 #define TRIE_LIST_NEW(state) STMT_START { \
796 Newxz( trie->states[ state ].trans.list, \
797 4, reg_trie_trans_le ); \
798 TRIE_LIST_CUR( state ) = 1; \
799 TRIE_LIST_LEN( state ) = 4; \
803 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
806 /* first pass, loop through and scan words */
809 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
814 /* we just use folder as a flag in utf8 */
815 const U8 * const folder = ( flags == EXACTF
823 const U32 data_slot = add_data( pRExC_state, 1, "t" );
826 GET_RE_DEBUG_FLAGS_DECL;
828 Newxz( trie, 1, reg_trie_data );
830 RExC_rx->data->data[ data_slot ] = (void*)trie;
831 Newxz( trie->charmap, 256, U16 );
833 trie->words = newAV();
834 trie->revcharmap = newAV();
838 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
839 if (!SvIOK(re_trie_maxbuff)) {
840 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
843 /* -- First loop and Setup --
845 We first traverse the branches and scan each word to determine if it
846 contains widechars, and how many unique chars there are, this is
847 important as we have to build a table with at least as many columns as we
850 We use an array of integers to represent the character codes 0..255
851 (trie->charmap) and we use a an HV* to store unicode characters. We use the
852 native representation of the character value as the key and IV's for the
855 *TODO* If we keep track of how many times each character is used we can
856 remap the columns so that the table compression later on is more
857 efficient in terms of memory by ensuring most common value is in the
858 middle and the least common are on the outside. IMO this would be better
859 than a most to least common mapping as theres a decent chance the most
860 common letter will share a node with the least common, meaning the node
861 will not be compressable. With a middle is most common approach the worst
862 case is when we have the least common nodes twice.
867 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
868 regnode * const noper = NEXTOPER( cur );
869 const U8 *uc = (U8*)STRING( noper );
870 const U8 * const e = uc + STR_LEN( noper );
872 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
873 const U8 *scan = (U8*)NULL;
875 for ( ; uc < e ; uc += len ) {
879 if ( !trie->charmap[ uvc ] ) {
880 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
882 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
887 if ( !trie->widecharmap )
888 trie->widecharmap = newHV();
890 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
893 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
895 if ( !SvTRUE( *svpp ) ) {
896 sv_setiv( *svpp, ++trie->uniquecharcount );
902 } /* end first pass */
903 DEBUG_TRIE_COMPILE_r(
904 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
905 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
906 (int)trie->charcount, trie->uniquecharcount )
911 We now know what we are dealing with in terms of unique chars and
912 string sizes so we can calculate how much memory a naive
913 representation using a flat table will take. If it's over a reasonable
914 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
915 conservative but potentially much slower representation using an array
918 At the end we convert both representations into the same compressed
919 form that will be used in regexec.c for matching with. The latter
920 is a form that cannot be used to construct with but has memory
921 properties similar to the list form and access properties similar
922 to the table form making it both suitable for fast searches and
923 small enough that its feasable to store for the duration of a program.
925 See the comment in the code where the compressed table is produced
926 inplace from the flat tabe representation for an explanation of how
927 the compression works.
932 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
934 Second Pass -- Array Of Lists Representation
936 Each state will be represented by a list of charid:state records
937 (reg_trie_trans_le) the first such element holds the CUR and LEN
938 points of the allocated array. (See defines above).
940 We build the initial structure using the lists, and then convert
941 it into the compressed table form which allows faster lookups
942 (but cant be modified once converted).
948 STRLEN transcount = 1;
950 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
954 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
956 regnode * const noper = NEXTOPER( cur );
957 U8 *uc = (U8*)STRING( noper );
958 const U8 * const e = uc + STR_LEN( noper );
959 U32 state = 1; /* required init */
960 U16 charid = 0; /* sanity init */
961 U8 *scan = (U8*)NULL; /* sanity init */
962 STRLEN foldlen = 0; /* required init */
963 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
965 for ( ; uc < e ; uc += len ) {
970 charid = trie->charmap[ uvc ];
972 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
976 charid=(U16)SvIV( *svpp );
985 if ( !trie->states[ state ].trans.list ) {
986 TRIE_LIST_NEW( state );
988 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
989 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
990 newstate = TRIE_LIST_ITEM( state, check ).newstate;
995 newstate = next_alloc++;
996 TRIE_LIST_PUSH( state, charid, newstate );
1001 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1003 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1006 if ( !trie->states[ state ].wordnum ) {
1007 /* we havent inserted this word into the structure yet. */
1008 trie->states[ state ].wordnum = ++curword;
1011 /* store the word for dumping */
1012 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1013 if ( UTF ) SvUTF8_on( tmp );
1014 av_push( trie->words, tmp );
1018 /*EMPTY*/; /* It's a dupe. So ignore it. */
1021 } /* end second pass */
1023 trie->laststate = next_alloc;
1024 Renew( trie->states, next_alloc, reg_trie_state );
1026 DEBUG_TRIE_COMPILE_MORE_r({
1029 /* print out the table precompression. */
1031 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1032 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1034 for( state=1 ; state < next_alloc ; state ++ ) {
1037 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1038 if ( ! trie->states[ state ].wordnum ) {
1039 PerlIO_printf( Perl_debug_log, "%5s| ","");
1041 PerlIO_printf( Perl_debug_log, "W%04x| ",
1042 trie->states[ state ].wordnum
1045 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1046 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1047 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1048 SvPV_nolen_const( *tmp ),
1049 TRIE_LIST_ITEM(state,charid).forid,
1050 (UV)TRIE_LIST_ITEM(state,charid).newstate
1055 PerlIO_printf( Perl_debug_log, "\n\n" );
1058 Newxz( trie->trans, transcount ,reg_trie_trans );
1065 for( state=1 ; state < next_alloc ; state ++ ) {
1069 DEBUG_TRIE_COMPILE_MORE_r(
1070 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1074 if (trie->states[state].trans.list) {
1075 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1079 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1080 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1081 if ( forid < minid ) {
1083 } else if ( forid > maxid ) {
1087 if ( transcount < tp + maxid - minid + 1) {
1089 Renew( trie->trans, transcount, reg_trie_trans );
1090 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1092 base = trie->uniquecharcount + tp - minid;
1093 if ( maxid == minid ) {
1095 for ( ; zp < tp ; zp++ ) {
1096 if ( ! trie->trans[ zp ].next ) {
1097 base = trie->uniquecharcount + zp - minid;
1098 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1099 trie->trans[ zp ].check = state;
1105 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1106 trie->trans[ tp ].check = state;
1111 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1112 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1113 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1114 trie->trans[ tid ].check = state;
1116 tp += ( maxid - minid + 1 );
1118 Safefree(trie->states[ state ].trans.list);
1121 DEBUG_TRIE_COMPILE_MORE_r(
1122 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1125 trie->states[ state ].trans.base=base;
1127 trie->lasttrans = tp + 1;
1131 Second Pass -- Flat Table Representation.
1133 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1134 We know that we will need Charcount+1 trans at most to store the data
1135 (one row per char at worst case) So we preallocate both structures
1136 assuming worst case.
1138 We then construct the trie using only the .next slots of the entry
1141 We use the .check field of the first entry of the node temporarily to
1142 make compression both faster and easier by keeping track of how many non
1143 zero fields are in the node.
1145 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1148 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1149 number representing the first entry of the node, and state as a
1150 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1151 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1152 are 2 entrys per node. eg:
1160 The table is internally in the right hand, idx form. However as we also
1161 have to deal with the states array which is indexed by nodenum we have to
1162 use TRIE_NODENUM() to convert.
1166 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1168 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1169 next_alloc = trie->uniquecharcount + 1;
1171 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1173 regnode * const noper = NEXTOPER( cur );
1174 const U8 *uc = (U8*)STRING( noper );
1175 const U8 * const e = uc + STR_LEN( noper );
1177 U32 state = 1; /* required init */
1179 U16 charid = 0; /* sanity init */
1180 U32 accept_state = 0; /* sanity init */
1181 U8 *scan = (U8*)NULL; /* sanity init */
1183 STRLEN foldlen = 0; /* required init */
1184 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1187 for ( ; uc < e ; uc += len ) {
1192 charid = trie->charmap[ uvc ];
1194 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1195 charid = svpp ? (U16)SvIV(*svpp) : 0;
1199 if ( !trie->trans[ state + charid ].next ) {
1200 trie->trans[ state + charid ].next = next_alloc;
1201 trie->trans[ state ].check++;
1202 next_alloc += trie->uniquecharcount;
1204 state = trie->trans[ state + charid ].next;
1206 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1208 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1211 accept_state = TRIE_NODENUM( state );
1212 if ( !trie->states[ accept_state ].wordnum ) {
1213 /* we havent inserted this word into the structure yet. */
1214 trie->states[ accept_state ].wordnum = ++curword;
1217 /* store the word for dumping */
1218 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1219 if ( UTF ) SvUTF8_on( tmp );
1220 av_push( trie->words, tmp );
1224 /*EMPTY*/; /* Its a dupe. So ignore it. */
1227 } /* end second pass */
1229 DEBUG_TRIE_COMPILE_MORE_r({
1231 print out the table precompression so that we can do a visual check
1232 that they are identical.
1236 PerlIO_printf( Perl_debug_log, "\nChar : " );
1238 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1239 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1241 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1245 PerlIO_printf( Perl_debug_log, "\nState+-" );
1247 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1248 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1251 PerlIO_printf( Perl_debug_log, "\n" );
1253 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1255 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1257 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1258 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1259 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1261 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1262 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1264 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1265 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1268 PerlIO_printf( Perl_debug_log, "\n\n" );
1272 * Inplace compress the table.*
1274 For sparse data sets the table constructed by the trie algorithm will
1275 be mostly 0/FAIL transitions or to put it another way mostly empty.
1276 (Note that leaf nodes will not contain any transitions.)
1278 This algorithm compresses the tables by eliminating most such
1279 transitions, at the cost of a modest bit of extra work during lookup:
1281 - Each states[] entry contains a .base field which indicates the
1282 index in the state[] array wheres its transition data is stored.
1284 - If .base is 0 there are no valid transitions from that node.
1286 - If .base is nonzero then charid is added to it to find an entry in
1289 -If trans[states[state].base+charid].check!=state then the
1290 transition is taken to be a 0/Fail transition. Thus if there are fail
1291 transitions at the front of the node then the .base offset will point
1292 somewhere inside the previous nodes data (or maybe even into a node
1293 even earlier), but the .check field determines if the transition is
1296 The following process inplace converts the table to the compressed
1297 table: We first do not compress the root node 1,and mark its all its
1298 .check pointers as 1 and set its .base pointer as 1 as well. This
1299 allows to do a DFA construction from the compressed table later, and
1300 ensures that any .base pointers we calculate later are greater than
1303 - We set 'pos' to indicate the first entry of the second node.
1305 - We then iterate over the columns of the node, finding the first and
1306 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1307 and set the .check pointers accordingly, and advance pos
1308 appropriately and repreat for the next node. Note that when we copy
1309 the next pointers we have to convert them from the original
1310 NODEIDX form to NODENUM form as the former is not valid post
1313 - If a node has no transitions used we mark its base as 0 and do not
1314 advance the pos pointer.
1316 - If a node only has one transition we use a second pointer into the
1317 structure to fill in allocated fail transitions from other states.
1318 This pointer is independent of the main pointer and scans forward
1319 looking for null transitions that are allocated to a state. When it
1320 finds one it writes the single transition into the "hole". If the
1321 pointer doesnt find one the single transition is appeneded as normal.
1323 - Once compressed we can Renew/realloc the structures to release the
1326 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1327 specifically Fig 3.47 and the associated pseudocode.
1331 const U32 laststate = TRIE_NODENUM( next_alloc );
1334 trie->laststate = laststate;
1336 for ( state = 1 ; state < laststate ; state++ ) {
1338 const U32 stateidx = TRIE_NODEIDX( state );
1339 const U32 o_used = trie->trans[ stateidx ].check;
1340 U32 used = trie->trans[ stateidx ].check;
1341 trie->trans[ stateidx ].check = 0;
1343 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1344 if ( flag || trie->trans[ stateidx + charid ].next ) {
1345 if ( trie->trans[ stateidx + charid ].next ) {
1347 for ( ; zp < pos ; zp++ ) {
1348 if ( ! trie->trans[ zp ].next ) {
1352 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1353 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1354 trie->trans[ zp ].check = state;
1355 if ( ++zp > pos ) pos = zp;
1362 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1364 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1365 trie->trans[ pos ].check = state;
1370 trie->lasttrans = pos + 1;
1371 Renew( trie->states, laststate + 1, reg_trie_state);
1372 DEBUG_TRIE_COMPILE_MORE_r(
1373 PerlIO_printf( Perl_debug_log,
1374 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1375 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1378 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1381 } /* end table compress */
1383 /* resize the trans array to remove unused space */
1384 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1386 DEBUG_TRIE_COMPILE_r({
1389 Now we print it out again, in a slightly different form as there is additional
1390 info we want to be able to see when its compressed. They are close enough for
1391 visual comparison though.
1393 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1395 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1396 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1398 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1401 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1403 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1404 PerlIO_printf( Perl_debug_log, "-----");
1405 PerlIO_printf( Perl_debug_log, "\n");
1407 for( state = 1 ; state < trie->laststate ; state++ ) {
1408 const U32 base = trie->states[ state ].trans.base;
1410 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1412 if ( trie->states[ state ].wordnum ) {
1413 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1415 PerlIO_printf( Perl_debug_log, "%6s", "" );
1418 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1423 while( ( base + ofs < trie->uniquecharcount ) ||
1424 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1425 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1428 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1430 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1431 if ( ( base + ofs >= trie->uniquecharcount ) &&
1432 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1433 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1435 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1436 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1438 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1442 PerlIO_printf( Perl_debug_log, "]");
1445 PerlIO_printf( Perl_debug_log, "\n" );
1450 /* now finally we "stitch in" the new TRIE node
1451 This means we convert either the first branch or the first Exact,
1452 depending on whether the thing following (in 'last') is a branch
1453 or not and whther first is the startbranch (ie is it a sub part of
1454 the alternation or is it the whole thing.)
1455 Assuming its a sub part we conver the EXACT otherwise we convert
1456 the whole branch sequence, including the first.
1463 if ( first == startbranch && OP( last ) != BRANCH ) {
1466 convert = NEXTOPER( first );
1467 NEXT_OFF( first ) = (U16)(last - first);
1470 OP( convert ) = TRIE + (U8)( flags - EXACT );
1471 NEXT_OFF( convert ) = (U16)(tail - convert);
1472 ARG_SET( convert, data_slot );
1474 /* tells us if we need to handle accept buffers specially */
1475 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1478 /* needed for dumping*/
1480 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1481 /* We now need to mark all of the space originally used by the
1482 branches as optimized away. This keeps the dumpuntil from
1483 throwing a wobbly as it doesnt use regnext() to traverse the
1486 while( optimize < last ) {
1487 OP( optimize ) = OPTIMIZED;
1491 } /* end node insert */
1498 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1499 * These need to be revisited when a newer toolchain becomes available.
1501 #if defined(__sparc64__) && defined(__GNUC__)
1502 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1503 # undef SPARC64_GCC_WORKAROUND
1504 # define SPARC64_GCC_WORKAROUND 1
1508 /* REx optimizer. Converts nodes into quickier variants "in place".
1509 Finds fixed substrings. */
1511 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1512 to the position after last scanned or to NULL. */
1516 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1517 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1518 /* scanp: Start here (read-write). */
1519 /* deltap: Write maxlen-minlen here. */
1520 /* last: Stop before this one. */
1523 I32 min = 0, pars = 0, code;
1524 regnode *scan = *scanp, *next;
1526 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1527 int is_inf_internal = 0; /* The studied chunk is infinite */
1528 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1529 scan_data_t data_fake;
1530 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1531 SV *re_trie_maxbuff = NULL;
1533 GET_RE_DEBUG_FLAGS_DECL;
1535 while (scan && OP(scan) != END && scan < last) {
1536 /* Peephole optimizer: */
1538 SV * const mysv=sv_newmortal();
1539 regprop( mysv, scan);
1540 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1541 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1544 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1545 /* Merge several consecutive EXACTish nodes into one. */
1546 regnode *n = regnext(scan);
1549 regnode *stop = scan;
1552 next = scan + NODE_SZ_STR(scan);
1553 /* Skip NOTHING, merge EXACT*. */
1555 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1556 (stringok && (OP(n) == OP(scan))))
1558 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1559 if (OP(n) == TAIL || n > next)
1561 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1562 NEXT_OFF(scan) += NEXT_OFF(n);
1563 next = n + NODE_STEP_REGNODE;
1570 else if (stringok) {
1571 const int oldl = STR_LEN(scan);
1572 regnode * const nnext = regnext(n);
1574 if (oldl + STR_LEN(n) > U8_MAX)
1576 NEXT_OFF(scan) += NEXT_OFF(n);
1577 STR_LEN(scan) += STR_LEN(n);
1578 next = n + NODE_SZ_STR(n);
1579 /* Now we can overwrite *n : */
1580 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1588 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1590 Two problematic code points in Unicode casefolding of EXACT nodes:
1592 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1593 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1599 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1600 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1602 This means that in case-insensitive matching (or "loose matching",
1603 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1604 length of the above casefolded versions) can match a target string
1605 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1606 This would rather mess up the minimum length computation.
1608 What we'll do is to look for the tail four bytes, and then peek
1609 at the preceding two bytes to see whether we need to decrease
1610 the minimum length by four (six minus two).
1612 Thanks to the design of UTF-8, there cannot be false matches:
1613 A sequence of valid UTF-8 bytes cannot be a subsequence of
1614 another valid sequence of UTF-8 bytes.
1617 char * const s0 = STRING(scan), *s, *t;
1618 char * const s1 = s0 + STR_LEN(scan) - 1;
1619 char * const s2 = s1 - 4;
1620 const char * const t0 = "\xcc\x88\xcc\x81";
1621 const char * const t1 = t0 + 3;
1624 s < s2 && (t = ninstr(s, s1, t0, t1));
1626 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1627 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1634 n = scan + NODE_SZ_STR(scan);
1636 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1647 /* Follow the next-chain of the current node and optimize
1648 away all the NOTHINGs from it. */
1649 if (OP(scan) != CURLYX) {
1650 const int max = (reg_off_by_arg[OP(scan)]
1652 /* I32 may be smaller than U16 on CRAYs! */
1653 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1654 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1658 /* Skip NOTHING and LONGJMP. */
1659 while ((n = regnext(n))
1660 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1661 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1662 && off + noff < max)
1664 if (reg_off_by_arg[OP(scan)])
1667 NEXT_OFF(scan) = off;
1670 /* The principal pseudo-switch. Cannot be a switch, since we
1671 look into several different things. */
1672 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1673 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1674 next = regnext(scan);
1676 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1678 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1679 I32 max1 = 0, min1 = I32_MAX, num = 0;
1680 struct regnode_charclass_class accum;
1681 regnode *startbranch=scan;
1683 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1684 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1685 if (flags & SCF_DO_STCLASS)
1686 cl_init_zero(pRExC_state, &accum);
1688 while (OP(scan) == code) {
1689 I32 deltanext, minnext, f = 0, fake;
1690 struct regnode_charclass_class this_class;
1693 data_fake.flags = 0;
1695 data_fake.whilem_c = data->whilem_c;
1696 data_fake.last_closep = data->last_closep;
1699 data_fake.last_closep = &fake;
1700 next = regnext(scan);
1701 scan = NEXTOPER(scan);
1703 scan = NEXTOPER(scan);
1704 if (flags & SCF_DO_STCLASS) {
1705 cl_init(pRExC_state, &this_class);
1706 data_fake.start_class = &this_class;
1707 f = SCF_DO_STCLASS_AND;
1709 if (flags & SCF_WHILEM_VISITED_POS)
1710 f |= SCF_WHILEM_VISITED_POS;
1712 /* we suppose the run is continuous, last=next...*/
1713 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1714 next, &data_fake, f,depth+1);
1717 if (max1 < minnext + deltanext)
1718 max1 = minnext + deltanext;
1719 if (deltanext == I32_MAX)
1720 is_inf = is_inf_internal = 1;
1722 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1724 if (data && (data_fake.flags & SF_HAS_EVAL))
1725 data->flags |= SF_HAS_EVAL;
1727 data->whilem_c = data_fake.whilem_c;
1728 if (flags & SCF_DO_STCLASS)
1729 cl_or(pRExC_state, &accum, &this_class);
1730 if (code == SUSPEND)
1733 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1735 if (flags & SCF_DO_SUBSTR) {
1736 data->pos_min += min1;
1737 data->pos_delta += max1 - min1;
1738 if (max1 != min1 || is_inf)
1739 data->longest = &(data->longest_float);
1742 delta += max1 - min1;
1743 if (flags & SCF_DO_STCLASS_OR) {
1744 cl_or(pRExC_state, data->start_class, &accum);
1746 cl_and(data->start_class, &and_with);
1747 flags &= ~SCF_DO_STCLASS;
1750 else if (flags & SCF_DO_STCLASS_AND) {
1752 cl_and(data->start_class, &accum);
1753 flags &= ~SCF_DO_STCLASS;
1756 /* Switch to OR mode: cache the old value of
1757 * data->start_class */
1758 StructCopy(data->start_class, &and_with,
1759 struct regnode_charclass_class);
1760 flags &= ~SCF_DO_STCLASS_AND;
1761 StructCopy(&accum, data->start_class,
1762 struct regnode_charclass_class);
1763 flags |= SCF_DO_STCLASS_OR;
1764 data->start_class->flags |= ANYOF_EOS;
1770 Assuming this was/is a branch we are dealing with: 'scan' now
1771 points at the item that follows the branch sequence, whatever
1772 it is. We now start at the beginning of the sequence and look
1778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1780 If we can find such a subseqence we need to turn the first
1781 element into a trie and then add the subsequent branch exact
1782 strings to the trie.
1786 1. patterns where the whole set of branch can be converted to a trie,
1788 2. patterns where only a subset of the alternations can be
1789 converted to a trie.
1791 In case 1 we can replace the whole set with a single regop
1792 for the trie. In case 2 we need to keep the start and end
1795 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1796 becomes BRANCH TRIE; BRANCH X;
1798 Hypthetically when we know the regex isnt anchored we can
1799 turn a case 1 into a DFA and let it rip... Every time it finds a match
1800 it would just call its tail, no WHILEM/CURLY needed.
1804 if (!re_trie_maxbuff) {
1805 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1806 if (!SvIOK(re_trie_maxbuff))
1807 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1809 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1811 regnode *first = (regnode *)NULL;
1812 regnode *last = (regnode *)NULL;
1813 regnode *tail = scan;
1818 SV * const mysv = sv_newmortal(); /* for dumping */
1820 /* var tail is used because there may be a TAIL
1821 regop in the way. Ie, the exacts will point to the
1822 thing following the TAIL, but the last branch will
1823 point at the TAIL. So we advance tail. If we
1824 have nested (?:) we may have to move through several
1828 while ( OP( tail ) == TAIL ) {
1829 /* this is the TAIL generated by (?:) */
1830 tail = regnext( tail );
1834 regprop( mysv, tail );
1835 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1836 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1837 (RExC_seen_evals) ? "[EVAL]" : ""
1842 step through the branches, cur represents each
1843 branch, noper is the first thing to be matched
1844 as part of that branch and noper_next is the
1845 regnext() of that node. if noper is an EXACT
1846 and noper_next is the same as scan (our current
1847 position in the regex) then the EXACT branch is
1848 a possible optimization target. Once we have
1849 two or more consequetive such branches we can
1850 create a trie of the EXACT's contents and stich
1851 it in place. If the sequence represents all of
1852 the branches we eliminate the whole thing and
1853 replace it with a single TRIE. If it is a
1854 subsequence then we need to stitch it in. This
1855 means the first branch has to remain, and needs
1856 to be repointed at the item on the branch chain
1857 following the last branch optimized. This could
1858 be either a BRANCH, in which case the
1859 subsequence is internal, or it could be the
1860 item following the branch sequence in which
1861 case the subsequence is at the end.
1865 /* dont use tail as the end marker for this traverse */
1866 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1867 regnode * const noper = NEXTOPER( cur );
1868 regnode * const noper_next = regnext( noper );
1871 regprop( mysv, cur);
1872 PerlIO_printf( Perl_debug_log, "%*s%s",
1873 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1875 regprop( mysv, noper);
1876 PerlIO_printf( Perl_debug_log, " -> %s",
1877 SvPV_nolen_const(mysv));
1880 regprop( mysv, noper_next );
1881 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1882 SvPV_nolen_const(mysv));
1884 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1887 if ( ( first ? OP( noper ) == optype
1888 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1889 && noper_next == tail && count<U16_MAX)
1894 optype = OP( noper );
1898 regprop( mysv, first);
1899 PerlIO_printf( Perl_debug_log, "%*s%s",
1900 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1901 regprop( mysv, NEXTOPER(first) );
1902 PerlIO_printf( Perl_debug_log, " -> %s\n",
1903 SvPV_nolen_const( mysv ) );
1908 regprop( mysv, cur);
1909 PerlIO_printf( Perl_debug_log, "%*s%s",
1910 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1911 regprop( mysv, noper );
1912 PerlIO_printf( Perl_debug_log, " -> %s\n",
1913 SvPV_nolen_const( mysv ) );
1919 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1920 (int)depth * 2 + 2, "E:", "**END**" );
1922 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1924 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1925 && noper_next == tail )
1929 optype = OP( noper );
1939 regprop( mysv, cur);
1940 PerlIO_printf( Perl_debug_log,
1941 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1942 " ", SvPV_nolen_const( mysv ), first, last, cur);
1947 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1948 (int)depth * 2 + 2, "E:", "==END==" );
1950 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1956 scan = NEXTOPER(NEXTOPER(scan));
1957 } else /* single branch is optimized. */
1958 scan = NEXTOPER(scan);
1961 else if (OP(scan) == EXACT) {
1962 I32 l = STR_LEN(scan);
1965 const U8 * const s = (U8*)STRING(scan);
1966 l = utf8_length(s, s + l);
1967 uc = utf8_to_uvchr(s, NULL);
1969 uc = *((U8*)STRING(scan));
1972 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1973 /* The code below prefers earlier match for fixed
1974 offset, later match for variable offset. */
1975 if (data->last_end == -1) { /* Update the start info. */
1976 data->last_start_min = data->pos_min;
1977 data->last_start_max = is_inf
1978 ? I32_MAX : data->pos_min + data->pos_delta;
1980 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
1982 SV * const sv = data->last_found;
1983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1984 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1985 if (mg && mg->mg_len >= 0)
1986 mg->mg_len += utf8_length((U8*)STRING(scan),
1987 (U8*)STRING(scan)+STR_LEN(scan));
1990 SvUTF8_on(data->last_found);
1991 data->last_end = data->pos_min + l;
1992 data->pos_min += l; /* As in the first entry. */
1993 data->flags &= ~SF_BEFORE_EOL;
1995 if (flags & SCF_DO_STCLASS_AND) {
1996 /* Check whether it is compatible with what we know already! */
2000 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2001 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2002 && (!(data->start_class->flags & ANYOF_FOLD)
2003 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2006 ANYOF_CLASS_ZERO(data->start_class);
2007 ANYOF_BITMAP_ZERO(data->start_class);
2009 ANYOF_BITMAP_SET(data->start_class, uc);
2010 data->start_class->flags &= ~ANYOF_EOS;
2012 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2014 else if (flags & SCF_DO_STCLASS_OR) {
2015 /* false positive possible if the class is case-folded */
2017 ANYOF_BITMAP_SET(data->start_class, uc);
2019 data->start_class->flags |= ANYOF_UNICODE_ALL;
2020 data->start_class->flags &= ~ANYOF_EOS;
2021 cl_and(data->start_class, &and_with);
2023 flags &= ~SCF_DO_STCLASS;
2025 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2026 I32 l = STR_LEN(scan);
2027 UV uc = *((U8*)STRING(scan));
2029 /* Search for fixed substrings supports EXACT only. */
2030 if (flags & SCF_DO_SUBSTR)
2031 scan_commit(pRExC_state, data);
2033 const U8 * const s = (U8 *)STRING(scan);
2034 l = utf8_length(s, s + l);
2035 uc = utf8_to_uvchr(s, NULL);
2038 if (data && (flags & SCF_DO_SUBSTR))
2040 if (flags & SCF_DO_STCLASS_AND) {
2041 /* Check whether it is compatible with what we know already! */
2045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2049 ANYOF_CLASS_ZERO(data->start_class);
2050 ANYOF_BITMAP_ZERO(data->start_class);
2052 ANYOF_BITMAP_SET(data->start_class, uc);
2053 data->start_class->flags &= ~ANYOF_EOS;
2054 data->start_class->flags |= ANYOF_FOLD;
2055 if (OP(scan) == EXACTFL)
2056 data->start_class->flags |= ANYOF_LOCALE;
2059 else if (flags & SCF_DO_STCLASS_OR) {
2060 if (data->start_class->flags & ANYOF_FOLD) {
2061 /* false positive possible if the class is case-folded.
2062 Assume that the locale settings are the same... */
2064 ANYOF_BITMAP_SET(data->start_class, uc);
2065 data->start_class->flags &= ~ANYOF_EOS;
2067 cl_and(data->start_class, &and_with);
2069 flags &= ~SCF_DO_STCLASS;
2071 else if (strchr((const char*)PL_varies,OP(scan))) {
2072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2073 I32 f = flags, pos_before = 0;
2074 regnode *oscan = scan;
2075 struct regnode_charclass_class this_class;
2076 struct regnode_charclass_class *oclass = NULL;
2077 I32 next_is_eval = 0;
2079 switch (PL_regkind[(U8)OP(scan)]) {
2080 case WHILEM: /* End of (?:...)* . */
2081 scan = NEXTOPER(scan);
2084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2085 next = NEXTOPER(scan);
2086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2088 maxcount = REG_INFTY;
2089 next = regnext(scan);
2090 scan = NEXTOPER(scan);
2094 if (flags & SCF_DO_SUBSTR)
2099 if (flags & SCF_DO_STCLASS) {
2101 maxcount = REG_INFTY;
2102 next = regnext(scan);
2103 scan = NEXTOPER(scan);
2106 is_inf = is_inf_internal = 1;
2107 scan = regnext(scan);
2108 if (flags & SCF_DO_SUBSTR) {
2109 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2110 data->longest = &(data->longest_float);
2112 goto optimize_curly_tail;
2114 mincount = ARG1(scan);
2115 maxcount = ARG2(scan);
2116 next = regnext(scan);
2117 if (OP(scan) == CURLYX) {
2118 I32 lp = (data ? *(data->last_closep) : 0);
2119 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2121 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2122 next_is_eval = (OP(scan) == EVAL);
2124 if (flags & SCF_DO_SUBSTR) {
2125 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2126 pos_before = data->pos_min;
2130 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2132 data->flags |= SF_IS_INF;
2134 if (flags & SCF_DO_STCLASS) {
2135 cl_init(pRExC_state, &this_class);
2136 oclass = data->start_class;
2137 data->start_class = &this_class;
2138 f |= SCF_DO_STCLASS_AND;
2139 f &= ~SCF_DO_STCLASS_OR;
2141 /* These are the cases when once a subexpression
2142 fails at a particular position, it cannot succeed
2143 even after backtracking at the enclosing scope.
2145 XXXX what if minimal match and we are at the
2146 initial run of {n,m}? */
2147 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2148 f &= ~SCF_WHILEM_VISITED_POS;
2150 /* This will finish on WHILEM, setting scan, or on NULL: */
2151 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2153 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2155 if (flags & SCF_DO_STCLASS)
2156 data->start_class = oclass;
2157 if (mincount == 0 || minnext == 0) {
2158 if (flags & SCF_DO_STCLASS_OR) {
2159 cl_or(pRExC_state, data->start_class, &this_class);
2161 else if (flags & SCF_DO_STCLASS_AND) {
2162 /* Switch to OR mode: cache the old value of
2163 * data->start_class */
2164 StructCopy(data->start_class, &and_with,
2165 struct regnode_charclass_class);
2166 flags &= ~SCF_DO_STCLASS_AND;
2167 StructCopy(&this_class, data->start_class,
2168 struct regnode_charclass_class);
2169 flags |= SCF_DO_STCLASS_OR;
2170 data->start_class->flags |= ANYOF_EOS;
2172 } else { /* Non-zero len */
2173 if (flags & SCF_DO_STCLASS_OR) {
2174 cl_or(pRExC_state, data->start_class, &this_class);
2175 cl_and(data->start_class, &and_with);
2177 else if (flags & SCF_DO_STCLASS_AND)
2178 cl_and(data->start_class, &this_class);
2179 flags &= ~SCF_DO_STCLASS;
2181 if (!scan) /* It was not CURLYX, but CURLY. */
2183 if ( /* ? quantifier ok, except for (?{ ... }) */
2184 (next_is_eval || !(mincount == 0 && maxcount == 1))
2185 && (minnext == 0) && (deltanext == 0)
2186 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2187 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2188 && ckWARN(WARN_REGEXP))
2191 "Quantifier unexpected on zero-length expression");
2194 min += minnext * mincount;
2195 is_inf_internal |= ((maxcount == REG_INFTY
2196 && (minnext + deltanext) > 0)
2197 || deltanext == I32_MAX);
2198 is_inf |= is_inf_internal;
2199 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2201 /* Try powerful optimization CURLYX => CURLYN. */
2202 if ( OP(oscan) == CURLYX && data
2203 && data->flags & SF_IN_PAR
2204 && !(data->flags & SF_HAS_EVAL)
2205 && !deltanext && minnext == 1 ) {
2206 /* Try to optimize to CURLYN. */
2207 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2208 regnode *nxt1 = nxt;
2215 if (!strchr((const char*)PL_simple,OP(nxt))
2216 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2217 && STR_LEN(nxt) == 1))
2223 if (OP(nxt) != CLOSE)
2225 /* Now we know that nxt2 is the only contents: */
2226 oscan->flags = (U8)ARG(nxt);
2228 OP(nxt1) = NOTHING; /* was OPEN. */
2230 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2231 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2232 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2233 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2234 OP(nxt + 1) = OPTIMIZED; /* was count. */
2235 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2240 /* Try optimization CURLYX => CURLYM. */
2241 if ( OP(oscan) == CURLYX && data
2242 && !(data->flags & SF_HAS_PAR)
2243 && !(data->flags & SF_HAS_EVAL)
2244 && !deltanext /* atom is fixed width */
2245 && minnext != 0 /* CURLYM can't handle zero width */
2247 /* XXXX How to optimize if data == 0? */
2248 /* Optimize to a simpler form. */
2249 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2253 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2254 && (OP(nxt2) != WHILEM))
2256 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2257 /* Need to optimize away parenths. */
2258 if (data->flags & SF_IN_PAR) {
2259 /* Set the parenth number. */
2260 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2262 if (OP(nxt) != CLOSE)
2263 FAIL("Panic opt close");
2264 oscan->flags = (U8)ARG(nxt);
2265 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2266 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2268 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2271 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2274 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2275 regnode *nnxt = regnext(nxt1);
2278 if (reg_off_by_arg[OP(nxt1)])
2279 ARG_SET(nxt1, nxt2 - nxt1);
2280 else if (nxt2 - nxt1 < U16_MAX)
2281 NEXT_OFF(nxt1) = nxt2 - nxt1;
2283 OP(nxt) = NOTHING; /* Cannot beautify */
2288 /* Optimize again: */
2289 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2295 else if ((OP(oscan) == CURLYX)
2296 && (flags & SCF_WHILEM_VISITED_POS)
2297 /* See the comment on a similar expression above.
2298 However, this time it not a subexpression
2299 we care about, but the expression itself. */
2300 && (maxcount == REG_INFTY)
2301 && data && ++data->whilem_c < 16) {
2302 /* This stays as CURLYX, we can put the count/of pair. */
2303 /* Find WHILEM (as in regexec.c) */
2304 regnode *nxt = oscan + NEXT_OFF(oscan);
2306 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2308 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2309 | (RExC_whilem_seen << 4)); /* On WHILEM */
2311 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2313 if (flags & SCF_DO_SUBSTR) {
2314 SV *last_str = NULL;
2315 int counted = mincount != 0;
2317 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2318 #if defined(SPARC64_GCC_WORKAROUND)
2321 const char *s = NULL;
2324 if (pos_before >= data->last_start_min)
2327 b = data->last_start_min;
2330 s = SvPV_const(data->last_found, l);
2331 old = b - data->last_start_min;
2334 I32 b = pos_before >= data->last_start_min
2335 ? pos_before : data->last_start_min;
2337 const char *s = SvPV_const(data->last_found, l);
2338 I32 old = b - data->last_start_min;
2342 old = utf8_hop((U8*)s, old) - (U8*)s;
2345 /* Get the added string: */
2346 last_str = newSVpvn(s + old, l);
2348 SvUTF8_on(last_str);
2349 if (deltanext == 0 && pos_before == b) {
2350 /* What was added is a constant string */
2352 SvGROW(last_str, (mincount * l) + 1);
2353 repeatcpy(SvPVX(last_str) + l,
2354 SvPVX_const(last_str), l, mincount - 1);
2355 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2356 /* Add additional parts. */
2357 SvCUR_set(data->last_found,
2358 SvCUR(data->last_found) - l);
2359 sv_catsv(data->last_found, last_str);
2361 SV * sv = data->last_found;
2363 SvUTF8(sv) && SvMAGICAL(sv) ?
2364 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2365 if (mg && mg->mg_len >= 0)
2366 mg->mg_len += CHR_SVLEN(last_str);
2368 data->last_end += l * (mincount - 1);
2371 /* start offset must point into the last copy */
2372 data->last_start_min += minnext * (mincount - 1);
2373 data->last_start_max += is_inf ? I32_MAX
2374 : (maxcount - 1) * (minnext + data->pos_delta);
2377 /* It is counted once already... */
2378 data->pos_min += minnext * (mincount - counted);
2379 data->pos_delta += - counted * deltanext +
2380 (minnext + deltanext) * maxcount - minnext * mincount;
2381 if (mincount != maxcount) {
2382 /* Cannot extend fixed substrings found inside
2384 scan_commit(pRExC_state,data);
2385 if (mincount && last_str) {
2386 sv_setsv(data->last_found, last_str);
2387 data->last_end = data->pos_min;
2388 data->last_start_min =
2389 data->pos_min - CHR_SVLEN(last_str);
2390 data->last_start_max = is_inf
2392 : data->pos_min + data->pos_delta
2393 - CHR_SVLEN(last_str);
2395 data->longest = &(data->longest_float);
2397 SvREFCNT_dec(last_str);
2399 if (data && (fl & SF_HAS_EVAL))
2400 data->flags |= SF_HAS_EVAL;
2401 optimize_curly_tail:
2402 if (OP(oscan) != CURLYX) {
2403 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2405 NEXT_OFF(oscan) += NEXT_OFF(next);
2408 default: /* REF and CLUMP only? */
2409 if (flags & SCF_DO_SUBSTR) {
2410 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2411 data->longest = &(data->longest_float);
2413 is_inf = is_inf_internal = 1;
2414 if (flags & SCF_DO_STCLASS_OR)
2415 cl_anything(pRExC_state, data->start_class);
2416 flags &= ~SCF_DO_STCLASS;
2420 else if (strchr((const char*)PL_simple,OP(scan))) {
2423 if (flags & SCF_DO_SUBSTR) {
2424 scan_commit(pRExC_state,data);
2428 if (flags & SCF_DO_STCLASS) {
2429 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2431 /* Some of the logic below assumes that switching
2432 locale on will only add false positives. */
2433 switch (PL_regkind[(U8)OP(scan)]) {
2437 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2438 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2439 cl_anything(pRExC_state, data->start_class);
2442 if (OP(scan) == SANY)
2444 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2445 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2446 || (data->start_class->flags & ANYOF_CLASS));
2447 cl_anything(pRExC_state, data->start_class);
2449 if (flags & SCF_DO_STCLASS_AND || !value)
2450 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2453 if (flags & SCF_DO_STCLASS_AND)
2454 cl_and(data->start_class,
2455 (struct regnode_charclass_class*)scan);
2457 cl_or(pRExC_state, data->start_class,
2458 (struct regnode_charclass_class*)scan);
2461 if (flags & SCF_DO_STCLASS_AND) {
2462 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2464 for (value = 0; value < 256; value++)
2465 if (!isALNUM(value))
2466 ANYOF_BITMAP_CLEAR(data->start_class, value);
2470 if (data->start_class->flags & ANYOF_LOCALE)
2471 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2473 for (value = 0; value < 256; value++)
2475 ANYOF_BITMAP_SET(data->start_class, value);
2480 if (flags & SCF_DO_STCLASS_AND) {
2481 if (data->start_class->flags & ANYOF_LOCALE)
2482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2485 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2486 data->start_class->flags |= ANYOF_LOCALE;
2490 if (flags & SCF_DO_STCLASS_AND) {
2491 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2493 for (value = 0; value < 256; value++)
2495 ANYOF_BITMAP_CLEAR(data->start_class, value);
2499 if (data->start_class->flags & ANYOF_LOCALE)
2500 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2502 for (value = 0; value < 256; value++)
2503 if (!isALNUM(value))
2504 ANYOF_BITMAP_SET(data->start_class, value);
2509 if (flags & SCF_DO_STCLASS_AND) {
2510 if (data->start_class->flags & ANYOF_LOCALE)
2511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2514 data->start_class->flags |= ANYOF_LOCALE;
2515 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2519 if (flags & SCF_DO_STCLASS_AND) {
2520 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2521 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2522 for (value = 0; value < 256; value++)
2523 if (!isSPACE(value))
2524 ANYOF_BITMAP_CLEAR(data->start_class, value);
2528 if (data->start_class->flags & ANYOF_LOCALE)
2529 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2531 for (value = 0; value < 256; value++)
2533 ANYOF_BITMAP_SET(data->start_class, value);
2538 if (flags & SCF_DO_STCLASS_AND) {
2539 if (data->start_class->flags & ANYOF_LOCALE)
2540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2543 data->start_class->flags |= ANYOF_LOCALE;
2544 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2548 if (flags & SCF_DO_STCLASS_AND) {
2549 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2550 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2551 for (value = 0; value < 256; value++)
2553 ANYOF_BITMAP_CLEAR(data->start_class, value);
2557 if (data->start_class->flags & ANYOF_LOCALE)
2558 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2560 for (value = 0; value < 256; value++)
2561 if (!isSPACE(value))
2562 ANYOF_BITMAP_SET(data->start_class, value);
2567 if (flags & SCF_DO_STCLASS_AND) {
2568 if (data->start_class->flags & ANYOF_LOCALE) {
2569 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2570 for (value = 0; value < 256; value++)
2571 if (!isSPACE(value))
2572 ANYOF_BITMAP_CLEAR(data->start_class, value);
2576 data->start_class->flags |= ANYOF_LOCALE;
2577 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2581 if (flags & SCF_DO_STCLASS_AND) {
2582 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2583 for (value = 0; value < 256; value++)
2584 if (!isDIGIT(value))
2585 ANYOF_BITMAP_CLEAR(data->start_class, value);
2588 if (data->start_class->flags & ANYOF_LOCALE)
2589 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2591 for (value = 0; value < 256; value++)
2593 ANYOF_BITMAP_SET(data->start_class, value);
2598 if (flags & SCF_DO_STCLASS_AND) {
2599 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2600 for (value = 0; value < 256; value++)
2602 ANYOF_BITMAP_CLEAR(data->start_class, value);
2605 if (data->start_class->flags & ANYOF_LOCALE)
2606 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2608 for (value = 0; value < 256; value++)
2609 if (!isDIGIT(value))
2610 ANYOF_BITMAP_SET(data->start_class, value);
2615 if (flags & SCF_DO_STCLASS_OR)
2616 cl_and(data->start_class, &and_with);
2617 flags &= ~SCF_DO_STCLASS;
2620 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2621 data->flags |= (OP(scan) == MEOL
2625 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2626 /* Lookbehind, or need to calculate parens/evals/stclass: */
2627 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2628 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2629 /* Lookahead/lookbehind */
2630 I32 deltanext, minnext, fake = 0;
2632 struct regnode_charclass_class intrnl;
2635 data_fake.flags = 0;
2637 data_fake.whilem_c = data->whilem_c;
2638 data_fake.last_closep = data->last_closep;
2641 data_fake.last_closep = &fake;
2642 if ( flags & SCF_DO_STCLASS && !scan->flags
2643 && OP(scan) == IFMATCH ) { /* Lookahead */
2644 cl_init(pRExC_state, &intrnl);
2645 data_fake.start_class = &intrnl;
2646 f |= SCF_DO_STCLASS_AND;
2648 if (flags & SCF_WHILEM_VISITED_POS)
2649 f |= SCF_WHILEM_VISITED_POS;
2650 next = regnext(scan);
2651 nscan = NEXTOPER(NEXTOPER(scan));
2652 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2655 vFAIL("Variable length lookbehind not implemented");
2657 else if (minnext > U8_MAX) {
2658 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2660 scan->flags = (U8)minnext;
2662 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2664 if (data && (data_fake.flags & SF_HAS_EVAL))
2665 data->flags |= SF_HAS_EVAL;
2667 data->whilem_c = data_fake.whilem_c;
2668 if (f & SCF_DO_STCLASS_AND) {
2669 const int was = (data->start_class->flags & ANYOF_EOS);
2671 cl_and(data->start_class, &intrnl);
2673 data->start_class->flags |= ANYOF_EOS;
2676 else if (OP(scan) == OPEN) {
2679 else if (OP(scan) == CLOSE) {
2680 if ((I32)ARG(scan) == is_par) {
2681 next = regnext(scan);
2683 if ( next && (OP(next) != WHILEM) && next < last)
2684 is_par = 0; /* Disable optimization */
2687 *(data->last_closep) = ARG(scan);
2689 else if (OP(scan) == EVAL) {
2691 data->flags |= SF_HAS_EVAL;
2693 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2694 if (flags & SCF_DO_SUBSTR) {
2695 scan_commit(pRExC_state,data);
2696 data->longest = &(data->longest_float);
2698 is_inf = is_inf_internal = 1;
2699 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2700 cl_anything(pRExC_state, data->start_class);
2701 flags &= ~SCF_DO_STCLASS;
2703 /* Else: zero-length, ignore. */
2704 scan = regnext(scan);
2709 *deltap = is_inf_internal ? I32_MAX : delta;
2710 if (flags & SCF_DO_SUBSTR && is_inf)
2711 data->pos_delta = I32_MAX - data->pos_min;
2712 if (is_par > U8_MAX)
2714 if (is_par && pars==1 && data) {
2715 data->flags |= SF_IN_PAR;
2716 data->flags &= ~SF_HAS_PAR;
2718 else if (pars && data) {
2719 data->flags |= SF_HAS_PAR;
2720 data->flags &= ~SF_IN_PAR;
2722 if (flags & SCF_DO_STCLASS_OR)
2723 cl_and(data->start_class, &and_with);
2728 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2730 if (RExC_rx->data) {
2731 Renewc(RExC_rx->data,
2732 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2733 char, struct reg_data);
2734 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2735 RExC_rx->data->count += n;
2738 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2739 char, struct reg_data);
2740 Newx(RExC_rx->data->what, n, U8);
2741 RExC_rx->data->count = n;
2743 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2744 return RExC_rx->data->count - n;
2748 Perl_reginitcolors(pTHX)
2751 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2753 char *t = savepv(s);
2757 t = strchr(t, '\t');
2763 PL_colors[i] = t = (char *)"";
2768 PL_colors[i++] = (char *)"";
2775 - pregcomp - compile a regular expression into internal code
2777 * We can't allocate space until we know how big the compiled form will be,
2778 * but we can't compile it (and thus know how big it is) until we've got a
2779 * place to put the code. So we cheat: we compile it twice, once with code
2780 * generation turned off and size counting turned on, and once "for real".
2781 * This also means that we don't allocate space until we are sure that the
2782 * thing really will compile successfully, and we never have to move the
2783 * code and thus invalidate pointers into it. (Note that it has to be in
2784 * one piece because free() must be able to free it all.) [NB: not true in perl]
2786 * Beware that the optimization-preparation code in here knows about some
2787 * of the structure of the compiled regexp. [I'll say.]
2790 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2801 RExC_state_t RExC_state;
2802 RExC_state_t *pRExC_state = &RExC_state;
2804 GET_RE_DEBUG_FLAGS_DECL;
2807 FAIL("NULL regexp argument");
2809 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2812 DEBUG_r(if (!PL_colorset) reginitcolors());
2814 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2815 PL_colors[4],PL_colors[5],PL_colors[0],
2816 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2818 RExC_flags = pm->op_pmflags;
2822 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2823 RExC_seen_evals = 0;
2826 /* First pass: determine size, legality. */
2833 RExC_emit = &PL_regdummy;
2834 RExC_whilem_seen = 0;
2835 #if 0 /* REGC() is (currently) a NOP at the first pass.
2836 * Clever compilers notice this and complain. --jhi */
2837 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2839 if (reg(pRExC_state, 0, &flags) == NULL) {
2840 RExC_precomp = NULL;
2843 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2845 /* Small enough for pointer-storage convention?
2846 If extralen==0, this means that we will not need long jumps. */
2847 if (RExC_size >= 0x10000L && RExC_extralen)
2848 RExC_size += RExC_extralen;
2851 if (RExC_whilem_seen > 15)
2852 RExC_whilem_seen = 15;
2854 /* Allocate space and initialize. */
2855 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2858 FAIL("Regexp out of space");
2861 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2862 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2865 r->prelen = xend - exp;
2866 r->precomp = savepvn(RExC_precomp, r->prelen);
2868 #ifdef PERL_OLD_COPY_ON_WRITE
2869 r->saved_copy = NULL;
2871 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2872 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2873 r->lastparen = 0; /* mg.c reads this. */
2875 r->substrs = 0; /* Useful during FAIL. */
2876 r->startp = 0; /* Useful during FAIL. */
2877 r->endp = 0; /* Useful during FAIL. */
2879 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2881 r->offsets[0] = RExC_size;
2883 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2884 "%s %"UVuf" bytes for offset annotations.\n",
2885 r->offsets ? "Got" : "Couldn't get",
2886 (UV)((2*RExC_size+1) * sizeof(U32))));
2890 /* Second pass: emit code. */
2891 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2896 RExC_emit_start = r->program;
2897 RExC_emit = r->program;
2898 /* Store the count of eval-groups for security checks: */
2899 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2900 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2902 if (reg(pRExC_state, 0, &flags) == NULL)
2906 /* Dig out information for optimizations. */
2907 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2908 pm->op_pmflags = RExC_flags;
2910 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2911 r->regstclass = NULL;
2912 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2913 r->reganch |= ROPT_NAUGHTY;
2914 scan = r->program + 1; /* First BRANCH. */
2916 /* XXXX To minimize changes to RE engine we always allocate
2917 3-units-long substrs field. */
2918 Newxz(r->substrs, 1, struct reg_substr_data);
2920 StructCopy(&zero_scan_data, &data, scan_data_t);
2921 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2922 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2924 STRLEN longest_float_length, longest_fixed_length;
2925 struct regnode_charclass_class ch_class;
2930 /* Skip introductions and multiplicators >= 1. */
2931 while ((OP(first) == OPEN && (sawopen = 1)) ||
2932 /* An OR of *one* alternative - should not happen now. */
2933 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2934 (OP(first) == PLUS) ||
2935 (OP(first) == MINMOD) ||
2936 /* An {n,m} with n>0 */
2937 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2938 if (OP(first) == PLUS)
2941 first += regarglen[(U8)OP(first)];
2942 first = NEXTOPER(first);
2945 /* Starting-point info. */
2947 if (PL_regkind[(U8)OP(first)] == EXACT) {
2948 if (OP(first) == EXACT)
2949 /*EMPTY*/; /* Empty, get anchored substr later. */
2950 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2951 r->regstclass = first;
2953 else if (strchr((const char*)PL_simple,OP(first)))
2954 r->regstclass = first;
2955 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2956 PL_regkind[(U8)OP(first)] == NBOUND)
2957 r->regstclass = first;
2958 else if (PL_regkind[(U8)OP(first)] == BOL) {
2959 r->reganch |= (OP(first) == MBOL
2961 : (OP(first) == SBOL
2964 first = NEXTOPER(first);
2967 else if (OP(first) == GPOS) {
2968 r->reganch |= ROPT_ANCH_GPOS;
2969 first = NEXTOPER(first);
2972 else if (!sawopen && (OP(first) == STAR &&
2973 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2974 !(r->reganch & ROPT_ANCH) )
2976 /* turn .* into ^.* with an implied $*=1 */
2978 (OP(NEXTOPER(first)) == REG_ANY)
2981 r->reganch |= type | ROPT_IMPLICIT;
2982 first = NEXTOPER(first);
2985 if (sawplus && (!sawopen || !RExC_sawback)
2986 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
2987 /* x+ must match at the 1st pos of run of x's */
2988 r->reganch |= ROPT_SKIP;
2990 /* Scan is after the zeroth branch, first is atomic matcher. */
2991 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
2992 (IV)(first - scan + 1)));
2994 * If there's something expensive in the r.e., find the
2995 * longest literal string that must appear and make it the
2996 * regmust. Resolve ties in favor of later strings, since
2997 * the regstart check works with the beginning of the r.e.
2998 * and avoiding duplication strengthens checking. Not a
2999 * strong reason, but sufficient in the absence of others.
3000 * [Now we resolve ties in favor of the earlier string if
3001 * it happens that c_offset_min has been invalidated, since the
3002 * earlier string may buy us something the later one won't.]
3006 data.longest_fixed = newSVpvs("");
3007 data.longest_float = newSVpvs("");
3008 data.last_found = newSVpvs("");
3009 data.longest = &(data.longest_fixed);
3011 if (!r->regstclass) {
3012 cl_init(pRExC_state, &ch_class);
3013 data.start_class = &ch_class;
3014 stclass_flag = SCF_DO_STCLASS_AND;
3015 } else /* XXXX Check for BOUND? */
3017 data.last_closep = &last_close;
3019 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3020 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3021 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3022 && data.last_start_min == 0 && data.last_end > 0
3023 && !RExC_seen_zerolen
3024 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3025 r->reganch |= ROPT_CHECK_ALL;
3026 scan_commit(pRExC_state, &data);
3027 SvREFCNT_dec(data.last_found);
3029 longest_float_length = CHR_SVLEN(data.longest_float);
3030 if (longest_float_length
3031 || (data.flags & SF_FL_BEFORE_EOL
3032 && (!(data.flags & SF_FL_BEFORE_MEOL)
3033 || (RExC_flags & PMf_MULTILINE)))) {
3036 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3037 && data.offset_fixed == data.offset_float_min
3038 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3039 goto remove_float; /* As in (a)+. */
3041 if (SvUTF8(data.longest_float)) {
3042 r->float_utf8 = data.longest_float;
3043 r->float_substr = NULL;
3045 r->float_substr = data.longest_float;
3046 r->float_utf8 = NULL;
3048 r->float_min_offset = data.offset_float_min;
3049 r->float_max_offset = data.offset_float_max;
3050 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3051 && (!(data.flags & SF_FL_BEFORE_MEOL)
3052 || (RExC_flags & PMf_MULTILINE)));
3053 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3057 r->float_substr = r->float_utf8 = NULL;
3058 SvREFCNT_dec(data.longest_float);
3059 longest_float_length = 0;
3062 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3063 if (longest_fixed_length
3064 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3065 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3066 || (RExC_flags & PMf_MULTILINE)))) {
3069 if (SvUTF8(data.longest_fixed)) {
3070 r->anchored_utf8 = data.longest_fixed;
3071 r->anchored_substr = NULL;
3073 r->anchored_substr = data.longest_fixed;
3074 r->anchored_utf8 = NULL;
3076 r->anchored_offset = data.offset_fixed;
3077 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3078 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3079 || (RExC_flags & PMf_MULTILINE)));
3080 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3083 r->anchored_substr = r->anchored_utf8 = NULL;
3084 SvREFCNT_dec(data.longest_fixed);
3085 longest_fixed_length = 0;
3088 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3089 r->regstclass = NULL;
3090 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3092 && !(data.start_class->flags & ANYOF_EOS)
3093 && !cl_is_anything(data.start_class))
3095 const I32 n = add_data(pRExC_state, 1, "f");
3097 Newx(RExC_rx->data->data[n], 1,
3098 struct regnode_charclass_class);
3099 StructCopy(data.start_class,
3100 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3101 struct regnode_charclass_class);
3102 r->regstclass = (regnode*)RExC_rx->data->data[n];
3103 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3104 PL_regdata = r->data; /* for regprop() */
3105 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3106 regprop(sv, (regnode*)data.start_class);
3107 PerlIO_printf(Perl_debug_log,
3108 "synthetic stclass \"%s\".\n",
3109 SvPVX_const(sv));});
3112 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3113 if (longest_fixed_length > longest_float_length) {
3114 r->check_substr = r->anchored_substr;
3115 r->check_utf8 = r->anchored_utf8;
3116 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3117 if (r->reganch & ROPT_ANCH_SINGLE)
3118 r->reganch |= ROPT_NOSCAN;
3121 r->check_substr = r->float_substr;
3122 r->check_utf8 = r->float_utf8;
3123 r->check_offset_min = data.offset_float_min;
3124 r->check_offset_max = data.offset_float_max;
3126 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3127 This should be changed ASAP! */
3128 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3129 r->reganch |= RE_USE_INTUIT;
3130 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3131 r->reganch |= RE_INTUIT_TAIL;
3135 /* Several toplevels. Best we can is to set minlen. */
3137 struct regnode_charclass_class ch_class;
3140 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3141 scan = r->program + 1;
3142 cl_init(pRExC_state, &ch_class);
3143 data.start_class = &ch_class;
3144 data.last_closep = &last_close;
3145 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3146 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3147 = r->float_substr = r->float_utf8 = NULL;
3148 if (!(data.start_class->flags & ANYOF_EOS)
3149 && !cl_is_anything(data.start_class))
3151 const I32 n = add_data(pRExC_state, 1, "f");
3153 Newx(RExC_rx->data->data[n], 1,
3154 struct regnode_charclass_class);
3155 StructCopy(data.start_class,
3156 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3157 struct regnode_charclass_class);
3158 r->regstclass = (regnode*)RExC_rx->data->data[n];
3159 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3160 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3161 regprop(sv, (regnode*)data.start_class);
3162 PerlIO_printf(Perl_debug_log,
3163 "synthetic stclass \"%s\".\n",
3164 SvPVX_const(sv));});
3169 if (RExC_seen & REG_SEEN_GPOS)
3170 r->reganch |= ROPT_GPOS_SEEN;
3171 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3172 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3173 if (RExC_seen & REG_SEEN_EVAL)
3174 r->reganch |= ROPT_EVAL_SEEN;
3175 if (RExC_seen & REG_SEEN_CANY)
3176 r->reganch |= ROPT_CANY_SEEN;
3177 Newxz(r->startp, RExC_npar, I32);
3178 Newxz(r->endp, RExC_npar, I32);
3179 PL_regdata = r->data; /* for regprop() */
3180 DEBUG_COMPILE_r(regdump(r));
3185 - reg - regular expression, i.e. main body or parenthesized thing
3187 * Caller must absorb opening parenthesis.
3189 * Combining parenthesis handling with the base level of regular expression
3190 * is a trifle forced, but the need to tie the tails of the branches to what
3191 * follows makes it hard to avoid.
3194 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3195 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3198 register regnode *ret; /* Will be the head of the group. */
3199 register regnode *br;
3200 register regnode *lastbr;
3201 register regnode *ender = NULL;
3202 register I32 parno = 0;
3204 const I32 oregflags = RExC_flags;
3205 bool have_branch = 0;
3208 /* for (?g), (?gc), and (?o) warnings; warning
3209 about (?c) will warn about (?g) -- japhy */
3211 #define WASTED_O 0x01
3212 #define WASTED_G 0x02
3213 #define WASTED_C 0x04
3214 #define WASTED_GC (0x02|0x04)
3215 I32 wastedflags = 0x00;
3217 char * parse_start = RExC_parse; /* MJD */
3218 char * const oregcomp_parse = RExC_parse;
3220 *flagp = 0; /* Tentatively. */
3223 /* Make an OPEN node, if parenthesized. */
3225 if (*RExC_parse == '?') { /* (?...) */
3226 U32 posflags = 0, negflags = 0;
3227 U32 *flagsp = &posflags;
3228 bool is_logical = 0;
3229 const char * const seqstart = RExC_parse;
3232 paren = *RExC_parse++;
3233 ret = NULL; /* For look-ahead/behind. */
3235 case '<': /* (?<...) */
3236 RExC_seen |= REG_SEEN_LOOKBEHIND;
3237 if (*RExC_parse == '!')
3239 if (*RExC_parse != '=' && *RExC_parse != '!')
3242 case '=': /* (?=...) */
3243 case '!': /* (?!...) */
3244 RExC_seen_zerolen++;
3245 case ':': /* (?:...) */
3246 case '>': /* (?>...) */
3248 case '$': /* (?$...) */
3249 case '@': /* (?@...) */
3250 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3252 case '#': /* (?#...) */
3253 while (*RExC_parse && *RExC_parse != ')')
3255 if (*RExC_parse != ')')
3256 FAIL("Sequence (?#... not terminated");
3257 nextchar(pRExC_state);
3260 case 'p': /* (?p...) */
3261 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3262 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3264 case '?': /* (??...) */
3266 if (*RExC_parse != '{')
3268 paren = *RExC_parse++;
3270 case '{': /* (?{...}) */
3272 I32 count = 1, n = 0;
3274 char *s = RExC_parse;
3276 RExC_seen_zerolen++;
3277 RExC_seen |= REG_SEEN_EVAL;
3278 while (count && (c = *RExC_parse)) {
3289 if (*RExC_parse != ')') {
3291 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3295 OP_4tree *sop, *rop;
3296 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3299 Perl_save_re_context(aTHX);
3300 rop = sv_compile_2op(sv, &sop, "re", &pad);
3301 sop->op_private |= OPpREFCOUNTED;
3302 /* re_dup will OpREFCNT_inc */
3303 OpREFCNT_set(sop, 1);
3306 n = add_data(pRExC_state, 3, "nop");
3307 RExC_rx->data->data[n] = (void*)rop;
3308 RExC_rx->data->data[n+1] = (void*)sop;
3309 RExC_rx->data->data[n+2] = (void*)pad;
3312 else { /* First pass */
3313 if (PL_reginterp_cnt < ++RExC_seen_evals
3315 /* No compiled RE interpolated, has runtime
3316 components ===> unsafe. */
3317 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3318 if (PL_tainting && PL_tainted)
3319 FAIL("Eval-group in insecure regular expression");
3320 if (IN_PERL_COMPILETIME)
3324 nextchar(pRExC_state);
3326 ret = reg_node(pRExC_state, LOGICAL);
3329 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3330 /* deal with the length of this later - MJD */
3333 ret = reganode(pRExC_state, EVAL, n);
3334 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3335 Set_Node_Offset(ret, parse_start);
3338 case '(': /* (?(?{...})...) and (?(?=...)...) */
3340 if (RExC_parse[0] == '?') { /* (?(?...)) */
3341 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3342 || RExC_parse[1] == '<'
3343 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3346 ret = reg_node(pRExC_state, LOGICAL);
3349 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3353 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3356 parno = atoi(RExC_parse++);
3358 while (isDIGIT(*RExC_parse))
3360 ret = reganode(pRExC_state, GROUPP, parno);
3362 if ((c = *nextchar(pRExC_state)) != ')')
3363 vFAIL("Switch condition not recognized");
3365 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3366 br = regbranch(pRExC_state, &flags, 1);
3368 br = reganode(pRExC_state, LONGJMP, 0);
3370 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3371 c = *nextchar(pRExC_state);
3375 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3376 regbranch(pRExC_state, &flags, 1);
3377 regtail(pRExC_state, ret, lastbr);
3380 c = *nextchar(pRExC_state);
3385 vFAIL("Switch (?(condition)... contains too many branches");
3386 ender = reg_node(pRExC_state, TAIL);
3387 regtail(pRExC_state, br, ender);
3389 regtail(pRExC_state, lastbr, ender);
3390 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3393 regtail(pRExC_state, ret, ender);
3397 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3401 RExC_parse--; /* for vFAIL to print correctly */
3402 vFAIL("Sequence (? incomplete");
3406 parse_flags: /* (?i) */
3407 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3408 /* (?g), (?gc) and (?o) are useless here
3409 and must be globally applied -- japhy */
3411 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3412 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3413 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3414 if (! (wastedflags & wflagbit) ) {
3415 wastedflags |= wflagbit;
3418 "Useless (%s%c) - %suse /%c modifier",
3419 flagsp == &negflags ? "?-" : "?",
3421 flagsp == &negflags ? "don't " : "",
3427 else if (*RExC_parse == 'c') {
3428 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3429 if (! (wastedflags & WASTED_C) ) {
3430 wastedflags |= WASTED_GC;
3433 "Useless (%sc) - %suse /gc modifier",
3434 flagsp == &negflags ? "?-" : "?",
3435 flagsp == &negflags ? "don't " : ""
3440 else { pmflag(flagsp, *RExC_parse); }
3444 if (*RExC_parse == '-') {
3446 wastedflags = 0; /* reset so (?g-c) warns twice */
3450 RExC_flags |= posflags;
3451 RExC_flags &= ~negflags;
3452 if (*RExC_parse == ':') {
3458 if (*RExC_parse != ')') {
3460 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3462 nextchar(pRExC_state);
3470 ret = reganode(pRExC_state, OPEN, parno);
3471 Set_Node_Length(ret, 1); /* MJD */
3472 Set_Node_Offset(ret, RExC_parse); /* MJD */
3479 /* Pick up the branches, linking them together. */
3480 parse_start = RExC_parse; /* MJD */
3481 br = regbranch(pRExC_state, &flags, 1);
3482 /* branch_len = (paren != 0); */
3486 if (*RExC_parse == '|') {
3487 if (!SIZE_ONLY && RExC_extralen) {
3488 reginsert(pRExC_state, BRANCHJ, br);
3491 reginsert(pRExC_state, BRANCH, br);
3492 Set_Node_Length(br, paren != 0);
3493 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3497 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3499 else if (paren == ':') {
3500 *flagp |= flags&SIMPLE;
3502 if (is_open) { /* Starts with OPEN. */
3503 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3505 else if (paren != '?') /* Not Conditional */
3507 *flagp |= flags & (SPSTART | HASWIDTH);
3509 while (*RExC_parse == '|') {
3510 if (!SIZE_ONLY && RExC_extralen) {
3511 ender = reganode(pRExC_state, LONGJMP,0);
3512 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3515 RExC_extralen += 2; /* Account for LONGJMP. */
3516 nextchar(pRExC_state);
3517 br = regbranch(pRExC_state, &flags, 0);
3521 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3525 *flagp |= flags&SPSTART;
3528 if (have_branch || paren != ':') {
3529 /* Make a closing node, and hook it on the end. */
3532 ender = reg_node(pRExC_state, TAIL);
3535 ender = reganode(pRExC_state, CLOSE, parno);
3536 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3537 Set_Node_Length(ender,1); /* MJD */
3543 *flagp &= ~HASWIDTH;
3546 ender = reg_node(pRExC_state, SUCCEED);
3549 ender = reg_node(pRExC_state, END);
3552 regtail(pRExC_state, lastbr, ender);
3555 /* Hook the tails of the branches to the closing node. */
3556 for (br = ret; br != NULL; br = regnext(br)) {
3557 regoptail(pRExC_state, br, ender);
3564 static const char parens[] = "=!<,>";
3566 if (paren && (p = strchr(parens, paren))) {
3567 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3568 int flag = (p - parens) > 1;
3571 node = SUSPEND, flag = 0;
3572 reginsert(pRExC_state, node,ret);
3573 Set_Node_Cur_Length(ret);
3574 Set_Node_Offset(ret, parse_start + 1);
3576 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3580 /* Check for proper termination. */
3582 RExC_flags = oregflags;
3583 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3584 RExC_parse = oregcomp_parse;
3585 vFAIL("Unmatched (");
3588 else if (!paren && RExC_parse < RExC_end) {
3589 if (*RExC_parse == ')') {
3591 vFAIL("Unmatched )");
3594 FAIL("Junk on end of regexp"); /* "Can't happen". */
3602 - regbranch - one alternative of an | operator
3604 * Implements the concatenation operator.
3607 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3610 register regnode *ret;
3611 register regnode *chain = NULL;
3612 register regnode *latest;
3613 I32 flags = 0, c = 0;
3618 if (!SIZE_ONLY && RExC_extralen)
3619 ret = reganode(pRExC_state, BRANCHJ,0);
3621 ret = reg_node(pRExC_state, BRANCH);
3622 Set_Node_Length(ret, 1);
3626 if (!first && SIZE_ONLY)
3627 RExC_extralen += 1; /* BRANCHJ */
3629 *flagp = WORST; /* Tentatively. */
3632 nextchar(pRExC_state);
3633 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3635 latest = regpiece(pRExC_state, &flags);
3636 if (latest == NULL) {
3637 if (flags & TRYAGAIN)
3641 else if (ret == NULL)
3643 *flagp |= flags&HASWIDTH;
3644 if (chain == NULL) /* First piece. */
3645 *flagp |= flags&SPSTART;
3648 regtail(pRExC_state, chain, latest);
3653 if (chain == NULL) { /* Loop ran zero times. */
3654 chain = reg_node(pRExC_state, NOTHING);
3659 *flagp |= flags&SIMPLE;
3666 - regpiece - something followed by possible [*+?]
3668 * Note that the branching code sequences used for ? and the general cases
3669 * of * and + are somewhat optimized: they use the same NOTHING node as
3670 * both the endmarker for their branch list and the body of the last branch.
3671 * It might seem that this node could be dispensed with entirely, but the
3672 * endmarker role is not redundant.
3675 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3678 register regnode *ret;
3680 register char *next;
3682 const char * const origparse = RExC_parse;
3685 I32 max = REG_INFTY;
3688 ret = regatom(pRExC_state, &flags);
3690 if (flags & TRYAGAIN)
3697 if (op == '{' && regcurly(RExC_parse)) {
3698 parse_start = RExC_parse; /* MJD */
3699 next = RExC_parse + 1;
3701 while (isDIGIT(*next) || *next == ',') {
3710 if (*next == '}') { /* got one */
3714 min = atoi(RExC_parse);
3718 maxpos = RExC_parse;
3720 if (!max && *maxpos != '0')
3721 max = REG_INFTY; /* meaning "infinity" */
3722 else if (max >= REG_INFTY)
3723 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3725 nextchar(pRExC_state);
3728 if ((flags&SIMPLE)) {
3729 RExC_naughty += 2 + RExC_naughty / 2;
3730 reginsert(pRExC_state, CURLY, ret);
3731 Set_Node_Offset(ret, parse_start+1); /* MJD */
3732 Set_Node_Cur_Length(ret);
3735 regnode *w = reg_node(pRExC_state, WHILEM);
3738 regtail(pRExC_state, ret, w);
3739 if (!SIZE_ONLY && RExC_extralen) {
3740 reginsert(pRExC_state, LONGJMP,ret);
3741 reginsert(pRExC_state, NOTHING,ret);
3742 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3744 reginsert(pRExC_state, CURLYX,ret);
3746 Set_Node_Offset(ret, parse_start+1);
3747 Set_Node_Length(ret,
3748 op == '{' ? (RExC_parse - parse_start) : 1);
3750 if (!SIZE_ONLY && RExC_extralen)
3751 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3752 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3754 RExC_whilem_seen++, RExC_extralen += 3;
3755 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3763 if (max && max < min)
3764 vFAIL("Can't do {n,m} with n > m");
3766 ARG1_SET(ret, (U16)min);
3767 ARG2_SET(ret, (U16)max);
3779 #if 0 /* Now runtime fix should be reliable. */
3781 /* if this is reinstated, don't forget to put this back into perldiag:
3783 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3785 (F) The part of the regexp subject to either the * or + quantifier
3786 could match an empty string. The {#} shows in the regular
3787 expression about where the problem was discovered.
3791 if (!(flags&HASWIDTH) && op != '?')
3792 vFAIL("Regexp *+ operand could be empty");
3795 parse_start = RExC_parse;
3796 nextchar(pRExC_state);
3798 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3800 if (op == '*' && (flags&SIMPLE)) {
3801 reginsert(pRExC_state, STAR, ret);
3805 else if (op == '*') {
3809 else if (op == '+' && (flags&SIMPLE)) {
3810 reginsert(pRExC_state, PLUS, ret);
3814 else if (op == '+') {
3818 else if (op == '?') {
3823 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3825 "%.*s matches null string many times",
3826 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
3830 if (*RExC_parse == '?') {
3831 nextchar(pRExC_state);
3832 reginsert(pRExC_state, MINMOD, ret);
3833 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3835 if (ISMULT2(RExC_parse)) {
3837 vFAIL("Nested quantifiers");
3844 - regatom - the lowest level
3846 * Optimization: gobbles an entire sequence of ordinary characters so that
3847 * it can turn them into a single node, which is smaller to store and
3848 * faster to run. Backslashed characters are exceptions, each becoming a
3849 * separate node; the code is simpler that way and it's not worth fixing.
3851 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3853 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3856 register regnode *ret = NULL;
3858 char *parse_start = RExC_parse;
3860 *flagp = WORST; /* Tentatively. */
3863 switch (*RExC_parse) {
3865 RExC_seen_zerolen++;
3866 nextchar(pRExC_state);
3867 if (RExC_flags & PMf_MULTILINE)
3868 ret = reg_node(pRExC_state, MBOL);
3869 else if (RExC_flags & PMf_SINGLELINE)
3870 ret = reg_node(pRExC_state, SBOL);
3872 ret = reg_node(pRExC_state, BOL);
3873 Set_Node_Length(ret, 1); /* MJD */
3876 nextchar(pRExC_state);
3878 RExC_seen_zerolen++;
3879 if (RExC_flags & PMf_MULTILINE)
3880 ret = reg_node(pRExC_state, MEOL);
3881 else if (RExC_flags & PMf_SINGLELINE)
3882 ret = reg_node(pRExC_state, SEOL);
3884 ret = reg_node(pRExC_state, EOL);
3885 Set_Node_Length(ret, 1); /* MJD */
3888 nextchar(pRExC_state);
3889 if (RExC_flags & PMf_SINGLELINE)
3890 ret = reg_node(pRExC_state, SANY);
3892 ret = reg_node(pRExC_state, REG_ANY);
3893 *flagp |= HASWIDTH|SIMPLE;
3895 Set_Node_Length(ret, 1); /* MJD */
3899 char *oregcomp_parse = ++RExC_parse;
3900 ret = regclass(pRExC_state);
3901 if (*RExC_parse != ']') {
3902 RExC_parse = oregcomp_parse;
3903 vFAIL("Unmatched [");
3905 nextchar(pRExC_state);
3906 *flagp |= HASWIDTH|SIMPLE;
3907 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3911 nextchar(pRExC_state);
3912 ret = reg(pRExC_state, 1, &flags);
3914 if (flags & TRYAGAIN) {
3915 if (RExC_parse == RExC_end) {
3916 /* Make parent create an empty node if needed. */
3924 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3928 if (flags & TRYAGAIN) {
3932 vFAIL("Internal urp");
3933 /* Supposed to be caught earlier. */
3936 if (!regcurly(RExC_parse)) {
3945 vFAIL("Quantifier follows nothing");
3948 switch (*++RExC_parse) {
3950 RExC_seen_zerolen++;
3951 ret = reg_node(pRExC_state, SBOL);
3953 nextchar(pRExC_state);
3954 Set_Node_Length(ret, 2); /* MJD */
3957 ret = reg_node(pRExC_state, GPOS);
3958 RExC_seen |= REG_SEEN_GPOS;
3960 nextchar(pRExC_state);
3961 Set_Node_Length(ret, 2); /* MJD */
3964 ret = reg_node(pRExC_state, SEOL);
3966 RExC_seen_zerolen++; /* Do not optimize RE away */
3967 nextchar(pRExC_state);
3970 ret = reg_node(pRExC_state, EOS);
3972 RExC_seen_zerolen++; /* Do not optimize RE away */
3973 nextchar(pRExC_state);
3974 Set_Node_Length(ret, 2); /* MJD */
3977 ret = reg_node(pRExC_state, CANY);
3978 RExC_seen |= REG_SEEN_CANY;
3979 *flagp |= HASWIDTH|SIMPLE;
3980 nextchar(pRExC_state);
3981 Set_Node_Length(ret, 2); /* MJD */
3984 ret = reg_node(pRExC_state, CLUMP);
3986 nextchar(pRExC_state);
3987 Set_Node_Length(ret, 2); /* MJD */
3990 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
3991 *flagp |= HASWIDTH|SIMPLE;
3992 nextchar(pRExC_state);
3993 Set_Node_Length(ret, 2); /* MJD */
3996 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
3997 *flagp |= HASWIDTH|SIMPLE;
3998 nextchar(pRExC_state);
3999 Set_Node_Length(ret, 2); /* MJD */
4002 RExC_seen_zerolen++;
4003 RExC_seen |= REG_SEEN_LOOKBEHIND;
4004 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4006 nextchar(pRExC_state);
4007 Set_Node_Length(ret, 2); /* MJD */
4010 RExC_seen_zerolen++;
4011 RExC_seen |= REG_SEEN_LOOKBEHIND;
4012 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4014 nextchar(pRExC_state);
4015 Set_Node_Length(ret, 2); /* MJD */
4018 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4019 *flagp |= HASWIDTH|SIMPLE;
4020 nextchar(pRExC_state);
4021 Set_Node_Length(ret, 2); /* MJD */
4024 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4025 *flagp |= HASWIDTH|SIMPLE;
4026 nextchar(pRExC_state);
4027 Set_Node_Length(ret, 2); /* MJD */
4030 ret = reg_node(pRExC_state, DIGIT);
4031 *flagp |= HASWIDTH|SIMPLE;
4032 nextchar(pRExC_state);
4033 Set_Node_Length(ret, 2); /* MJD */
4036 ret = reg_node(pRExC_state, NDIGIT);
4037 *flagp |= HASWIDTH|SIMPLE;
4038 nextchar(pRExC_state);
4039 Set_Node_Length(ret, 2); /* MJD */
4044 char* oldregxend = RExC_end;
4045 char* parse_start = RExC_parse - 2;
4047 if (RExC_parse[1] == '{') {
4048 /* a lovely hack--pretend we saw [\pX] instead */
4049 RExC_end = strchr(RExC_parse, '}');
4051 U8 c = (U8)*RExC_parse;
4053 RExC_end = oldregxend;
4054 vFAIL2("Missing right brace on \\%c{}", c);
4059 RExC_end = RExC_parse + 2;
4060 if (RExC_end > oldregxend)
4061 RExC_end = oldregxend;
4065 ret = regclass(pRExC_state);
4067 RExC_end = oldregxend;
4070 Set_Node_Offset(ret, parse_start + 2);
4071 Set_Node_Cur_Length(ret);
4072 nextchar(pRExC_state);
4073 *flagp |= HASWIDTH|SIMPLE;
4086 case '1': case '2': case '3': case '4':
4087 case '5': case '6': case '7': case '8': case '9':
4089 const I32 num = atoi(RExC_parse);
4091 if (num > 9 && num >= RExC_npar)
4094 char * parse_start = RExC_parse - 1; /* MJD */
4095 while (isDIGIT(*RExC_parse))
4098 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4099 vFAIL("Reference to nonexistent group");
4101 ret = reganode(pRExC_state,
4102 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4106 /* override incorrect value set in reganode MJD */
4107 Set_Node_Offset(ret, parse_start+1);
4108 Set_Node_Cur_Length(ret); /* MJD */
4110 nextchar(pRExC_state);
4115 if (RExC_parse >= RExC_end)
4116 FAIL("Trailing \\");
4119 /* Do not generate "unrecognized" warnings here, we fall
4120 back into the quick-grab loop below */
4127 if (RExC_flags & PMf_EXTENDED) {
4128 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4129 if (RExC_parse < RExC_end)
4135 register STRLEN len;
4140 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4142 parse_start = RExC_parse - 1;
4148 ret = reg_node(pRExC_state,
4149 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4151 for (len = 0, p = RExC_parse - 1;
4152 len < 127 && p < RExC_end;
4157 if (RExC_flags & PMf_EXTENDED)
4158 p = regwhite(p, RExC_end);
4205 ender = ASCII_TO_NATIVE('\033');
4209 ender = ASCII_TO_NATIVE('\007');
4214 char* const e = strchr(p, '}');
4218 vFAIL("Missing right brace on \\x{}");
4221 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4222 | PERL_SCAN_DISALLOW_PREFIX;
4223 STRLEN numlen = e - p - 1;
4224 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4231 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4233 ender = grok_hex(p, &numlen, &flags, NULL);
4239 ender = UCHARAT(p++);
4240 ender = toCTRL(ender);
4242 case '0': case '1': case '2': case '3':case '4':
4243 case '5': case '6': case '7': case '8':case '9':
4245 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4248 ender = grok_oct(p, &numlen, &flags, NULL);
4258 FAIL("Trailing \\");
4261 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4262 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4263 goto normal_default;
4268 if (UTF8_IS_START(*p) && UTF) {
4270 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4278 if (RExC_flags & PMf_EXTENDED)
4279 p = regwhite(p, RExC_end);
4281 /* Prime the casefolded buffer. */
4282 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4284 if (ISMULT2(p)) { /* Back off on ?+*. */
4291 /* Emit all the Unicode characters. */
4293 for (foldbuf = tmpbuf;
4295 foldlen -= numlen) {
4296 ender = utf8_to_uvchr(foldbuf, &numlen);
4298 reguni(pRExC_state, ender, s, &unilen);
4301 /* In EBCDIC the numlen
4302 * and unilen can differ. */
4304 if (numlen >= foldlen)
4308 break; /* "Can't happen." */
4312 reguni(pRExC_state, ender, s, &unilen);
4321 REGC((char)ender, s++);
4329 /* Emit all the Unicode characters. */
4331 for (foldbuf = tmpbuf;
4333 foldlen -= numlen) {
4334 ender = utf8_to_uvchr(foldbuf, &numlen);
4336 reguni(pRExC_state, ender, s, &unilen);
4339 /* In EBCDIC the numlen
4340 * and unilen can differ. */
4342 if (numlen >= foldlen)
4350 reguni(pRExC_state, ender, s, &unilen);
4359 REGC((char)ender, s++);
4363 Set_Node_Cur_Length(ret); /* MJD */
4364 nextchar(pRExC_state);
4366 /* len is STRLEN which is unsigned, need to copy to signed */
4369 vFAIL("Internal disaster");
4373 if (len == 1 && UNI_IS_INVARIANT(ender))
4378 RExC_size += STR_SZ(len);
4380 RExC_emit += STR_SZ(len);
4385 /* If the encoding pragma is in effect recode the text of
4386 * any EXACT-kind nodes. */
4387 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4388 STRLEN oldlen = STR_LEN(ret);
4389 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4393 if (sv_utf8_downgrade(sv, TRUE)) {
4394 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4395 const STRLEN newlen = SvCUR(sv);
4400 GET_RE_DEBUG_FLAGS_DECL;
4401 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4402 (int)oldlen, STRING(ret),
4404 Copy(s, STRING(ret), newlen, char);
4405 STR_LEN(ret) += newlen - oldlen;
4406 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4408 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4416 S_regwhite(char *p, const char *e)
4421 else if (*p == '#') {
4424 } while (p < e && *p != '\n');
4432 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4433 Character classes ([:foo:]) can also be negated ([:^foo:]).
4434 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4435 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4436 but trigger failures because they are currently unimplemented. */
4438 #define POSIXCC_DONE(c) ((c) == ':')
4439 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4440 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4443 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4446 I32 namedclass = OOB_NAMEDCLASS;
4448 if (value == '[' && RExC_parse + 1 < RExC_end &&
4449 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4450 POSIXCC(UCHARAT(RExC_parse))) {
4451 const char c = UCHARAT(RExC_parse);
4452 char* s = RExC_parse++;
4454 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4456 if (RExC_parse == RExC_end)
4457 /* Grandfather lone [:, [=, [. */
4460 const char* t = RExC_parse++; /* skip over the c */
4461 const char *posixcc;
4465 if (UCHARAT(RExC_parse) == ']') {
4466 RExC_parse++; /* skip over the ending ] */
4469 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4470 const I32 skip = t - posixcc;
4472 /* Initially switch on the length of the name. */
4475 if (memEQ(posixcc, "word", 4)) {
4476 /* this is not POSIX, this is the Perl \w */;
4478 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4482 /* Names all of length 5. */
4483 /* alnum alpha ascii blank cntrl digit graph lower
4484 print punct space upper */
4485 /* Offset 4 gives the best switch position. */
4486 switch (posixcc[4]) {
4488 if (memEQ(posixcc, "alph", 4)) {
4491 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4495 if (memEQ(posixcc, "spac", 4)) {
4498 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4502 if (memEQ(posixcc, "grap", 4)) {
4505 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4509 if (memEQ(posixcc, "asci", 4)) {
4512 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4516 if (memEQ(posixcc, "blan", 4)) {
4519 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4523 if (memEQ(posixcc, "cntr", 4)) {
4526 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4530 if (memEQ(posixcc, "alnu", 4)) {
4533 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4537 if (memEQ(posixcc, "lowe", 4)) {
4540 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4542 if (memEQ(posixcc, "uppe", 4)) {
4545 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4549 if (memEQ(posixcc, "digi", 4)) {
4552 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4554 if (memEQ(posixcc, "prin", 4)) {
4557 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4559 if (memEQ(posixcc, "punc", 4)) {
4562 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4568 if (memEQ(posixcc, "xdigit", 6)) {
4570 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4575 if (namedclass == OOB_NAMEDCLASS)
4577 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4580 assert (posixcc[skip] == ':');
4581 assert (posixcc[skip+1] == ']');
4582 } else if (!SIZE_ONLY) {
4583 /* [[=foo=]] and [[.foo.]] are still future. */
4585 /* adjust RExC_parse so the warning shows after
4587 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4589 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4592 /* Maternal grandfather:
4593 * "[:" ending in ":" but not in ":]" */
4603 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4606 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4607 const char *s = RExC_parse;
4608 const char c = *s++;
4610 while(*s && isALNUM(*s))
4612 if (*s && c == *s && s[1] == ']') {
4613 if (ckWARN(WARN_REGEXP))
4615 "POSIX syntax [%c %c] belongs inside character classes",
4618 /* [[=foo=]] and [[.foo.]] are still future. */
4619 if (POSIXCC_NOTYET(c)) {
4620 /* adjust RExC_parse so the error shows after
4622 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4624 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4631 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4635 register UV nextvalue;
4636 register IV prevvalue = OOB_UNICODE;
4637 register IV range = 0;
4638 register regnode *ret;
4641 char *rangebegin = NULL;
4642 bool need_class = 0;
4646 bool optimize_invert = TRUE;
4647 AV* unicode_alternate = NULL;
4649 UV literal_endpoint = 0;
4652 ret = reganode(pRExC_state, ANYOF, 0);
4655 ANYOF_FLAGS(ret) = 0;
4657 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4661 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4665 RExC_size += ANYOF_SKIP;
4667 RExC_emit += ANYOF_SKIP;
4669 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4671 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4672 ANYOF_BITMAP_ZERO(ret);
4673 listsv = newSVpvs("# comment\n");
4676 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4678 if (!SIZE_ONLY && POSIXCC(nextvalue))
4679 checkposixcc(pRExC_state);
4681 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4682 if (UCHARAT(RExC_parse) == ']')
4685 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4689 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4692 rangebegin = RExC_parse;
4694 value = utf8n_to_uvchr((U8*)RExC_parse,
4695 RExC_end - RExC_parse,
4697 RExC_parse += numlen;
4700 value = UCHARAT(RExC_parse++);
4701 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4702 if (value == '[' && POSIXCC(nextvalue))
4703 namedclass = regpposixcc(pRExC_state, value);
4704 else if (value == '\\') {
4706 value = utf8n_to_uvchr((U8*)RExC_parse,
4707 RExC_end - RExC_parse,
4709 RExC_parse += numlen;
4712 value = UCHARAT(RExC_parse++);
4713 /* Some compilers cannot handle switching on 64-bit integer
4714 * values, therefore value cannot be an UV. Yes, this will
4715 * be a problem later if we want switch on Unicode.
4716 * A similar issue a little bit later when switching on
4717 * namedclass. --jhi */
4718 switch ((I32)value) {
4719 case 'w': namedclass = ANYOF_ALNUM; break;
4720 case 'W': namedclass = ANYOF_NALNUM; break;
4721 case 's': namedclass = ANYOF_SPACE; break;
4722 case 'S': namedclass = ANYOF_NSPACE; break;
4723 case 'd': namedclass = ANYOF_DIGIT; break;
4724 case 'D': namedclass = ANYOF_NDIGIT; break;
4727 if (RExC_parse >= RExC_end)
4728 vFAIL2("Empty \\%c{}", (U8)value);
4729 if (*RExC_parse == '{') {
4730 const U8 c = (U8)value;
4731 e = strchr(RExC_parse++, '}');
4733 vFAIL2("Missing right brace on \\%c{}", c);
4734 while (isSPACE(UCHARAT(RExC_parse)))
4736 if (e == RExC_parse)
4737 vFAIL2("Empty \\%c{}", c);
4739 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4747 if (UCHARAT(RExC_parse) == '^') {
4750 value = value == 'p' ? 'P' : 'p'; /* toggle */
4751 while (isSPACE(UCHARAT(RExC_parse))) {
4757 Perl_sv_catpvf(aTHX_ listsv,
4758 "+utf8::%.*s\n", (int)n, RExC_parse);
4760 Perl_sv_catpvf(aTHX_ listsv,
4761 "!utf8::%.*s\n", (int)n, RExC_parse);
4764 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4765 namedclass = ANYOF_MAX; /* no official name, but it's named */
4767 case 'n': value = '\n'; break;
4768 case 'r': value = '\r'; break;
4769 case 't': value = '\t'; break;
4770 case 'f': value = '\f'; break;
4771 case 'b': value = '\b'; break;
4772 case 'e': value = ASCII_TO_NATIVE('\033');break;
4773 case 'a': value = ASCII_TO_NATIVE('\007');break;
4775 if (*RExC_parse == '{') {
4776 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4777 | PERL_SCAN_DISALLOW_PREFIX;
4778 e = strchr(RExC_parse++, '}');
4780 vFAIL("Missing right brace on \\x{}");
4782 numlen = e - RExC_parse;
4783 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4787 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4789 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4790 RExC_parse += numlen;
4794 value = UCHARAT(RExC_parse++);
4795 value = toCTRL(value);
4797 case '0': case '1': case '2': case '3': case '4':
4798 case '5': case '6': case '7': case '8': case '9':
4802 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4803 RExC_parse += numlen;
4807 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4809 "Unrecognized escape \\%c in character class passed through",
4813 } /* end of \blah */
4819 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4821 if (!SIZE_ONLY && !need_class)
4822 ANYOF_CLASS_ZERO(ret);
4826 /* a bad range like a-\d, a-[:digit:] ? */
4829 if (ckWARN(WARN_REGEXP)) {
4831 RExC_parse >= rangebegin ?
4832 RExC_parse - rangebegin : 0;
4834 "False [] range \"%*.*s\"",
4839 if (prevvalue < 256) {
4840 ANYOF_BITMAP_SET(ret, prevvalue);
4841 ANYOF_BITMAP_SET(ret, '-');
4844 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4845 Perl_sv_catpvf(aTHX_ listsv,
4846 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4850 range = 0; /* this was not a true range */
4854 const char *what = NULL;
4857 if (namedclass > OOB_NAMEDCLASS)
4858 optimize_invert = FALSE;
4859 /* Possible truncation here but in some 64-bit environments
4860 * the compiler gets heartburn about switch on 64-bit values.
4861 * A similar issue a little earlier when switching on value.
4863 switch ((I32)namedclass) {
4866 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4868 for (value = 0; value < 256; value++)
4870 ANYOF_BITMAP_SET(ret, value);
4877 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4879 for (value = 0; value < 256; value++)
4880 if (!isALNUM(value))
4881 ANYOF_BITMAP_SET(ret, value);
4888 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4890 for (value = 0; value < 256; value++)
4891 if (isALNUMC(value))
4892 ANYOF_BITMAP_SET(ret, value);
4899 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4901 for (value = 0; value < 256; value++)
4902 if (!isALNUMC(value))
4903 ANYOF_BITMAP_SET(ret, value);
4910 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4912 for (value = 0; value < 256; value++)
4914 ANYOF_BITMAP_SET(ret, value);
4921 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4923 for (value = 0; value < 256; value++)
4924 if (!isALPHA(value))
4925 ANYOF_BITMAP_SET(ret, value);
4932 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4935 for (value = 0; value < 128; value++)
4936 ANYOF_BITMAP_SET(ret, value);
4938 for (value = 0; value < 256; value++) {
4940 ANYOF_BITMAP_SET(ret, value);
4949 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4952 for (value = 128; value < 256; value++)
4953 ANYOF_BITMAP_SET(ret, value);
4955 for (value = 0; value < 256; value++) {
4956 if (!isASCII(value))
4957 ANYOF_BITMAP_SET(ret, value);
4966 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4968 for (value = 0; value < 256; value++)
4970 ANYOF_BITMAP_SET(ret, value);
4977 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4979 for (value = 0; value < 256; value++)
4980 if (!isBLANK(value))
4981 ANYOF_BITMAP_SET(ret, value);
4988 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
4990 for (value = 0; value < 256; value++)
4992 ANYOF_BITMAP_SET(ret, value);
4999 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5001 for (value = 0; value < 256; value++)
5002 if (!isCNTRL(value))
5003 ANYOF_BITMAP_SET(ret, value);
5010 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5012 /* consecutive digits assumed */
5013 for (value = '0'; value <= '9'; value++)
5014 ANYOF_BITMAP_SET(ret, value);
5021 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5023 /* consecutive digits assumed */
5024 for (value = 0; value < '0'; value++)
5025 ANYOF_BITMAP_SET(ret, value);
5026 for (value = '9' + 1; value < 256; value++)
5027 ANYOF_BITMAP_SET(ret, value);
5034 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5036 for (value = 0; value < 256; value++)
5038 ANYOF_BITMAP_SET(ret, value);
5045 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5047 for (value = 0; value < 256; value++)
5048 if (!isGRAPH(value))
5049 ANYOF_BITMAP_SET(ret, value);
5056 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5058 for (value = 0; value < 256; value++)
5060 ANYOF_BITMAP_SET(ret, value);
5067 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5069 for (value = 0; value < 256; value++)
5070 if (!isLOWER(value))
5071 ANYOF_BITMAP_SET(ret, value);
5078 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5080 for (value = 0; value < 256; value++)
5082 ANYOF_BITMAP_SET(ret, value);
5089 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5091 for (value = 0; value < 256; value++)
5092 if (!isPRINT(value))
5093 ANYOF_BITMAP_SET(ret, value);
5100 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5102 for (value = 0; value < 256; value++)
5103 if (isPSXSPC(value))
5104 ANYOF_BITMAP_SET(ret, value);
5111 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5113 for (value = 0; value < 256; value++)
5114 if (!isPSXSPC(value))
5115 ANYOF_BITMAP_SET(ret, value);
5122 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5124 for (value = 0; value < 256; value++)
5126 ANYOF_BITMAP_SET(ret, value);
5133 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5135 for (value = 0; value < 256; value++)
5136 if (!isPUNCT(value))
5137 ANYOF_BITMAP_SET(ret, value);
5144 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5146 for (value = 0; value < 256; value++)
5148 ANYOF_BITMAP_SET(ret, value);
5155 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5157 for (value = 0; value < 256; value++)
5158 if (!isSPACE(value))
5159 ANYOF_BITMAP_SET(ret, value);
5166 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5168 for (value = 0; value < 256; value++)
5170 ANYOF_BITMAP_SET(ret, value);
5177 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5179 for (value = 0; value < 256; value++)
5180 if (!isUPPER(value))
5181 ANYOF_BITMAP_SET(ret, value);
5188 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5190 for (value = 0; value < 256; value++)
5191 if (isXDIGIT(value))
5192 ANYOF_BITMAP_SET(ret, value);
5199 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5201 for (value = 0; value < 256; value++)
5202 if (!isXDIGIT(value))
5203 ANYOF_BITMAP_SET(ret, value);
5209 /* this is to handle \p and \P */
5212 vFAIL("Invalid [::] class");
5216 /* Strings such as "+utf8::isWord\n" */
5217 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5220 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5223 } /* end of namedclass \blah */
5226 if (prevvalue > (IV)value) /* b-a */ {
5227 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5228 RExC_parse - rangebegin,
5229 RExC_parse - rangebegin,
5231 range = 0; /* not a valid range */
5235 prevvalue = value; /* save the beginning of the range */
5236 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5237 RExC_parse[1] != ']') {
5240 /* a bad range like \w-, [:word:]- ? */
5241 if (namedclass > OOB_NAMEDCLASS) {
5242 if (ckWARN(WARN_REGEXP)) {
5244 RExC_parse >= rangebegin ?
5245 RExC_parse - rangebegin : 0;
5247 "False [] range \"%*.*s\"",
5253 ANYOF_BITMAP_SET(ret, '-');
5255 range = 1; /* yeah, it's a range! */
5256 continue; /* but do it the next time */
5260 /* now is the next time */
5264 if (prevvalue < 256) {
5265 const IV ceilvalue = value < 256 ? value : 255;
5268 /* In EBCDIC [\x89-\x91] should include
5269 * the \x8e but [i-j] should not. */
5270 if (literal_endpoint == 2 &&
5271 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5272 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5274 if (isLOWER(prevvalue)) {
5275 for (i = prevvalue; i <= ceilvalue; i++)
5277 ANYOF_BITMAP_SET(ret, i);
5279 for (i = prevvalue; i <= ceilvalue; i++)
5281 ANYOF_BITMAP_SET(ret, i);
5286 for (i = prevvalue; i <= ceilvalue; i++)
5287 ANYOF_BITMAP_SET(ret, i);
5289 if (value > 255 || UTF) {
5290 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5291 const UV natvalue = NATIVE_TO_UNI(value);
5293 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5294 if (prevnatvalue < natvalue) { /* what about > ? */
5295 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5296 prevnatvalue, natvalue);
5298 else if (prevnatvalue == natvalue) {
5299 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5301 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5303 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5305 /* If folding and foldable and a single
5306 * character, insert also the folded version
5307 * to the charclass. */
5309 if (foldlen == (STRLEN)UNISKIP(f))
5310 Perl_sv_catpvf(aTHX_ listsv,
5313 /* Any multicharacter foldings
5314 * require the following transform:
5315 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5316 * where E folds into "pq" and F folds
5317 * into "rst", all other characters
5318 * fold to single characters. We save
5319 * away these multicharacter foldings,
5320 * to be later saved as part of the
5321 * additional "s" data. */
5324 if (!unicode_alternate)
5325 unicode_alternate = newAV();
5326 sv = newSVpvn((char*)foldbuf, foldlen);
5328 av_push(unicode_alternate, sv);
5332 /* If folding and the value is one of the Greek
5333 * sigmas insert a few more sigmas to make the
5334 * folding rules of the sigmas to work right.
5335 * Note that not all the possible combinations
5336 * are handled here: some of them are handled
5337 * by the standard folding rules, and some of
5338 * them (literal or EXACTF cases) are handled
5339 * during runtime in regexec.c:S_find_byclass(). */
5340 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5341 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5342 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5343 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5344 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5346 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5347 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5348 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5353 literal_endpoint = 0;
5357 range = 0; /* this range (if it was one) is done now */
5361 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5363 RExC_size += ANYOF_CLASS_ADD_SKIP;
5365 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5368 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5370 /* If the only flag is folding (plus possibly inversion). */
5371 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5373 for (value = 0; value < 256; ++value) {
5374 if (ANYOF_BITMAP_TEST(ret, value)) {
5375 UV fold = PL_fold[value];
5378 ANYOF_BITMAP_SET(ret, fold);
5381 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5384 /* optimize inverted simple patterns (e.g. [^a-z]) */
5385 if (!SIZE_ONLY && optimize_invert &&
5386 /* If the only flag is inversion. */
5387 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5388 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5389 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5390 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5397 /* The 0th element stores the character class description
5398 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5399 * to initialize the appropriate swash (which gets stored in
5400 * the 1st element), and also useful for dumping the regnode.
5401 * The 2nd element stores the multicharacter foldings,
5402 * used later (regexec.c:S_reginclass()). */
5403 av_store(av, 0, listsv);
5404 av_store(av, 1, NULL);
5405 av_store(av, 2, (SV*)unicode_alternate);
5406 rv = newRV_noinc((SV*)av);
5407 n = add_data(pRExC_state, 1, "s");
5408 RExC_rx->data->data[n] = (void*)rv;
5416 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5418 char* retval = RExC_parse++;
5421 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5422 RExC_parse[2] == '#') {
5423 while (*RExC_parse != ')') {
5424 if (RExC_parse == RExC_end)
5425 FAIL("Sequence (?#... not terminated");
5431 if (RExC_flags & PMf_EXTENDED) {
5432 if (isSPACE(*RExC_parse)) {
5436 else if (*RExC_parse == '#') {
5437 while (RExC_parse < RExC_end)
5438 if (*RExC_parse++ == '\n') break;
5447 - reg_node - emit a node
5449 STATIC regnode * /* Location. */
5450 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5453 register regnode *ptr;
5454 regnode * const ret = RExC_emit;
5457 SIZE_ALIGN(RExC_size);
5462 NODE_ALIGN_FILL(ret);
5464 FILL_ADVANCE_NODE(ptr, op);
5465 if (RExC_offsets) { /* MJD */
5466 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5467 "reg_node", __LINE__,
5469 RExC_emit - RExC_emit_start > RExC_offsets[0]
5470 ? "Overwriting end of array!\n" : "OK",
5471 RExC_emit - RExC_emit_start,
5472 RExC_parse - RExC_start,
5474 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5483 - reganode - emit a node with an argument
5485 STATIC regnode * /* Location. */
5486 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5489 register regnode *ptr;
5490 regnode * const ret = RExC_emit;
5493 SIZE_ALIGN(RExC_size);
5498 NODE_ALIGN_FILL(ret);
5500 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5501 if (RExC_offsets) { /* MJD */
5502 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5506 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5507 "Overwriting end of array!\n" : "OK",
5508 RExC_emit - RExC_emit_start,
5509 RExC_parse - RExC_start,
5511 Set_Cur_Node_Offset;
5520 - reguni - emit (if appropriate) a Unicode character
5523 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5526 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5530 - reginsert - insert an operator in front of already-emitted operand
5532 * Means relocating the operand.
5535 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5538 register regnode *src;
5539 register regnode *dst;
5540 register regnode *place;
5541 const int offset = regarglen[(U8)op];
5543 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5546 RExC_size += NODE_STEP_REGNODE + offset;
5551 RExC_emit += NODE_STEP_REGNODE + offset;
5553 while (src > opnd) {
5554 StructCopy(--src, --dst, regnode);
5555 if (RExC_offsets) { /* MJD 20010112 */
5556 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5560 dst - RExC_emit_start > RExC_offsets[0]
5561 ? "Overwriting end of array!\n" : "OK",
5562 src - RExC_emit_start,
5563 dst - RExC_emit_start,
5565 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5566 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5571 place = opnd; /* Op node, where operand used to be. */
5572 if (RExC_offsets) { /* MJD */
5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5577 place - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 place - RExC_emit_start,
5580 RExC_parse - RExC_start,
5582 Set_Node_Offset(place, RExC_parse);
5583 Set_Node_Length(place, 1);
5585 src = NEXTOPER(place);
5586 FILL_ADVANCE_NODE(place, op);
5587 Zero(src, offset, regnode);
5591 - regtail - set the next-pointer at the end of a node chain of p to val.
5594 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5597 register regnode *scan;
5602 /* Find last node. */
5605 regnode * const temp = regnext(scan);
5611 if (reg_off_by_arg[OP(scan)]) {
5612 ARG_SET(scan, val - scan);
5615 NEXT_OFF(scan) = val - scan;
5620 - regoptail - regtail on operand of first argument; nop if operandless
5623 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5626 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5627 if (p == NULL || SIZE_ONLY)
5629 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5630 regtail(pRExC_state, NEXTOPER(p), val);
5632 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5633 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5640 - regcurly - a little FSA that accepts {\d+,?\d*}
5643 S_regcurly(register const char *s)
5662 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5665 Perl_regdump(pTHX_ regexp *r)
5669 SV * const sv = sv_newmortal();
5671 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5673 /* Header fields of interest. */
5674 if (r->anchored_substr)
5675 PerlIO_printf(Perl_debug_log,
5676 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5678 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5679 SvPVX_const(r->anchored_substr),
5681 SvTAIL(r->anchored_substr) ? "$" : "",
5682 (IV)r->anchored_offset);
5683 else if (r->anchored_utf8)
5684 PerlIO_printf(Perl_debug_log,
5685 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5687 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5688 SvPVX_const(r->anchored_utf8),
5690 SvTAIL(r->anchored_utf8) ? "$" : "",
5691 (IV)r->anchored_offset);
5692 if (r->float_substr)
5693 PerlIO_printf(Perl_debug_log,
5694 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5696 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5697 SvPVX_const(r->float_substr),
5699 SvTAIL(r->float_substr) ? "$" : "",
5700 (IV)r->float_min_offset, (UV)r->float_max_offset);
5701 else if (r->float_utf8)
5702 PerlIO_printf(Perl_debug_log,
5703 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5705 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5706 SvPVX_const(r->float_utf8),
5708 SvTAIL(r->float_utf8) ? "$" : "",
5709 (IV)r->float_min_offset, (UV)r->float_max_offset);
5710 if (r->check_substr || r->check_utf8)
5711 PerlIO_printf(Perl_debug_log,
5712 r->check_substr == r->float_substr
5713 && r->check_utf8 == r->float_utf8
5714 ? "(checking floating" : "(checking anchored");
5715 if (r->reganch & ROPT_NOSCAN)
5716 PerlIO_printf(Perl_debug_log, " noscan");
5717 if (r->reganch & ROPT_CHECK_ALL)
5718 PerlIO_printf(Perl_debug_log, " isall");
5719 if (r->check_substr || r->check_utf8)
5720 PerlIO_printf(Perl_debug_log, ") ");
5722 if (r->regstclass) {
5723 regprop(sv, r->regstclass);
5724 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5726 if (r->reganch & ROPT_ANCH) {
5727 PerlIO_printf(Perl_debug_log, "anchored");
5728 if (r->reganch & ROPT_ANCH_BOL)
5729 PerlIO_printf(Perl_debug_log, "(BOL)");
5730 if (r->reganch & ROPT_ANCH_MBOL)
5731 PerlIO_printf(Perl_debug_log, "(MBOL)");
5732 if (r->reganch & ROPT_ANCH_SBOL)
5733 PerlIO_printf(Perl_debug_log, "(SBOL)");
5734 if (r->reganch & ROPT_ANCH_GPOS)
5735 PerlIO_printf(Perl_debug_log, "(GPOS)");
5736 PerlIO_putc(Perl_debug_log, ' ');
5738 if (r->reganch & ROPT_GPOS_SEEN)
5739 PerlIO_printf(Perl_debug_log, "GPOS ");
5740 if (r->reganch & ROPT_SKIP)
5741 PerlIO_printf(Perl_debug_log, "plus ");
5742 if (r->reganch & ROPT_IMPLICIT)
5743 PerlIO_printf(Perl_debug_log, "implicit ");
5744 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5745 if (r->reganch & ROPT_EVAL_SEEN)
5746 PerlIO_printf(Perl_debug_log, "with eval ");
5747 PerlIO_printf(Perl_debug_log, "\n");
5749 const U32 len = r->offsets[0];
5750 GET_RE_DEBUG_FLAGS_DECL;
5753 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5754 for (i = 1; i <= len; i++)
5755 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5756 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5757 PerlIO_printf(Perl_debug_log, "\n");
5762 #endif /* DEBUGGING */
5766 - regprop - printable representation of opcode
5769 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5775 sv_setpvn(sv, "", 0);
5776 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5777 /* It would be nice to FAIL() here, but this may be called from
5778 regexec.c, and it would be hard to supply pRExC_state. */
5779 Perl_croak(aTHX_ "Corrupted regexp opcode");
5780 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5782 k = PL_regkind[(U8)OP(o)];
5785 SV * const dsv = sv_2mortal(newSVpvs(""));
5786 /* Using is_utf8_string() is a crude hack but it may
5787 * be the best for now since we have no flag "this EXACTish
5788 * node was UTF-8" --jhi */
5789 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5790 const char * const s = do_utf8 ?
5791 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5792 UNI_DISPLAY_REGEX) :
5794 const int len = do_utf8 ?
5797 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5801 } else if (k == TRIE) {
5804 this isn't always safe, as Pl_regdata may not be for this regex yet
5805 (depending on where its called from) so its being moved to dumpuntil
5807 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5808 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5811 trie->uniquecharcount,
5814 } else if (k == CURLY) {
5815 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5816 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5817 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5819 else if (k == WHILEM && o->flags) /* Ordinal/of */
5820 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5821 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5822 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5823 else if (k == LOGICAL)
5824 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5825 else if (k == ANYOF) {
5826 int i, rangestart = -1;
5827 const U8 flags = ANYOF_FLAGS(o);
5829 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5830 static const char * const anyofs[] = {
5863 if (flags & ANYOF_LOCALE)
5864 sv_catpvs(sv, "{loc}");
5865 if (flags & ANYOF_FOLD)
5866 sv_catpvs(sv, "{i}");
5867 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5868 if (flags & ANYOF_INVERT)
5870 for (i = 0; i <= 256; i++) {
5871 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5872 if (rangestart == -1)
5874 } else if (rangestart != -1) {
5875 if (i <= rangestart + 3)
5876 for (; rangestart < i; rangestart++)
5877 put_byte(sv, rangestart);
5879 put_byte(sv, rangestart);
5881 put_byte(sv, i - 1);
5887 if (o->flags & ANYOF_CLASS)
5888 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5889 if (ANYOF_CLASS_TEST(o,i))
5890 sv_catpv(sv, anyofs[i]);
5892 if (flags & ANYOF_UNICODE)
5893 sv_catpvs(sv, "{unicode}");
5894 else if (flags & ANYOF_UNICODE_ALL)
5895 sv_catpvs(sv, "{unicode_all}");
5899 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
5903 U8 s[UTF8_MAXBYTES_CASE+1];
5905 for (i = 0; i <= 256; i++) { /* just the first 256 */
5906 uvchr_to_utf8(s, i);
5908 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5909 if (rangestart == -1)
5911 } else if (rangestart != -1) {
5912 if (i <= rangestart + 3)
5913 for (; rangestart < i; rangestart++) {
5914 const U8 * const e = uvchr_to_utf8(s,rangestart);
5916 for(p = s; p < e; p++)
5920 const U8 *e = uvchr_to_utf8(s,rangestart);
5922 for (p = s; p < e; p++)
5925 e = uvchr_to_utf8(s, i-1);
5926 for (p = s; p < e; p++)
5933 sv_catpvs(sv, "..."); /* et cetera */
5937 char *s = savesvpv(lv);
5938 char * const origs = s;
5940 while(*s && *s != '\n') s++;
5943 const char * const t = ++s;
5961 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5963 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5964 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5966 PERL_UNUSED_ARG(sv);
5968 #endif /* DEBUGGING */
5972 Perl_re_intuit_string(pTHX_ regexp *prog)
5973 { /* Assume that RE_INTUIT is set */
5975 GET_RE_DEBUG_FLAGS_DECL;
5978 const char * const s = SvPV_nolen_const(prog->check_substr
5979 ? prog->check_substr : prog->check_utf8);
5981 if (!PL_colorset) reginitcolors();
5982 PerlIO_printf(Perl_debug_log,
5983 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5985 prog->check_substr ? "" : "utf8 ",
5986 PL_colors[5],PL_colors[0],
5989 (strlen(s) > 60 ? "..." : ""));
5992 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5996 Perl_pregfree(pTHX_ struct regexp *r)
6000 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6001 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6005 if (!r || (--r->refcnt > 0))
6007 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6008 const char * const s = (r->reganch & ROPT_UTF8)
6009 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6010 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6011 const int len = SvCUR(dsv);
6014 PerlIO_printf(Perl_debug_log,
6015 "%sFreeing REx:%s %s%*.*s%s%s\n",
6016 PL_colors[4],PL_colors[5],PL_colors[0],
6019 len > 60 ? "..." : "");
6022 /* gcov results gave these as non-null 100% of the time, so there's no
6023 optimisation in checking them before calling Safefree */
6024 Safefree(r->precomp);
6025 Safefree(r->offsets); /* 20010421 MJD */
6026 RX_MATCH_COPY_FREE(r);
6027 #ifdef PERL_OLD_COPY_ON_WRITE
6029 SvREFCNT_dec(r->saved_copy);
6032 if (r->anchored_substr)
6033 SvREFCNT_dec(r->anchored_substr);
6034 if (r->anchored_utf8)
6035 SvREFCNT_dec(r->anchored_utf8);
6036 if (r->float_substr)
6037 SvREFCNT_dec(r->float_substr);
6039 SvREFCNT_dec(r->float_utf8);
6040 Safefree(r->substrs);
6043 int n = r->data->count;
6044 PAD* new_comppad = NULL;
6049 /* If you add a ->what type here, update the comment in regcomp.h */
6050 switch (r->data->what[n]) {
6052 SvREFCNT_dec((SV*)r->data->data[n]);
6055 Safefree(r->data->data[n]);
6058 new_comppad = (AV*)r->data->data[n];
6061 if (new_comppad == NULL)
6062 Perl_croak(aTHX_ "panic: pregfree comppad");
6063 PAD_SAVE_LOCAL(old_comppad,
6064 /* Watch out for global destruction's random ordering. */
6065 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6068 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6071 op_free((OP_4tree*)r->data->data[n]);
6073 PAD_RESTORE_LOCAL(old_comppad);
6074 SvREFCNT_dec((SV*)new_comppad);
6081 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6084 refcount = --trie->refcount;
6087 Safefree(trie->charmap);
6088 if (trie->widecharmap)
6089 SvREFCNT_dec((SV*)trie->widecharmap);
6090 Safefree(trie->states);
6091 Safefree(trie->trans);
6094 SvREFCNT_dec((SV*)trie->words);
6095 if (trie->revcharmap)
6096 SvREFCNT_dec((SV*)trie->revcharmap);
6098 Safefree(r->data->data[n]); /* do this last!!!! */
6103 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6106 Safefree(r->data->what);
6109 Safefree(r->startp);
6115 - regnext - dig the "next" pointer out of a node
6118 Perl_regnext(pTHX_ register regnode *p)
6121 register I32 offset;
6123 if (p == &PL_regdummy)
6126 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6134 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6137 STRLEN l1 = strlen(pat1);
6138 STRLEN l2 = strlen(pat2);
6141 const char *message;
6147 Copy(pat1, buf, l1 , char);
6148 Copy(pat2, buf + l1, l2 , char);
6149 buf[l1 + l2] = '\n';
6150 buf[l1 + l2 + 1] = '\0';
6152 /* ANSI variant takes additional second argument */
6153 va_start(args, pat2);
6157 msv = vmess(buf, &args);
6159 message = SvPV_const(msv,l1);
6162 Copy(message, buf, l1 , char);
6163 buf[l1-1] = '\0'; /* Overwrite \n */
6164 Perl_croak(aTHX_ "%s", buf);
6167 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6170 Perl_save_re_context(pTHX)
6173 SAVEI32(PL_reg_flags); /* from regexec.c */
6175 SAVEPPTR(PL_reginput); /* String-input pointer. */
6176 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6177 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6178 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6179 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6180 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6181 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6182 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6183 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6184 PL_reg_start_tmp = 0;
6185 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6186 PL_reg_start_tmpl = 0;
6187 SAVEVPTR(PL_regdata);
6188 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6189 SAVEI32(PL_regnarrate); /* from regexec.c */
6190 SAVEVPTR(PL_regprogram); /* from regexec.c */
6191 SAVEINT(PL_regindent); /* from regexec.c */
6192 SAVEVPTR(PL_regcc); /* from regexec.c */
6193 SAVEVPTR(PL_curcop);
6194 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6195 SAVEVPTR(PL_reg_re); /* from regexec.c */
6196 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6197 SAVESPTR(PL_reg_sv); /* from regexec.c */
6198 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6199 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6200 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6201 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6202 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6203 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6204 PL_reg_oldsaved = NULL;
6205 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6206 PL_reg_oldsavedlen = 0;
6207 #ifdef PERL_OLD_COPY_ON_WRITE
6211 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6213 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6214 PL_reg_leftiter = 0;
6215 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6216 PL_reg_poscache = NULL;
6217 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6218 PL_reg_poscache_size = 0;
6219 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6220 SAVEI32(PL_regnpar); /* () count. */
6221 SAVEI32(PL_regsize); /* from regexec.c */
6223 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6225 const REGEXP * const rx = PM_GETRE(PL_curpm);
6228 for (i = 1; i <= rx->nparens; i++) {
6230 char digits[TYPE_CHARS(long)];
6231 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6232 GV *const *const gvp
6233 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6235 if (gvp && SvTYPE(gv = *gvp) == SVt_PVGV && GvSV(gv)) {
6243 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6248 clear_re(pTHX_ void *r)
6251 ReREFCNT_dec((regexp *)r);
6257 S_put_byte(pTHX_ SV *sv, int c)
6259 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6260 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6261 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6262 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6264 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6269 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6272 register U8 op = EXACT; /* Arbitrary non-END op. */
6273 register regnode *next;
6275 while (op != END && (!last || node < last)) {
6276 /* While that wasn't END last time... */
6282 next = regnext(node);
6284 if (OP(node) == OPTIMIZED)
6287 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6288 (int)(2*l + 1), "", SvPVX_const(sv));
6289 if (next == NULL) /* Next ptr. */
6290 PerlIO_printf(Perl_debug_log, "(0)");
6292 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6293 (void)PerlIO_putc(Perl_debug_log, '\n');
6295 if (PL_regkind[(U8)op] == BRANCHJ) {
6296 register regnode *nnode = (OP(next) == LONGJMP
6299 if (last && nnode > last)
6301 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6303 else if (PL_regkind[(U8)op] == BRANCH) {
6304 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6306 else if ( PL_regkind[(U8)op] == TRIE ) {
6307 const I32 n = ARG(node);
6308 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6309 const I32 arry_len = av_len(trie->words)+1;
6311 PerlIO_printf(Perl_debug_log,
6312 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6316 (int)trie->charcount,
6317 trie->uniquecharcount,
6318 (IV)trie->laststate-1,
6319 node->flags ? " EVAL mode" : "");
6321 for (word_idx=0; word_idx < arry_len; word_idx++) {
6322 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6324 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6327 SvPV_nolen_const(*elem_ptr),
6332 PerlIO_printf(Perl_debug_log, "(0)\n");
6334 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6340 node = NEXTOPER(node);
6341 node += regarglen[(U8)op];
6344 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6345 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6346 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6348 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6349 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6352 else if ( op == PLUS || op == STAR) {
6353 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6355 else if (op == ANYOF) {
6356 /* arglen 1 + class block */
6357 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6358 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6359 node = NEXTOPER(node);
6361 else if (PL_regkind[(U8)op] == EXACT) {
6362 /* Literal string, where present. */
6363 node += NODE_SZ_STR(node) - 1;
6364 node = NEXTOPER(node);
6367 node = NEXTOPER(node);
6368 node += regarglen[(U8)op];
6370 if (op == CURLYX || op == OPEN)
6372 else if (op == WHILEM)
6378 #endif /* DEBUGGING */
6382 * c-indentation-style: bsd
6384 * indent-tabs-mode: t
6387 * ex: set ts=8 sts=4 sw=4 noet: