5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
287 #define Simple_vFAIL(m) STMT_START { \
288 const IV offset = RExC_parse - RExC_precomp; \
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
296 #define vFAIL(m) STMT_START { \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
303 * Like Simple_vFAIL(), but accepts two arguments.
305 #define Simple_vFAIL2(m,a1) STMT_START { \
306 const IV offset = RExC_parse - RExC_precomp; \
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
314 #define vFAIL2(m,a1) STMT_START { \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
322 * Like Simple_vFAIL(), but accepts three arguments.
324 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
325 const IV offset = RExC_parse - RExC_precomp; \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
333 #define vFAIL3(m,a1,a2) STMT_START { \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
340 * Like Simple_vFAIL(), but accepts four arguments.
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
343 const IV offset = RExC_parse - RExC_precomp; \
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 #define vWARN(loc,m) STMT_START { \
349 const IV offset = loc - RExC_precomp; \
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
354 #define vWARNdep(loc,m) STMT_START { \
355 const IV offset = loc - RExC_precomp; \
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
362 #define vWARN2(loc, m, a1) STMT_START { \
363 const IV offset = loc - RExC_precomp; \
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
368 #define vWARN3(loc, m, a1, a2) STMT_START { \
369 const IV offset = loc - RExC_precomp; \
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
375 const IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
381 const IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
392 /* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
402 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
409 RExC_offsets[2*(node)-1] = (byte); \
414 #define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
418 #define Set_Node_Length_To_R(node,len) STMT_START { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
421 __LINE__, (int)(node), (int)(len))); \
423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
425 RExC_offsets[2*(node)] = (len); \
430 #define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
440 static void clear_re(pTHX_ void *r);
442 /* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
447 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453 SvSetMagicSV(*data->longest, data->last_found);
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
460 data->flags &= ~SF_FIX_BEFORE_EOL;
463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
464 data->offset_float_max = (l
465 ? data->last_start_max
466 : data->pos_min + data->pos_delta);
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
469 if (data->flags & SF_BEFORE_EOL)
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
473 data->flags &= ~SF_FL_BEFORE_EOL;
476 SvCUR_set(data->last_found, 0);
478 SV * const sv = data->last_found;
480 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
485 data->flags &= ~SF_BEFORE_EOL;
488 /* Can match anything (initialization) */
490 S_cl_anything(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
492 ANYOF_CLASS_ZERO(cl);
493 ANYOF_BITMAP_SETALL(cl);
494 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
496 cl->flags |= ANYOF_LOCALE;
499 /* Can match anything (initialization) */
501 S_cl_is_anything(const struct regnode_charclass_class *cl)
505 for (value = 0; value <= ANYOF_MAX; value += 2)
506 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
508 if (!(cl->flags & ANYOF_UNICODE_ALL))
510 if (!ANYOF_BITMAP_TESTALLSET(cl))
515 /* Can match anything (initialization) */
517 S_cl_init(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
519 Zero(cl, 1, struct regnode_charclass_class);
521 cl_anything(pRExC_state, cl);
525 S_cl_init_zero(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
527 Zero(cl, 1, struct regnode_charclass_class);
529 cl_anything(pRExC_state, cl);
531 cl->flags |= ANYOF_LOCALE;
534 /* 'And' a given class with another one. Can create false positives */
535 /* We assume that cl is not inverted */
537 S_cl_and(struct regnode_charclass_class *cl,
538 const struct regnode_charclass_class *and_with)
540 if (!(and_with->flags & ANYOF_CLASS)
541 && !(cl->flags & ANYOF_CLASS)
542 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
543 && !(and_with->flags & ANYOF_FOLD)
544 && !(cl->flags & ANYOF_FOLD)) {
547 if (and_with->flags & ANYOF_INVERT)
548 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
549 cl->bitmap[i] &= ~and_with->bitmap[i];
551 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
552 cl->bitmap[i] &= and_with->bitmap[i];
553 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
554 if (!(and_with->flags & ANYOF_EOS))
555 cl->flags &= ~ANYOF_EOS;
557 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
558 !(and_with->flags & ANYOF_INVERT)) {
559 cl->flags &= ~ANYOF_UNICODE_ALL;
560 cl->flags |= ANYOF_UNICODE;
561 ARG_SET(cl, ARG(and_with));
563 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
564 !(and_with->flags & ANYOF_INVERT))
565 cl->flags &= ~ANYOF_UNICODE_ALL;
566 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
567 !(and_with->flags & ANYOF_INVERT))
568 cl->flags &= ~ANYOF_UNICODE;
571 /* 'OR' a given class with another one. Can create false positives */
572 /* We assume that cl is not inverted */
574 S_cl_or(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
576 if (or_with->flags & ANYOF_INVERT) {
578 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
579 * <= (B1 | !B2) | (CL1 | !CL2)
580 * which is wasteful if CL2 is small, but we ignore CL2:
581 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
582 * XXXX Can we handle case-fold? Unclear:
583 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
584 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
586 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
587 && !(or_with->flags & ANYOF_FOLD)
588 && !(cl->flags & ANYOF_FOLD) ) {
591 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
592 cl->bitmap[i] |= ~or_with->bitmap[i];
593 } /* XXXX: logic is complicated otherwise */
595 cl_anything(pRExC_state, cl);
598 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
599 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
600 && (!(or_with->flags & ANYOF_FOLD)
601 || (cl->flags & ANYOF_FOLD)) ) {
604 /* OR char bitmap and class bitmap separately */
605 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
606 cl->bitmap[i] |= or_with->bitmap[i];
607 if (or_with->flags & ANYOF_CLASS) {
608 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
609 cl->classflags[i] |= or_with->classflags[i];
610 cl->flags |= ANYOF_CLASS;
613 else { /* XXXX: logic is complicated, leave it along for a moment. */
614 cl_anything(pRExC_state, cl);
617 if (or_with->flags & ANYOF_EOS)
618 cl->flags |= ANYOF_EOS;
620 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
621 ARG(cl) != ARG(or_with)) {
622 cl->flags |= ANYOF_UNICODE_ALL;
623 cl->flags &= ~ANYOF_UNICODE;
625 if (or_with->flags & ANYOF_UNICODE_ALL) {
626 cl->flags |= ANYOF_UNICODE_ALL;
627 cl->flags &= ~ANYOF_UNICODE;
633 make_trie(startbranch,first,last,tail,flags)
634 startbranch: the first branch in the whole branch sequence
635 first : start branch of sequence of branch-exact nodes.
636 May be the same as startbranch
637 last : Thing following the last branch.
638 May be the same as tail.
639 tail : item following the branch sequence
640 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
642 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
644 A trie is an N'ary tree where the branches are determined by digital
645 decomposition of the key. IE, at the root node you look up the 1st character and
646 follow that branch repeat until you find the end of the branches. Nodes can be
647 marked as "accepting" meaning they represent a complete word. Eg:
651 would convert into the following structure. Numbers represent states, letters
652 following numbers represent valid transitions on the letter from that state, if
653 the number is in square brackets it represents an accepting state, otherwise it
654 will be in parenthesis.
656 +-h->+-e->[3]-+-r->(8)-+-s->[9]
660 (1) +-i->(6)-+-s->[7]
662 +-s->(3)-+-h->(4)-+-e->[5]
664 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
666 This shows that when matching against the string 'hers' we will begin at state 1
667 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
668 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
669 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
670 single traverse. We store a mapping from accepting to state to which word was
671 matched, and then when we have multiple possibilities we try to complete the
672 rest of the regex in the order in which they occured in the alternation.
674 The only prior NFA like behaviour that would be changed by the TRIE support is
675 the silent ignoring of duplicate alternations which are of the form:
677 / (DUPE|DUPE) X? (?{ ... }) Y /x
679 Thus EVAL blocks follwing a trie may be called a different number of times with
680 and without the optimisation. With the optimisations dupes will be silently
681 ignored. This inconsistant behaviour of EVAL type nodes is well established as
682 the following demonstrates:
684 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
686 which prints out 'word' three times, but
688 'words'=~/(word|word|word)(?{ print $1 })S/
690 which doesnt print it out at all. This is due to other optimisations kicking in.
692 Example of what happens on a structural level:
694 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
696 1: CURLYM[1] {1,32767}(18)
707 This would be optimizable with startbranch=5, first=5, last=16, tail=16
708 and should turn into:
710 1: CURLYM[1] {1,32767}(18)
712 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
720 Cases where tail != last would be like /(?foo|bar)baz/:
730 which would be optimizable with startbranch=1, first=1, last=7, tail=8
731 and would end up looking like:
734 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
743 #define TRIE_DEBUG_CHAR \
744 DEBUG_TRIE_COMPILE_r({ \
747 tmp = newSVpvs( "" ); \
748 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
750 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
752 av_push( trie->revcharmap, tmp ); \
755 #define TRIE_READ_CHAR STMT_START { \
758 if ( foldlen > 0 ) { \
759 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
764 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
765 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
766 foldlen -= UNISKIP( uvc ); \
767 scan = foldbuf + UNISKIP( uvc ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
779 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
780 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
781 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
782 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
785 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
786 TRIE_LIST_LEN( state ) *= 2; \
787 Renew( trie->states[ state ].trans.list, \
788 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
790 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
792 TRIE_LIST_CUR( state )++; \
795 #define TRIE_LIST_NEW(state) STMT_START { \
796 Newxz( trie->states[ state ].trans.list, \
797 4, reg_trie_trans_le ); \
798 TRIE_LIST_CUR( state ) = 1; \
799 TRIE_LIST_LEN( state ) = 4; \
803 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
806 /* first pass, loop through and scan words */
809 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
814 /* we just use folder as a flag in utf8 */
815 const U8 * const folder = ( flags == EXACTF
823 const U32 data_slot = add_data( pRExC_state, 1, "t" );
826 GET_RE_DEBUG_FLAGS_DECL;
828 Newxz( trie, 1, reg_trie_data );
830 RExC_rx->data->data[ data_slot ] = (void*)trie;
831 Newxz( trie->charmap, 256, U16 );
833 trie->words = newAV();
834 trie->revcharmap = newAV();
838 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
839 if (!SvIOK(re_trie_maxbuff)) {
840 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
843 /* -- First loop and Setup --
845 We first traverse the branches and scan each word to determine if it
846 contains widechars, and how many unique chars there are, this is
847 important as we have to build a table with at least as many columns as we
850 We use an array of integers to represent the character codes 0..255
851 (trie->charmap) and we use a an HV* to store unicode characters. We use the
852 native representation of the character value as the key and IV's for the
855 *TODO* If we keep track of how many times each character is used we can
856 remap the columns so that the table compression later on is more
857 efficient in terms of memory by ensuring most common value is in the
858 middle and the least common are on the outside. IMO this would be better
859 than a most to least common mapping as theres a decent chance the most
860 common letter will share a node with the least common, meaning the node
861 will not be compressable. With a middle is most common approach the worst
862 case is when we have the least common nodes twice.
867 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
868 regnode * const noper = NEXTOPER( cur );
869 const U8 *uc = (U8*)STRING( noper );
870 const U8 * const e = uc + STR_LEN( noper );
872 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
873 const U8 *scan = (U8*)NULL;
875 for ( ; uc < e ; uc += len ) {
879 if ( !trie->charmap[ uvc ] ) {
880 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
882 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
887 if ( !trie->widecharmap )
888 trie->widecharmap = newHV();
890 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
893 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
895 if ( !SvTRUE( *svpp ) ) {
896 sv_setiv( *svpp, ++trie->uniquecharcount );
902 } /* end first pass */
903 DEBUG_TRIE_COMPILE_r(
904 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
905 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
906 (int)trie->charcount, trie->uniquecharcount )
911 We now know what we are dealing with in terms of unique chars and
912 string sizes so we can calculate how much memory a naive
913 representation using a flat table will take. If it's over a reasonable
914 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
915 conservative but potentially much slower representation using an array
918 At the end we convert both representations into the same compressed
919 form that will be used in regexec.c for matching with. The latter
920 is a form that cannot be used to construct with but has memory
921 properties similar to the list form and access properties similar
922 to the table form making it both suitable for fast searches and
923 small enough that its feasable to store for the duration of a program.
925 See the comment in the code where the compressed table is produced
926 inplace from the flat tabe representation for an explanation of how
927 the compression works.
932 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
934 Second Pass -- Array Of Lists Representation
936 Each state will be represented by a list of charid:state records
937 (reg_trie_trans_le) the first such element holds the CUR and LEN
938 points of the allocated array. (See defines above).
940 We build the initial structure using the lists, and then convert
941 it into the compressed table form which allows faster lookups
942 (but cant be modified once converted).
948 STRLEN transcount = 1;
950 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
954 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
956 regnode * const noper = NEXTOPER( cur );
957 U8 *uc = (U8*)STRING( noper );
958 const U8 * const e = uc + STR_LEN( noper );
959 U32 state = 1; /* required init */
960 U16 charid = 0; /* sanity init */
961 U8 *scan = (U8*)NULL; /* sanity init */
962 STRLEN foldlen = 0; /* required init */
963 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
965 for ( ; uc < e ; uc += len ) {
970 charid = trie->charmap[ uvc ];
972 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
976 charid=(U16)SvIV( *svpp );
985 if ( !trie->states[ state ].trans.list ) {
986 TRIE_LIST_NEW( state );
988 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
989 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
990 newstate = TRIE_LIST_ITEM( state, check ).newstate;
995 newstate = next_alloc++;
996 TRIE_LIST_PUSH( state, charid, newstate );
1001 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1003 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1006 if ( !trie->states[ state ].wordnum ) {
1007 /* we havent inserted this word into the structure yet. */
1008 trie->states[ state ].wordnum = ++curword;
1011 /* store the word for dumping */
1012 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1013 if ( UTF ) SvUTF8_on( tmp );
1014 av_push( trie->words, tmp );
1018 /*EMPTY*/; /* It's a dupe. So ignore it. */
1021 } /* end second pass */
1023 trie->laststate = next_alloc;
1024 Renew( trie->states, next_alloc, reg_trie_state );
1026 DEBUG_TRIE_COMPILE_MORE_r({
1029 /* print out the table precompression. */
1031 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1032 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1034 for( state=1 ; state < next_alloc ; state ++ ) {
1037 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1038 if ( ! trie->states[ state ].wordnum ) {
1039 PerlIO_printf( Perl_debug_log, "%5s| ","");
1041 PerlIO_printf( Perl_debug_log, "W%04x| ",
1042 trie->states[ state ].wordnum
1045 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1046 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1047 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1048 SvPV_nolen_const( *tmp ),
1049 TRIE_LIST_ITEM(state,charid).forid,
1050 (UV)TRIE_LIST_ITEM(state,charid).newstate
1055 PerlIO_printf( Perl_debug_log, "\n\n" );
1058 Newxz( trie->trans, transcount ,reg_trie_trans );
1065 for( state=1 ; state < next_alloc ; state ++ ) {
1069 DEBUG_TRIE_COMPILE_MORE_r(
1070 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1074 if (trie->states[state].trans.list) {
1075 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1079 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1080 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1081 if ( forid < minid ) {
1083 } else if ( forid > maxid ) {
1087 if ( transcount < tp + maxid - minid + 1) {
1089 Renew( trie->trans, transcount, reg_trie_trans );
1090 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1092 base = trie->uniquecharcount + tp - minid;
1093 if ( maxid == minid ) {
1095 for ( ; zp < tp ; zp++ ) {
1096 if ( ! trie->trans[ zp ].next ) {
1097 base = trie->uniquecharcount + zp - minid;
1098 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1099 trie->trans[ zp ].check = state;
1105 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1106 trie->trans[ tp ].check = state;
1111 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1112 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1113 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1114 trie->trans[ tid ].check = state;
1116 tp += ( maxid - minid + 1 );
1118 Safefree(trie->states[ state ].trans.list);
1121 DEBUG_TRIE_COMPILE_MORE_r(
1122 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1125 trie->states[ state ].trans.base=base;
1127 trie->lasttrans = tp + 1;
1131 Second Pass -- Flat Table Representation.
1133 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1134 We know that we will need Charcount+1 trans at most to store the data
1135 (one row per char at worst case) So we preallocate both structures
1136 assuming worst case.
1138 We then construct the trie using only the .next slots of the entry
1141 We use the .check field of the first entry of the node temporarily to
1142 make compression both faster and easier by keeping track of how many non
1143 zero fields are in the node.
1145 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1148 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1149 number representing the first entry of the node, and state as a
1150 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1151 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1152 are 2 entrys per node. eg:
1160 The table is internally in the right hand, idx form. However as we also
1161 have to deal with the states array which is indexed by nodenum we have to
1162 use TRIE_NODENUM() to convert.
1166 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1168 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1169 next_alloc = trie->uniquecharcount + 1;
1171 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1173 regnode * const noper = NEXTOPER( cur );
1174 const U8 *uc = (U8*)STRING( noper );
1175 const U8 * const e = uc + STR_LEN( noper );
1177 U32 state = 1; /* required init */
1179 U16 charid = 0; /* sanity init */
1180 U32 accept_state = 0; /* sanity init */
1181 U8 *scan = (U8*)NULL; /* sanity init */
1183 STRLEN foldlen = 0; /* required init */
1184 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1187 for ( ; uc < e ; uc += len ) {
1192 charid = trie->charmap[ uvc ];
1194 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1195 charid = svpp ? (U16)SvIV(*svpp) : 0;
1199 if ( !trie->trans[ state + charid ].next ) {
1200 trie->trans[ state + charid ].next = next_alloc;
1201 trie->trans[ state ].check++;
1202 next_alloc += trie->uniquecharcount;
1204 state = trie->trans[ state + charid ].next;
1206 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1208 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1211 accept_state = TRIE_NODENUM( state );
1212 if ( !trie->states[ accept_state ].wordnum ) {
1213 /* we havent inserted this word into the structure yet. */
1214 trie->states[ accept_state ].wordnum = ++curword;
1217 /* store the word for dumping */
1218 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1219 if ( UTF ) SvUTF8_on( tmp );
1220 av_push( trie->words, tmp );
1224 /*EMPTY*/; /* Its a dupe. So ignore it. */
1227 } /* end second pass */
1229 DEBUG_TRIE_COMPILE_MORE_r({
1231 print out the table precompression so that we can do a visual check
1232 that they are identical.
1236 PerlIO_printf( Perl_debug_log, "\nChar : " );
1238 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1239 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1241 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1245 PerlIO_printf( Perl_debug_log, "\nState+-" );
1247 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1248 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1251 PerlIO_printf( Perl_debug_log, "\n" );
1253 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1255 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1257 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1258 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1259 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1261 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1262 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1264 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1265 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1268 PerlIO_printf( Perl_debug_log, "\n\n" );
1272 * Inplace compress the table.*
1274 For sparse data sets the table constructed by the trie algorithm will
1275 be mostly 0/FAIL transitions or to put it another way mostly empty.
1276 (Note that leaf nodes will not contain any transitions.)
1278 This algorithm compresses the tables by eliminating most such
1279 transitions, at the cost of a modest bit of extra work during lookup:
1281 - Each states[] entry contains a .base field which indicates the
1282 index in the state[] array wheres its transition data is stored.
1284 - If .base is 0 there are no valid transitions from that node.
1286 - If .base is nonzero then charid is added to it to find an entry in
1289 -If trans[states[state].base+charid].check!=state then the
1290 transition is taken to be a 0/Fail transition. Thus if there are fail
1291 transitions at the front of the node then the .base offset will point
1292 somewhere inside the previous nodes data (or maybe even into a node
1293 even earlier), but the .check field determines if the transition is
1296 The following process inplace converts the table to the compressed
1297 table: We first do not compress the root node 1,and mark its all its
1298 .check pointers as 1 and set its .base pointer as 1 as well. This
1299 allows to do a DFA construction from the compressed table later, and
1300 ensures that any .base pointers we calculate later are greater than
1303 - We set 'pos' to indicate the first entry of the second node.
1305 - We then iterate over the columns of the node, finding the first and
1306 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1307 and set the .check pointers accordingly, and advance pos
1308 appropriately and repreat for the next node. Note that when we copy
1309 the next pointers we have to convert them from the original
1310 NODEIDX form to NODENUM form as the former is not valid post
1313 - If a node has no transitions used we mark its base as 0 and do not
1314 advance the pos pointer.
1316 - If a node only has one transition we use a second pointer into the
1317 structure to fill in allocated fail transitions from other states.
1318 This pointer is independent of the main pointer and scans forward
1319 looking for null transitions that are allocated to a state. When it
1320 finds one it writes the single transition into the "hole". If the
1321 pointer doesnt find one the single transition is appeneded as normal.
1323 - Once compressed we can Renew/realloc the structures to release the
1326 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1327 specifically Fig 3.47 and the associated pseudocode.
1331 const U32 laststate = TRIE_NODENUM( next_alloc );
1334 trie->laststate = laststate;
1336 for ( state = 1 ; state < laststate ; state++ ) {
1338 const U32 stateidx = TRIE_NODEIDX( state );
1339 const U32 o_used = trie->trans[ stateidx ].check;
1340 U32 used = trie->trans[ stateidx ].check;
1341 trie->trans[ stateidx ].check = 0;
1343 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1344 if ( flag || trie->trans[ stateidx + charid ].next ) {
1345 if ( trie->trans[ stateidx + charid ].next ) {
1347 for ( ; zp < pos ; zp++ ) {
1348 if ( ! trie->trans[ zp ].next ) {
1352 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1353 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1354 trie->trans[ zp ].check = state;
1355 if ( ++zp > pos ) pos = zp;
1362 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1364 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1365 trie->trans[ pos ].check = state;
1370 trie->lasttrans = pos + 1;
1371 Renew( trie->states, laststate + 1, reg_trie_state);
1372 DEBUG_TRIE_COMPILE_MORE_r(
1373 PerlIO_printf( Perl_debug_log,
1374 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1375 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1378 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1381 } /* end table compress */
1383 /* resize the trans array to remove unused space */
1384 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1386 DEBUG_TRIE_COMPILE_r({
1389 Now we print it out again, in a slightly different form as there is additional
1390 info we want to be able to see when its compressed. They are close enough for
1391 visual comparison though.
1393 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1395 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1396 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1398 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1401 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1403 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1404 PerlIO_printf( Perl_debug_log, "-----");
1405 PerlIO_printf( Perl_debug_log, "\n");
1407 for( state = 1 ; state < trie->laststate ; state++ ) {
1408 const U32 base = trie->states[ state ].trans.base;
1410 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1412 if ( trie->states[ state ].wordnum ) {
1413 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1415 PerlIO_printf( Perl_debug_log, "%6s", "" );
1418 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1423 while( ( base + ofs < trie->uniquecharcount ) ||
1424 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1425 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1428 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1430 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1431 if ( ( base + ofs >= trie->uniquecharcount ) &&
1432 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1433 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1435 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1436 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1438 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1442 PerlIO_printf( Perl_debug_log, "]");
1445 PerlIO_printf( Perl_debug_log, "\n" );
1450 /* now finally we "stitch in" the new TRIE node
1451 This means we convert either the first branch or the first Exact,
1452 depending on whether the thing following (in 'last') is a branch
1453 or not and whther first is the startbranch (ie is it a sub part of
1454 the alternation or is it the whole thing.)
1455 Assuming its a sub part we conver the EXACT otherwise we convert
1456 the whole branch sequence, including the first.
1463 if ( first == startbranch && OP( last ) != BRANCH ) {
1466 convert = NEXTOPER( first );
1467 NEXT_OFF( first ) = (U16)(last - first);
1470 OP( convert ) = TRIE + (U8)( flags - EXACT );
1471 NEXT_OFF( convert ) = (U16)(tail - convert);
1472 ARG_SET( convert, data_slot );
1474 /* tells us if we need to handle accept buffers specially */
1475 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1478 /* needed for dumping*/
1480 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1481 /* We now need to mark all of the space originally used by the
1482 branches as optimized away. This keeps the dumpuntil from
1483 throwing a wobbly as it doesnt use regnext() to traverse the
1486 while( optimize < last ) {
1487 OP( optimize ) = OPTIMIZED;
1491 } /* end node insert */
1498 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1499 * These need to be revisited when a newer toolchain becomes available.
1501 #if defined(__sparc64__) && defined(__GNUC__)
1502 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1503 # undef SPARC64_GCC_WORKAROUND
1504 # define SPARC64_GCC_WORKAROUND 1
1508 /* REx optimizer. Converts nodes into quickier variants "in place".
1509 Finds fixed substrings. */
1511 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1512 to the position after last scanned or to NULL. */
1516 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1517 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1518 /* scanp: Start here (read-write). */
1519 /* deltap: Write maxlen-minlen here. */
1520 /* last: Stop before this one. */
1523 I32 min = 0, pars = 0, code;
1524 regnode *scan = *scanp, *next;
1526 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1527 int is_inf_internal = 0; /* The studied chunk is infinite */
1528 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1529 scan_data_t data_fake;
1530 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1531 SV *re_trie_maxbuff = NULL;
1533 GET_RE_DEBUG_FLAGS_DECL;
1535 while (scan && OP(scan) != END && scan < last) {
1536 /* Peephole optimizer: */
1538 SV * const mysv=sv_newmortal();
1539 regprop( mysv, scan);
1540 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1541 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1544 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1545 /* Merge several consecutive EXACTish nodes into one. */
1546 regnode *n = regnext(scan);
1549 regnode *stop = scan;
1552 next = scan + NODE_SZ_STR(scan);
1553 /* Skip NOTHING, merge EXACT*. */
1555 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1556 (stringok && (OP(n) == OP(scan))))
1558 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1559 if (OP(n) == TAIL || n > next)
1561 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1562 NEXT_OFF(scan) += NEXT_OFF(n);
1563 next = n + NODE_STEP_REGNODE;
1570 else if (stringok) {
1571 const int oldl = STR_LEN(scan);
1572 regnode * const nnext = regnext(n);
1574 if (oldl + STR_LEN(n) > U8_MAX)
1576 NEXT_OFF(scan) += NEXT_OFF(n);
1577 STR_LEN(scan) += STR_LEN(n);
1578 next = n + NODE_SZ_STR(n);
1579 /* Now we can overwrite *n : */
1580 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1588 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1590 Two problematic code points in Unicode casefolding of EXACT nodes:
1592 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1593 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1599 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1600 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1602 This means that in case-insensitive matching (or "loose matching",
1603 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1604 length of the above casefolded versions) can match a target string
1605 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1606 This would rather mess up the minimum length computation.
1608 What we'll do is to look for the tail four bytes, and then peek
1609 at the preceding two bytes to see whether we need to decrease
1610 the minimum length by four (six minus two).
1612 Thanks to the design of UTF-8, there cannot be false matches:
1613 A sequence of valid UTF-8 bytes cannot be a subsequence of
1614 another valid sequence of UTF-8 bytes.
1617 char * const s0 = STRING(scan), *s, *t;
1618 char * const s1 = s0 + STR_LEN(scan) - 1;
1619 char * const s2 = s1 - 4;
1620 const char * const t0 = "\xcc\x88\xcc\x81";
1621 const char * const t1 = t0 + 3;
1624 s < s2 && (t = ninstr(s, s1, t0, t1));
1626 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1627 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1634 n = scan + NODE_SZ_STR(scan);
1636 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1647 /* Follow the next-chain of the current node and optimize
1648 away all the NOTHINGs from it. */
1649 if (OP(scan) != CURLYX) {
1650 const int max = (reg_off_by_arg[OP(scan)]
1652 /* I32 may be smaller than U16 on CRAYs! */
1653 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1654 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1658 /* Skip NOTHING and LONGJMP. */
1659 while ((n = regnext(n))
1660 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1661 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1662 && off + noff < max)
1664 if (reg_off_by_arg[OP(scan)])
1667 NEXT_OFF(scan) = off;
1670 /* The principal pseudo-switch. Cannot be a switch, since we
1671 look into several different things. */
1672 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1673 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1674 next = regnext(scan);
1676 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1678 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1679 I32 max1 = 0, min1 = I32_MAX, num = 0;
1680 struct regnode_charclass_class accum;
1681 regnode *startbranch=scan;
1683 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1684 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1685 if (flags & SCF_DO_STCLASS)
1686 cl_init_zero(pRExC_state, &accum);
1688 while (OP(scan) == code) {
1689 I32 deltanext, minnext, f = 0, fake;
1690 struct regnode_charclass_class this_class;
1693 data_fake.flags = 0;
1695 data_fake.whilem_c = data->whilem_c;
1696 data_fake.last_closep = data->last_closep;
1699 data_fake.last_closep = &fake;
1700 next = regnext(scan);
1701 scan = NEXTOPER(scan);
1703 scan = NEXTOPER(scan);
1704 if (flags & SCF_DO_STCLASS) {
1705 cl_init(pRExC_state, &this_class);
1706 data_fake.start_class = &this_class;
1707 f = SCF_DO_STCLASS_AND;
1709 if (flags & SCF_WHILEM_VISITED_POS)
1710 f |= SCF_WHILEM_VISITED_POS;
1712 /* we suppose the run is continuous, last=next...*/
1713 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1714 next, &data_fake, f,depth+1);
1717 if (max1 < minnext + deltanext)
1718 max1 = minnext + deltanext;
1719 if (deltanext == I32_MAX)
1720 is_inf = is_inf_internal = 1;
1722 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1724 if (data && (data_fake.flags & SF_HAS_EVAL))
1725 data->flags |= SF_HAS_EVAL;
1727 data->whilem_c = data_fake.whilem_c;
1728 if (flags & SCF_DO_STCLASS)
1729 cl_or(pRExC_state, &accum, &this_class);
1730 if (code == SUSPEND)
1733 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1735 if (flags & SCF_DO_SUBSTR) {
1736 data->pos_min += min1;
1737 data->pos_delta += max1 - min1;
1738 if (max1 != min1 || is_inf)
1739 data->longest = &(data->longest_float);
1742 delta += max1 - min1;
1743 if (flags & SCF_DO_STCLASS_OR) {
1744 cl_or(pRExC_state, data->start_class, &accum);
1746 cl_and(data->start_class, &and_with);
1747 flags &= ~SCF_DO_STCLASS;
1750 else if (flags & SCF_DO_STCLASS_AND) {
1752 cl_and(data->start_class, &accum);
1753 flags &= ~SCF_DO_STCLASS;
1756 /* Switch to OR mode: cache the old value of
1757 * data->start_class */
1758 StructCopy(data->start_class, &and_with,
1759 struct regnode_charclass_class);
1760 flags &= ~SCF_DO_STCLASS_AND;
1761 StructCopy(&accum, data->start_class,
1762 struct regnode_charclass_class);
1763 flags |= SCF_DO_STCLASS_OR;
1764 data->start_class->flags |= ANYOF_EOS;
1770 Assuming this was/is a branch we are dealing with: 'scan' now
1771 points at the item that follows the branch sequence, whatever
1772 it is. We now start at the beginning of the sequence and look
1778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1780 If we can find such a subseqence we need to turn the first
1781 element into a trie and then add the subsequent branch exact
1782 strings to the trie.
1786 1. patterns where the whole set of branch can be converted to a trie,
1788 2. patterns where only a subset of the alternations can be
1789 converted to a trie.
1791 In case 1 we can replace the whole set with a single regop
1792 for the trie. In case 2 we need to keep the start and end
1795 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1796 becomes BRANCH TRIE; BRANCH X;
1798 Hypthetically when we know the regex isnt anchored we can
1799 turn a case 1 into a DFA and let it rip... Every time it finds a match
1800 it would just call its tail, no WHILEM/CURLY needed.
1804 if (!re_trie_maxbuff) {
1805 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1806 if (!SvIOK(re_trie_maxbuff))
1807 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1809 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1811 regnode *first = (regnode *)NULL;
1812 regnode *last = (regnode *)NULL;
1813 regnode *tail = scan;
1818 SV * const mysv = sv_newmortal(); /* for dumping */
1820 /* var tail is used because there may be a TAIL
1821 regop in the way. Ie, the exacts will point to the
1822 thing following the TAIL, but the last branch will
1823 point at the TAIL. So we advance tail. If we
1824 have nested (?:) we may have to move through several
1828 while ( OP( tail ) == TAIL ) {
1829 /* this is the TAIL generated by (?:) */
1830 tail = regnext( tail );
1834 regprop( mysv, tail );
1835 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1836 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1837 (RExC_seen_evals) ? "[EVAL]" : ""
1842 step through the branches, cur represents each
1843 branch, noper is the first thing to be matched
1844 as part of that branch and noper_next is the
1845 regnext() of that node. if noper is an EXACT
1846 and noper_next is the same as scan (our current
1847 position in the regex) then the EXACT branch is
1848 a possible optimization target. Once we have
1849 two or more consequetive such branches we can
1850 create a trie of the EXACT's contents and stich
1851 it in place. If the sequence represents all of
1852 the branches we eliminate the whole thing and
1853 replace it with a single TRIE. If it is a
1854 subsequence then we need to stitch it in. This
1855 means the first branch has to remain, and needs
1856 to be repointed at the item on the branch chain
1857 following the last branch optimized. This could
1858 be either a BRANCH, in which case the
1859 subsequence is internal, or it could be the
1860 item following the branch sequence in which
1861 case the subsequence is at the end.
1865 /* dont use tail as the end marker for this traverse */
1866 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1867 regnode * const noper = NEXTOPER( cur );
1868 regnode * const noper_next = regnext( noper );
1871 regprop( mysv, cur);
1872 PerlIO_printf( Perl_debug_log, "%*s%s",
1873 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1875 regprop( mysv, noper);
1876 PerlIO_printf( Perl_debug_log, " -> %s",
1877 SvPV_nolen_const(mysv));
1880 regprop( mysv, noper_next );
1881 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1882 SvPV_nolen_const(mysv));
1884 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1887 if ( ( first ? OP( noper ) == optype
1888 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1889 && noper_next == tail && count<U16_MAX)
1894 optype = OP( noper );
1898 regprop( mysv, first);
1899 PerlIO_printf( Perl_debug_log, "%*s%s",
1900 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1901 regprop( mysv, NEXTOPER(first) );
1902 PerlIO_printf( Perl_debug_log, " -> %s\n",
1903 SvPV_nolen_const( mysv ) );
1908 regprop( mysv, cur);
1909 PerlIO_printf( Perl_debug_log, "%*s%s",
1910 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1911 regprop( mysv, noper );
1912 PerlIO_printf( Perl_debug_log, " -> %s\n",
1913 SvPV_nolen_const( mysv ) );
1919 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1920 (int)depth * 2 + 2, "E:", "**END**" );
1922 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1924 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1925 && noper_next == tail )
1929 optype = OP( noper );
1939 regprop( mysv, cur);
1940 PerlIO_printf( Perl_debug_log,
1941 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1942 " ", SvPV_nolen_const( mysv ), first, last, cur);
1947 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1948 (int)depth * 2 + 2, "E:", "==END==" );
1950 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1956 scan = NEXTOPER(NEXTOPER(scan));
1957 } else /* single branch is optimized. */
1958 scan = NEXTOPER(scan);
1961 else if (OP(scan) == EXACT) {
1962 I32 l = STR_LEN(scan);
1965 const U8 * const s = (U8*)STRING(scan);
1966 l = utf8_length(s, s + l);
1967 uc = utf8_to_uvchr(s, NULL);
1969 uc = *((U8*)STRING(scan));
1972 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1973 /* The code below prefers earlier match for fixed
1974 offset, later match for variable offset. */
1975 if (data->last_end == -1) { /* Update the start info. */
1976 data->last_start_min = data->pos_min;
1977 data->last_start_max = is_inf
1978 ? I32_MAX : data->pos_min + data->pos_delta;
1980 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
1982 SvUTF8_on(data->last_found);
1984 SV * const sv = data->last_found;
1985 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1986 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1987 if (mg && mg->mg_len >= 0)
1988 mg->mg_len += utf8_length((U8*)STRING(scan),
1989 (U8*)STRING(scan)+STR_LEN(scan));
1991 data->last_end = data->pos_min + l;
1992 data->pos_min += l; /* As in the first entry. */
1993 data->flags &= ~SF_BEFORE_EOL;
1995 if (flags & SCF_DO_STCLASS_AND) {
1996 /* Check whether it is compatible with what we know already! */
2000 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2001 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2002 && (!(data->start_class->flags & ANYOF_FOLD)
2003 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2006 ANYOF_CLASS_ZERO(data->start_class);
2007 ANYOF_BITMAP_ZERO(data->start_class);
2009 ANYOF_BITMAP_SET(data->start_class, uc);
2010 data->start_class->flags &= ~ANYOF_EOS;
2012 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2014 else if (flags & SCF_DO_STCLASS_OR) {
2015 /* false positive possible if the class is case-folded */
2017 ANYOF_BITMAP_SET(data->start_class, uc);
2019 data->start_class->flags |= ANYOF_UNICODE_ALL;
2020 data->start_class->flags &= ~ANYOF_EOS;
2021 cl_and(data->start_class, &and_with);
2023 flags &= ~SCF_DO_STCLASS;
2025 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2026 I32 l = STR_LEN(scan);
2027 UV uc = *((U8*)STRING(scan));
2029 /* Search for fixed substrings supports EXACT only. */
2030 if (flags & SCF_DO_SUBSTR)
2031 scan_commit(pRExC_state, data);
2033 const U8 * const s = (U8 *)STRING(scan);
2034 l = utf8_length(s, s + l);
2035 uc = utf8_to_uvchr(s, NULL);
2038 if (data && (flags & SCF_DO_SUBSTR))
2040 if (flags & SCF_DO_STCLASS_AND) {
2041 /* Check whether it is compatible with what we know already! */
2045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2049 ANYOF_CLASS_ZERO(data->start_class);
2050 ANYOF_BITMAP_ZERO(data->start_class);
2052 ANYOF_BITMAP_SET(data->start_class, uc);
2053 data->start_class->flags &= ~ANYOF_EOS;
2054 data->start_class->flags |= ANYOF_FOLD;
2055 if (OP(scan) == EXACTFL)
2056 data->start_class->flags |= ANYOF_LOCALE;
2059 else if (flags & SCF_DO_STCLASS_OR) {
2060 if (data->start_class->flags & ANYOF_FOLD) {
2061 /* false positive possible if the class is case-folded.
2062 Assume that the locale settings are the same... */
2064 ANYOF_BITMAP_SET(data->start_class, uc);
2065 data->start_class->flags &= ~ANYOF_EOS;
2067 cl_and(data->start_class, &and_with);
2069 flags &= ~SCF_DO_STCLASS;
2071 else if (strchr((const char*)PL_varies,OP(scan))) {
2072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2073 I32 f = flags, pos_before = 0;
2074 regnode *oscan = scan;
2075 struct regnode_charclass_class this_class;
2076 struct regnode_charclass_class *oclass = NULL;
2077 I32 next_is_eval = 0;
2079 switch (PL_regkind[(U8)OP(scan)]) {
2080 case WHILEM: /* End of (?:...)* . */
2081 scan = NEXTOPER(scan);
2084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2085 next = NEXTOPER(scan);
2086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2088 maxcount = REG_INFTY;
2089 next = regnext(scan);
2090 scan = NEXTOPER(scan);
2094 if (flags & SCF_DO_SUBSTR)
2099 if (flags & SCF_DO_STCLASS) {
2101 maxcount = REG_INFTY;
2102 next = regnext(scan);
2103 scan = NEXTOPER(scan);
2106 is_inf = is_inf_internal = 1;
2107 scan = regnext(scan);
2108 if (flags & SCF_DO_SUBSTR) {
2109 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2110 data->longest = &(data->longest_float);
2112 goto optimize_curly_tail;
2114 mincount = ARG1(scan);
2115 maxcount = ARG2(scan);
2116 next = regnext(scan);
2117 if (OP(scan) == CURLYX) {
2118 I32 lp = (data ? *(data->last_closep) : 0);
2119 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2121 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2122 next_is_eval = (OP(scan) == EVAL);
2124 if (flags & SCF_DO_SUBSTR) {
2125 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2126 pos_before = data->pos_min;
2130 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2132 data->flags |= SF_IS_INF;
2134 if (flags & SCF_DO_STCLASS) {
2135 cl_init(pRExC_state, &this_class);
2136 oclass = data->start_class;
2137 data->start_class = &this_class;
2138 f |= SCF_DO_STCLASS_AND;
2139 f &= ~SCF_DO_STCLASS_OR;
2141 /* These are the cases when once a subexpression
2142 fails at a particular position, it cannot succeed
2143 even after backtracking at the enclosing scope.
2145 XXXX what if minimal match and we are at the
2146 initial run of {n,m}? */
2147 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2148 f &= ~SCF_WHILEM_VISITED_POS;
2150 /* This will finish on WHILEM, setting scan, or on NULL: */
2151 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2153 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2155 if (flags & SCF_DO_STCLASS)
2156 data->start_class = oclass;
2157 if (mincount == 0 || minnext == 0) {
2158 if (flags & SCF_DO_STCLASS_OR) {
2159 cl_or(pRExC_state, data->start_class, &this_class);
2161 else if (flags & SCF_DO_STCLASS_AND) {
2162 /* Switch to OR mode: cache the old value of
2163 * data->start_class */
2164 StructCopy(data->start_class, &and_with,
2165 struct regnode_charclass_class);
2166 flags &= ~SCF_DO_STCLASS_AND;
2167 StructCopy(&this_class, data->start_class,
2168 struct regnode_charclass_class);
2169 flags |= SCF_DO_STCLASS_OR;
2170 data->start_class->flags |= ANYOF_EOS;
2172 } else { /* Non-zero len */
2173 if (flags & SCF_DO_STCLASS_OR) {
2174 cl_or(pRExC_state, data->start_class, &this_class);
2175 cl_and(data->start_class, &and_with);
2177 else if (flags & SCF_DO_STCLASS_AND)
2178 cl_and(data->start_class, &this_class);
2179 flags &= ~SCF_DO_STCLASS;
2181 if (!scan) /* It was not CURLYX, but CURLY. */
2183 if ( /* ? quantifier ok, except for (?{ ... }) */
2184 (next_is_eval || !(mincount == 0 && maxcount == 1))
2185 && (minnext == 0) && (deltanext == 0)
2186 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2187 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2188 && ckWARN(WARN_REGEXP))
2191 "Quantifier unexpected on zero-length expression");
2194 min += minnext * mincount;
2195 is_inf_internal |= ((maxcount == REG_INFTY
2196 && (minnext + deltanext) > 0)
2197 || deltanext == I32_MAX);
2198 is_inf |= is_inf_internal;
2199 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2201 /* Try powerful optimization CURLYX => CURLYN. */
2202 if ( OP(oscan) == CURLYX && data
2203 && data->flags & SF_IN_PAR
2204 && !(data->flags & SF_HAS_EVAL)
2205 && !deltanext && minnext == 1 ) {
2206 /* Try to optimize to CURLYN. */
2207 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2208 regnode *nxt1 = nxt;
2215 if (!strchr((const char*)PL_simple,OP(nxt))
2216 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2217 && STR_LEN(nxt) == 1))
2223 if (OP(nxt) != CLOSE)
2225 /* Now we know that nxt2 is the only contents: */
2226 oscan->flags = (U8)ARG(nxt);
2228 OP(nxt1) = NOTHING; /* was OPEN. */
2230 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2231 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2232 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2233 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2234 OP(nxt + 1) = OPTIMIZED; /* was count. */
2235 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2240 /* Try optimization CURLYX => CURLYM. */
2241 if ( OP(oscan) == CURLYX && data
2242 && !(data->flags & SF_HAS_PAR)
2243 && !(data->flags & SF_HAS_EVAL)
2244 && !deltanext /* atom is fixed width */
2245 && minnext != 0 /* CURLYM can't handle zero width */
2247 /* XXXX How to optimize if data == 0? */
2248 /* Optimize to a simpler form. */
2249 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2253 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2254 && (OP(nxt2) != WHILEM))
2256 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2257 /* Need to optimize away parenths. */
2258 if (data->flags & SF_IN_PAR) {
2259 /* Set the parenth number. */
2260 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2262 if (OP(nxt) != CLOSE)
2263 FAIL("Panic opt close");
2264 oscan->flags = (U8)ARG(nxt);
2265 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2266 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2268 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2271 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2274 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2275 regnode *nnxt = regnext(nxt1);
2278 if (reg_off_by_arg[OP(nxt1)])
2279 ARG_SET(nxt1, nxt2 - nxt1);
2280 else if (nxt2 - nxt1 < U16_MAX)
2281 NEXT_OFF(nxt1) = nxt2 - nxt1;
2283 OP(nxt) = NOTHING; /* Cannot beautify */
2288 /* Optimize again: */
2289 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2295 else if ((OP(oscan) == CURLYX)
2296 && (flags & SCF_WHILEM_VISITED_POS)
2297 /* See the comment on a similar expression above.
2298 However, this time it not a subexpression
2299 we care about, but the expression itself. */
2300 && (maxcount == REG_INFTY)
2301 && data && ++data->whilem_c < 16) {
2302 /* This stays as CURLYX, we can put the count/of pair. */
2303 /* Find WHILEM (as in regexec.c) */
2304 regnode *nxt = oscan + NEXT_OFF(oscan);
2306 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2308 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2309 | (RExC_whilem_seen << 4)); /* On WHILEM */
2311 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2313 if (flags & SCF_DO_SUBSTR) {
2314 SV *last_str = NULL;
2315 int counted = mincount != 0;
2317 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2318 #if defined(SPARC64_GCC_WORKAROUND)
2321 const char *s = NULL;
2324 if (pos_before >= data->last_start_min)
2327 b = data->last_start_min;
2330 s = SvPV_const(data->last_found, l);
2331 old = b - data->last_start_min;
2334 I32 b = pos_before >= data->last_start_min
2335 ? pos_before : data->last_start_min;
2337 const char *s = SvPV_const(data->last_found, l);
2338 I32 old = b - data->last_start_min;
2342 old = utf8_hop((U8*)s, old) - (U8*)s;
2345 /* Get the added string: */
2346 last_str = newSVpvn(s + old, l);
2348 SvUTF8_on(last_str);
2349 if (deltanext == 0 && pos_before == b) {
2350 /* What was added is a constant string */
2352 SvGROW(last_str, (mincount * l) + 1);
2353 repeatcpy(SvPVX(last_str) + l,
2354 SvPVX_const(last_str), l, mincount - 1);
2355 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2356 /* Add additional parts. */
2357 SvCUR_set(data->last_found,
2358 SvCUR(data->last_found) - l);
2359 sv_catsv(data->last_found, last_str);
2361 SV * sv = data->last_found;
2363 SvUTF8(sv) && SvMAGICAL(sv) ?
2364 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2365 if (mg && mg->mg_len >= 0)
2366 mg->mg_len += CHR_SVLEN(last_str);
2368 data->last_end += l * (mincount - 1);
2371 /* start offset must point into the last copy */
2372 data->last_start_min += minnext * (mincount - 1);
2373 data->last_start_max += is_inf ? I32_MAX
2374 : (maxcount - 1) * (minnext + data->pos_delta);
2377 /* It is counted once already... */
2378 data->pos_min += minnext * (mincount - counted);
2379 data->pos_delta += - counted * deltanext +
2380 (minnext + deltanext) * maxcount - minnext * mincount;
2381 if (mincount != maxcount) {
2382 /* Cannot extend fixed substrings found inside
2384 scan_commit(pRExC_state,data);
2385 if (mincount && last_str) {
2386 SV *sv = data->last_found;
2387 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2388 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2392 sv_setsv(sv, last_str);
2393 data->last_end = data->pos_min;
2394 data->last_start_min =
2395 data->pos_min - CHR_SVLEN(last_str);
2396 data->last_start_max = is_inf
2398 : data->pos_min + data->pos_delta
2399 - CHR_SVLEN(last_str);
2401 data->longest = &(data->longest_float);
2403 SvREFCNT_dec(last_str);
2405 if (data && (fl & SF_HAS_EVAL))
2406 data->flags |= SF_HAS_EVAL;
2407 optimize_curly_tail:
2408 if (OP(oscan) != CURLYX) {
2409 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2411 NEXT_OFF(oscan) += NEXT_OFF(next);
2414 default: /* REF and CLUMP only? */
2415 if (flags & SCF_DO_SUBSTR) {
2416 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2417 data->longest = &(data->longest_float);
2419 is_inf = is_inf_internal = 1;
2420 if (flags & SCF_DO_STCLASS_OR)
2421 cl_anything(pRExC_state, data->start_class);
2422 flags &= ~SCF_DO_STCLASS;
2426 else if (strchr((const char*)PL_simple,OP(scan))) {
2429 if (flags & SCF_DO_SUBSTR) {
2430 scan_commit(pRExC_state,data);
2434 if (flags & SCF_DO_STCLASS) {
2435 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2437 /* Some of the logic below assumes that switching
2438 locale on will only add false positives. */
2439 switch (PL_regkind[(U8)OP(scan)]) {
2443 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2444 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2445 cl_anything(pRExC_state, data->start_class);
2448 if (OP(scan) == SANY)
2450 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2451 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2452 || (data->start_class->flags & ANYOF_CLASS));
2453 cl_anything(pRExC_state, data->start_class);
2455 if (flags & SCF_DO_STCLASS_AND || !value)
2456 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2459 if (flags & SCF_DO_STCLASS_AND)
2460 cl_and(data->start_class,
2461 (struct regnode_charclass_class*)scan);
2463 cl_or(pRExC_state, data->start_class,
2464 (struct regnode_charclass_class*)scan);
2467 if (flags & SCF_DO_STCLASS_AND) {
2468 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2469 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2470 for (value = 0; value < 256; value++)
2471 if (!isALNUM(value))
2472 ANYOF_BITMAP_CLEAR(data->start_class, value);
2476 if (data->start_class->flags & ANYOF_LOCALE)
2477 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2479 for (value = 0; value < 256; value++)
2481 ANYOF_BITMAP_SET(data->start_class, value);
2486 if (flags & SCF_DO_STCLASS_AND) {
2487 if (data->start_class->flags & ANYOF_LOCALE)
2488 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2492 data->start_class->flags |= ANYOF_LOCALE;
2496 if (flags & SCF_DO_STCLASS_AND) {
2497 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2499 for (value = 0; value < 256; value++)
2501 ANYOF_BITMAP_CLEAR(data->start_class, value);
2505 if (data->start_class->flags & ANYOF_LOCALE)
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2508 for (value = 0; value < 256; value++)
2509 if (!isALNUM(value))
2510 ANYOF_BITMAP_SET(data->start_class, value);
2515 if (flags & SCF_DO_STCLASS_AND) {
2516 if (data->start_class->flags & ANYOF_LOCALE)
2517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2520 data->start_class->flags |= ANYOF_LOCALE;
2521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2525 if (flags & SCF_DO_STCLASS_AND) {
2526 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2528 for (value = 0; value < 256; value++)
2529 if (!isSPACE(value))
2530 ANYOF_BITMAP_CLEAR(data->start_class, value);
2534 if (data->start_class->flags & ANYOF_LOCALE)
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2537 for (value = 0; value < 256; value++)
2539 ANYOF_BITMAP_SET(data->start_class, value);
2544 if (flags & SCF_DO_STCLASS_AND) {
2545 if (data->start_class->flags & ANYOF_LOCALE)
2546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2549 data->start_class->flags |= ANYOF_LOCALE;
2550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2554 if (flags & SCF_DO_STCLASS_AND) {
2555 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2557 for (value = 0; value < 256; value++)
2559 ANYOF_BITMAP_CLEAR(data->start_class, value);
2563 if (data->start_class->flags & ANYOF_LOCALE)
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2566 for (value = 0; value < 256; value++)
2567 if (!isSPACE(value))
2568 ANYOF_BITMAP_SET(data->start_class, value);
2573 if (flags & SCF_DO_STCLASS_AND) {
2574 if (data->start_class->flags & ANYOF_LOCALE) {
2575 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2576 for (value = 0; value < 256; value++)
2577 if (!isSPACE(value))
2578 ANYOF_BITMAP_CLEAR(data->start_class, value);
2582 data->start_class->flags |= ANYOF_LOCALE;
2583 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2587 if (flags & SCF_DO_STCLASS_AND) {
2588 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2589 for (value = 0; value < 256; value++)
2590 if (!isDIGIT(value))
2591 ANYOF_BITMAP_CLEAR(data->start_class, value);
2594 if (data->start_class->flags & ANYOF_LOCALE)
2595 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2597 for (value = 0; value < 256; value++)
2599 ANYOF_BITMAP_SET(data->start_class, value);
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2606 for (value = 0; value < 256; value++)
2608 ANYOF_BITMAP_CLEAR(data->start_class, value);
2611 if (data->start_class->flags & ANYOF_LOCALE)
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2614 for (value = 0; value < 256; value++)
2615 if (!isDIGIT(value))
2616 ANYOF_BITMAP_SET(data->start_class, value);
2621 if (flags & SCF_DO_STCLASS_OR)
2622 cl_and(data->start_class, &and_with);
2623 flags &= ~SCF_DO_STCLASS;
2626 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2627 data->flags |= (OP(scan) == MEOL
2631 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2632 /* Lookbehind, or need to calculate parens/evals/stclass: */
2633 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2634 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2635 /* Lookahead/lookbehind */
2636 I32 deltanext, minnext, fake = 0;
2638 struct regnode_charclass_class intrnl;
2641 data_fake.flags = 0;
2643 data_fake.whilem_c = data->whilem_c;
2644 data_fake.last_closep = data->last_closep;
2647 data_fake.last_closep = &fake;
2648 if ( flags & SCF_DO_STCLASS && !scan->flags
2649 && OP(scan) == IFMATCH ) { /* Lookahead */
2650 cl_init(pRExC_state, &intrnl);
2651 data_fake.start_class = &intrnl;
2652 f |= SCF_DO_STCLASS_AND;
2654 if (flags & SCF_WHILEM_VISITED_POS)
2655 f |= SCF_WHILEM_VISITED_POS;
2656 next = regnext(scan);
2657 nscan = NEXTOPER(NEXTOPER(scan));
2658 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2661 vFAIL("Variable length lookbehind not implemented");
2663 else if (minnext > U8_MAX) {
2664 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2666 scan->flags = (U8)minnext;
2668 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2670 if (data && (data_fake.flags & SF_HAS_EVAL))
2671 data->flags |= SF_HAS_EVAL;
2673 data->whilem_c = data_fake.whilem_c;
2674 if (f & SCF_DO_STCLASS_AND) {
2675 const int was = (data->start_class->flags & ANYOF_EOS);
2677 cl_and(data->start_class, &intrnl);
2679 data->start_class->flags |= ANYOF_EOS;
2682 else if (OP(scan) == OPEN) {
2685 else if (OP(scan) == CLOSE) {
2686 if ((I32)ARG(scan) == is_par) {
2687 next = regnext(scan);
2689 if ( next && (OP(next) != WHILEM) && next < last)
2690 is_par = 0; /* Disable optimization */
2693 *(data->last_closep) = ARG(scan);
2695 else if (OP(scan) == EVAL) {
2697 data->flags |= SF_HAS_EVAL;
2699 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2700 if (flags & SCF_DO_SUBSTR) {
2701 scan_commit(pRExC_state,data);
2702 data->longest = &(data->longest_float);
2704 is_inf = is_inf_internal = 1;
2705 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2706 cl_anything(pRExC_state, data->start_class);
2707 flags &= ~SCF_DO_STCLASS;
2709 /* Else: zero-length, ignore. */
2710 scan = regnext(scan);
2715 *deltap = is_inf_internal ? I32_MAX : delta;
2716 if (flags & SCF_DO_SUBSTR && is_inf)
2717 data->pos_delta = I32_MAX - data->pos_min;
2718 if (is_par > U8_MAX)
2720 if (is_par && pars==1 && data) {
2721 data->flags |= SF_IN_PAR;
2722 data->flags &= ~SF_HAS_PAR;
2724 else if (pars && data) {
2725 data->flags |= SF_HAS_PAR;
2726 data->flags &= ~SF_IN_PAR;
2728 if (flags & SCF_DO_STCLASS_OR)
2729 cl_and(data->start_class, &and_with);
2734 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2736 if (RExC_rx->data) {
2737 Renewc(RExC_rx->data,
2738 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2739 char, struct reg_data);
2740 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2741 RExC_rx->data->count += n;
2744 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2745 char, struct reg_data);
2746 Newx(RExC_rx->data->what, n, U8);
2747 RExC_rx->data->count = n;
2749 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2750 return RExC_rx->data->count - n;
2754 Perl_reginitcolors(pTHX)
2757 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2759 char *t = savepv(s);
2763 t = strchr(t, '\t');
2769 PL_colors[i] = t = (char *)"";
2774 PL_colors[i++] = (char *)"";
2781 - pregcomp - compile a regular expression into internal code
2783 * We can't allocate space until we know how big the compiled form will be,
2784 * but we can't compile it (and thus know how big it is) until we've got a
2785 * place to put the code. So we cheat: we compile it twice, once with code
2786 * generation turned off and size counting turned on, and once "for real".
2787 * This also means that we don't allocate space until we are sure that the
2788 * thing really will compile successfully, and we never have to move the
2789 * code and thus invalidate pointers into it. (Note that it has to be in
2790 * one piece because free() must be able to free it all.) [NB: not true in perl]
2792 * Beware that the optimization-preparation code in here knows about some
2793 * of the structure of the compiled regexp. [I'll say.]
2796 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2807 RExC_state_t RExC_state;
2808 RExC_state_t *pRExC_state = &RExC_state;
2810 GET_RE_DEBUG_FLAGS_DECL;
2813 FAIL("NULL regexp argument");
2815 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2818 DEBUG_r(if (!PL_colorset) reginitcolors());
2820 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2821 PL_colors[4],PL_colors[5],PL_colors[0],
2822 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2824 RExC_flags = pm->op_pmflags;
2828 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2829 RExC_seen_evals = 0;
2832 /* First pass: determine size, legality. */
2839 RExC_emit = &PL_regdummy;
2840 RExC_whilem_seen = 0;
2841 #if 0 /* REGC() is (currently) a NOP at the first pass.
2842 * Clever compilers notice this and complain. --jhi */
2843 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2845 if (reg(pRExC_state, 0, &flags) == NULL) {
2846 RExC_precomp = NULL;
2849 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2851 /* Small enough for pointer-storage convention?
2852 If extralen==0, this means that we will not need long jumps. */
2853 if (RExC_size >= 0x10000L && RExC_extralen)
2854 RExC_size += RExC_extralen;
2857 if (RExC_whilem_seen > 15)
2858 RExC_whilem_seen = 15;
2860 /* Allocate space and initialize. */
2861 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2864 FAIL("Regexp out of space");
2867 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2868 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2871 r->prelen = xend - exp;
2872 r->precomp = savepvn(RExC_precomp, r->prelen);
2874 #ifdef PERL_OLD_COPY_ON_WRITE
2875 r->saved_copy = NULL;
2877 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2878 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2879 r->lastparen = 0; /* mg.c reads this. */
2881 r->substrs = 0; /* Useful during FAIL. */
2882 r->startp = 0; /* Useful during FAIL. */
2883 r->endp = 0; /* Useful during FAIL. */
2885 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2887 r->offsets[0] = RExC_size;
2889 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2890 "%s %"UVuf" bytes for offset annotations.\n",
2891 r->offsets ? "Got" : "Couldn't get",
2892 (UV)((2*RExC_size+1) * sizeof(U32))));
2896 /* Second pass: emit code. */
2897 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2902 RExC_emit_start = r->program;
2903 RExC_emit = r->program;
2904 /* Store the count of eval-groups for security checks: */
2905 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2906 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2908 if (reg(pRExC_state, 0, &flags) == NULL)
2912 /* Dig out information for optimizations. */
2913 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2914 pm->op_pmflags = RExC_flags;
2916 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2917 r->regstclass = NULL;
2918 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2919 r->reganch |= ROPT_NAUGHTY;
2920 scan = r->program + 1; /* First BRANCH. */
2922 /* XXXX To minimize changes to RE engine we always allocate
2923 3-units-long substrs field. */
2924 Newxz(r->substrs, 1, struct reg_substr_data);
2926 StructCopy(&zero_scan_data, &data, scan_data_t);
2927 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2928 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2930 STRLEN longest_float_length, longest_fixed_length;
2931 struct regnode_charclass_class ch_class;
2936 /* Skip introductions and multiplicators >= 1. */
2937 while ((OP(first) == OPEN && (sawopen = 1)) ||
2938 /* An OR of *one* alternative - should not happen now. */
2939 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2940 (OP(first) == PLUS) ||
2941 (OP(first) == MINMOD) ||
2942 /* An {n,m} with n>0 */
2943 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2944 if (OP(first) == PLUS)
2947 first += regarglen[(U8)OP(first)];
2948 first = NEXTOPER(first);
2951 /* Starting-point info. */
2953 if (PL_regkind[(U8)OP(first)] == EXACT) {
2954 if (OP(first) == EXACT)
2955 /*EMPTY*/; /* Empty, get anchored substr later. */
2956 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2957 r->regstclass = first;
2959 else if (strchr((const char*)PL_simple,OP(first)))
2960 r->regstclass = first;
2961 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2962 PL_regkind[(U8)OP(first)] == NBOUND)
2963 r->regstclass = first;
2964 else if (PL_regkind[(U8)OP(first)] == BOL) {
2965 r->reganch |= (OP(first) == MBOL
2967 : (OP(first) == SBOL
2970 first = NEXTOPER(first);
2973 else if (OP(first) == GPOS) {
2974 r->reganch |= ROPT_ANCH_GPOS;
2975 first = NEXTOPER(first);
2978 else if (!sawopen && (OP(first) == STAR &&
2979 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2980 !(r->reganch & ROPT_ANCH) )
2982 /* turn .* into ^.* with an implied $*=1 */
2984 (OP(NEXTOPER(first)) == REG_ANY)
2987 r->reganch |= type | ROPT_IMPLICIT;
2988 first = NEXTOPER(first);
2991 if (sawplus && (!sawopen || !RExC_sawback)
2992 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
2993 /* x+ must match at the 1st pos of run of x's */
2994 r->reganch |= ROPT_SKIP;
2996 /* Scan is after the zeroth branch, first is atomic matcher. */
2997 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
2998 (IV)(first - scan + 1)));
3000 * If there's something expensive in the r.e., find the
3001 * longest literal string that must appear and make it the
3002 * regmust. Resolve ties in favor of later strings, since
3003 * the regstart check works with the beginning of the r.e.
3004 * and avoiding duplication strengthens checking. Not a
3005 * strong reason, but sufficient in the absence of others.
3006 * [Now we resolve ties in favor of the earlier string if
3007 * it happens that c_offset_min has been invalidated, since the
3008 * earlier string may buy us something the later one won't.]
3012 data.longest_fixed = newSVpvs("");
3013 data.longest_float = newSVpvs("");
3014 data.last_found = newSVpvs("");
3015 data.longest = &(data.longest_fixed);
3017 if (!r->regstclass) {
3018 cl_init(pRExC_state, &ch_class);
3019 data.start_class = &ch_class;
3020 stclass_flag = SCF_DO_STCLASS_AND;
3021 } else /* XXXX Check for BOUND? */
3023 data.last_closep = &last_close;
3025 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3026 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3027 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3028 && data.last_start_min == 0 && data.last_end > 0
3029 && !RExC_seen_zerolen
3030 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3031 r->reganch |= ROPT_CHECK_ALL;
3032 scan_commit(pRExC_state, &data);
3033 SvREFCNT_dec(data.last_found);
3035 longest_float_length = CHR_SVLEN(data.longest_float);
3036 if (longest_float_length
3037 || (data.flags & SF_FL_BEFORE_EOL
3038 && (!(data.flags & SF_FL_BEFORE_MEOL)
3039 || (RExC_flags & PMf_MULTILINE)))) {
3042 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3043 && data.offset_fixed == data.offset_float_min
3044 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3045 goto remove_float; /* As in (a)+. */
3047 if (SvUTF8(data.longest_float)) {
3048 r->float_utf8 = data.longest_float;
3049 r->float_substr = NULL;
3051 r->float_substr = data.longest_float;
3052 r->float_utf8 = NULL;
3054 r->float_min_offset = data.offset_float_min;
3055 r->float_max_offset = data.offset_float_max;
3056 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3057 && (!(data.flags & SF_FL_BEFORE_MEOL)
3058 || (RExC_flags & PMf_MULTILINE)));
3059 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3063 r->float_substr = r->float_utf8 = NULL;
3064 SvREFCNT_dec(data.longest_float);
3065 longest_float_length = 0;
3068 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3069 if (longest_fixed_length
3070 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3071 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3072 || (RExC_flags & PMf_MULTILINE)))) {
3075 if (SvUTF8(data.longest_fixed)) {
3076 r->anchored_utf8 = data.longest_fixed;
3077 r->anchored_substr = NULL;
3079 r->anchored_substr = data.longest_fixed;
3080 r->anchored_utf8 = NULL;
3082 r->anchored_offset = data.offset_fixed;
3083 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3084 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3085 || (RExC_flags & PMf_MULTILINE)));
3086 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3089 r->anchored_substr = r->anchored_utf8 = NULL;
3090 SvREFCNT_dec(data.longest_fixed);
3091 longest_fixed_length = 0;
3094 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3095 r->regstclass = NULL;
3096 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3098 && !(data.start_class->flags & ANYOF_EOS)
3099 && !cl_is_anything(data.start_class))
3101 const I32 n = add_data(pRExC_state, 1, "f");
3103 Newx(RExC_rx->data->data[n], 1,
3104 struct regnode_charclass_class);
3105 StructCopy(data.start_class,
3106 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3107 struct regnode_charclass_class);
3108 r->regstclass = (regnode*)RExC_rx->data->data[n];
3109 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3110 PL_regdata = r->data; /* for regprop() */
3111 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3112 regprop(sv, (regnode*)data.start_class);
3113 PerlIO_printf(Perl_debug_log,
3114 "synthetic stclass \"%s\".\n",
3115 SvPVX_const(sv));});
3118 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3119 if (longest_fixed_length > longest_float_length) {
3120 r->check_substr = r->anchored_substr;
3121 r->check_utf8 = r->anchored_utf8;
3122 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3123 if (r->reganch & ROPT_ANCH_SINGLE)
3124 r->reganch |= ROPT_NOSCAN;
3127 r->check_substr = r->float_substr;
3128 r->check_utf8 = r->float_utf8;
3129 r->check_offset_min = data.offset_float_min;
3130 r->check_offset_max = data.offset_float_max;
3132 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3133 This should be changed ASAP! */
3134 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3135 r->reganch |= RE_USE_INTUIT;
3136 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3137 r->reganch |= RE_INTUIT_TAIL;
3141 /* Several toplevels. Best we can is to set minlen. */
3143 struct regnode_charclass_class ch_class;
3146 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3147 scan = r->program + 1;
3148 cl_init(pRExC_state, &ch_class);
3149 data.start_class = &ch_class;
3150 data.last_closep = &last_close;
3151 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3152 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3153 = r->float_substr = r->float_utf8 = NULL;
3154 if (!(data.start_class->flags & ANYOF_EOS)
3155 && !cl_is_anything(data.start_class))
3157 const I32 n = add_data(pRExC_state, 1, "f");
3159 Newx(RExC_rx->data->data[n], 1,
3160 struct regnode_charclass_class);
3161 StructCopy(data.start_class,
3162 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3163 struct regnode_charclass_class);
3164 r->regstclass = (regnode*)RExC_rx->data->data[n];
3165 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3166 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3167 regprop(sv, (regnode*)data.start_class);
3168 PerlIO_printf(Perl_debug_log,
3169 "synthetic stclass \"%s\".\n",
3170 SvPVX_const(sv));});
3175 if (RExC_seen & REG_SEEN_GPOS)
3176 r->reganch |= ROPT_GPOS_SEEN;
3177 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3178 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3179 if (RExC_seen & REG_SEEN_EVAL)
3180 r->reganch |= ROPT_EVAL_SEEN;
3181 if (RExC_seen & REG_SEEN_CANY)
3182 r->reganch |= ROPT_CANY_SEEN;
3183 Newxz(r->startp, RExC_npar, I32);
3184 Newxz(r->endp, RExC_npar, I32);
3185 PL_regdata = r->data; /* for regprop() */
3186 DEBUG_COMPILE_r(regdump(r));
3191 - reg - regular expression, i.e. main body or parenthesized thing
3193 * Caller must absorb opening parenthesis.
3195 * Combining parenthesis handling with the base level of regular expression
3196 * is a trifle forced, but the need to tie the tails of the branches to what
3197 * follows makes it hard to avoid.
3200 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3201 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3204 register regnode *ret; /* Will be the head of the group. */
3205 register regnode *br;
3206 register regnode *lastbr;
3207 register regnode *ender = NULL;
3208 register I32 parno = 0;
3210 const I32 oregflags = RExC_flags;
3211 bool have_branch = 0;
3214 /* for (?g), (?gc), and (?o) warnings; warning
3215 about (?c) will warn about (?g) -- japhy */
3217 #define WASTED_O 0x01
3218 #define WASTED_G 0x02
3219 #define WASTED_C 0x04
3220 #define WASTED_GC (0x02|0x04)
3221 I32 wastedflags = 0x00;
3223 char * parse_start = RExC_parse; /* MJD */
3224 char * const oregcomp_parse = RExC_parse;
3226 *flagp = 0; /* Tentatively. */
3229 /* Make an OPEN node, if parenthesized. */
3231 if (*RExC_parse == '?') { /* (?...) */
3232 U32 posflags = 0, negflags = 0;
3233 U32 *flagsp = &posflags;
3234 bool is_logical = 0;
3235 const char * const seqstart = RExC_parse;
3238 paren = *RExC_parse++;
3239 ret = NULL; /* For look-ahead/behind. */
3241 case '<': /* (?<...) */
3242 RExC_seen |= REG_SEEN_LOOKBEHIND;
3243 if (*RExC_parse == '!')
3245 if (*RExC_parse != '=' && *RExC_parse != '!')
3248 case '=': /* (?=...) */
3249 case '!': /* (?!...) */
3250 RExC_seen_zerolen++;
3251 case ':': /* (?:...) */
3252 case '>': /* (?>...) */
3254 case '$': /* (?$...) */
3255 case '@': /* (?@...) */
3256 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3258 case '#': /* (?#...) */
3259 while (*RExC_parse && *RExC_parse != ')')
3261 if (*RExC_parse != ')')
3262 FAIL("Sequence (?#... not terminated");
3263 nextchar(pRExC_state);
3266 case 'p': /* (?p...) */
3267 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3268 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3270 case '?': /* (??...) */
3272 if (*RExC_parse != '{')
3274 paren = *RExC_parse++;
3276 case '{': /* (?{...}) */
3278 I32 count = 1, n = 0;
3280 char *s = RExC_parse;
3282 RExC_seen_zerolen++;
3283 RExC_seen |= REG_SEEN_EVAL;
3284 while (count && (c = *RExC_parse)) {
3295 if (*RExC_parse != ')') {
3297 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3301 OP_4tree *sop, *rop;
3302 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3305 Perl_save_re_context(aTHX);
3306 rop = sv_compile_2op(sv, &sop, "re", &pad);
3307 sop->op_private |= OPpREFCOUNTED;
3308 /* re_dup will OpREFCNT_inc */
3309 OpREFCNT_set(sop, 1);
3312 n = add_data(pRExC_state, 3, "nop");
3313 RExC_rx->data->data[n] = (void*)rop;
3314 RExC_rx->data->data[n+1] = (void*)sop;
3315 RExC_rx->data->data[n+2] = (void*)pad;
3318 else { /* First pass */
3319 if (PL_reginterp_cnt < ++RExC_seen_evals
3321 /* No compiled RE interpolated, has runtime
3322 components ===> unsafe. */
3323 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3324 if (PL_tainting && PL_tainted)
3325 FAIL("Eval-group in insecure regular expression");
3326 if (IN_PERL_COMPILETIME)
3330 nextchar(pRExC_state);
3332 ret = reg_node(pRExC_state, LOGICAL);
3335 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3336 /* deal with the length of this later - MJD */
3339 ret = reganode(pRExC_state, EVAL, n);
3340 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3341 Set_Node_Offset(ret, parse_start);
3344 case '(': /* (?(?{...})...) and (?(?=...)...) */
3346 if (RExC_parse[0] == '?') { /* (?(?...)) */
3347 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3348 || RExC_parse[1] == '<'
3349 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3352 ret = reg_node(pRExC_state, LOGICAL);
3355 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3359 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3362 parno = atoi(RExC_parse++);
3364 while (isDIGIT(*RExC_parse))
3366 ret = reganode(pRExC_state, GROUPP, parno);
3368 if ((c = *nextchar(pRExC_state)) != ')')
3369 vFAIL("Switch condition not recognized");
3371 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3372 br = regbranch(pRExC_state, &flags, 1);
3374 br = reganode(pRExC_state, LONGJMP, 0);
3376 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3377 c = *nextchar(pRExC_state);
3381 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3382 regbranch(pRExC_state, &flags, 1);
3383 regtail(pRExC_state, ret, lastbr);
3386 c = *nextchar(pRExC_state);
3391 vFAIL("Switch (?(condition)... contains too many branches");
3392 ender = reg_node(pRExC_state, TAIL);
3393 regtail(pRExC_state, br, ender);
3395 regtail(pRExC_state, lastbr, ender);
3396 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3399 regtail(pRExC_state, ret, ender);
3403 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3407 RExC_parse--; /* for vFAIL to print correctly */
3408 vFAIL("Sequence (? incomplete");
3412 parse_flags: /* (?i) */
3413 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3414 /* (?g), (?gc) and (?o) are useless here
3415 and must be globally applied -- japhy */
3417 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3418 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3419 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3420 if (! (wastedflags & wflagbit) ) {
3421 wastedflags |= wflagbit;
3424 "Useless (%s%c) - %suse /%c modifier",
3425 flagsp == &negflags ? "?-" : "?",
3427 flagsp == &negflags ? "don't " : "",
3433 else if (*RExC_parse == 'c') {
3434 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3435 if (! (wastedflags & WASTED_C) ) {
3436 wastedflags |= WASTED_GC;
3439 "Useless (%sc) - %suse /gc modifier",
3440 flagsp == &negflags ? "?-" : "?",
3441 flagsp == &negflags ? "don't " : ""
3446 else { pmflag(flagsp, *RExC_parse); }
3450 if (*RExC_parse == '-') {
3452 wastedflags = 0; /* reset so (?g-c) warns twice */
3456 RExC_flags |= posflags;
3457 RExC_flags &= ~negflags;
3458 if (*RExC_parse == ':') {
3464 if (*RExC_parse != ')') {
3466 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3468 nextchar(pRExC_state);
3476 ret = reganode(pRExC_state, OPEN, parno);
3477 Set_Node_Length(ret, 1); /* MJD */
3478 Set_Node_Offset(ret, RExC_parse); /* MJD */
3485 /* Pick up the branches, linking them together. */
3486 parse_start = RExC_parse; /* MJD */
3487 br = regbranch(pRExC_state, &flags, 1);
3488 /* branch_len = (paren != 0); */
3492 if (*RExC_parse == '|') {
3493 if (!SIZE_ONLY && RExC_extralen) {
3494 reginsert(pRExC_state, BRANCHJ, br);
3497 reginsert(pRExC_state, BRANCH, br);
3498 Set_Node_Length(br, paren != 0);
3499 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3503 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3505 else if (paren == ':') {
3506 *flagp |= flags&SIMPLE;
3508 if (is_open) { /* Starts with OPEN. */
3509 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3511 else if (paren != '?') /* Not Conditional */
3513 *flagp |= flags & (SPSTART | HASWIDTH);
3515 while (*RExC_parse == '|') {
3516 if (!SIZE_ONLY && RExC_extralen) {
3517 ender = reganode(pRExC_state, LONGJMP,0);
3518 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3521 RExC_extralen += 2; /* Account for LONGJMP. */
3522 nextchar(pRExC_state);
3523 br = regbranch(pRExC_state, &flags, 0);
3527 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3531 *flagp |= flags&SPSTART;
3534 if (have_branch || paren != ':') {
3535 /* Make a closing node, and hook it on the end. */
3538 ender = reg_node(pRExC_state, TAIL);
3541 ender = reganode(pRExC_state, CLOSE, parno);
3542 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3543 Set_Node_Length(ender,1); /* MJD */
3549 *flagp &= ~HASWIDTH;
3552 ender = reg_node(pRExC_state, SUCCEED);
3555 ender = reg_node(pRExC_state, END);
3558 regtail(pRExC_state, lastbr, ender);
3561 /* Hook the tails of the branches to the closing node. */
3562 for (br = ret; br != NULL; br = regnext(br)) {
3563 regoptail(pRExC_state, br, ender);
3570 static const char parens[] = "=!<,>";
3572 if (paren && (p = strchr(parens, paren))) {
3573 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3574 int flag = (p - parens) > 1;
3577 node = SUSPEND, flag = 0;
3578 reginsert(pRExC_state, node,ret);
3579 Set_Node_Cur_Length(ret);
3580 Set_Node_Offset(ret, parse_start + 1);
3582 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3586 /* Check for proper termination. */
3588 RExC_flags = oregflags;
3589 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3590 RExC_parse = oregcomp_parse;
3591 vFAIL("Unmatched (");
3594 else if (!paren && RExC_parse < RExC_end) {
3595 if (*RExC_parse == ')') {
3597 vFAIL("Unmatched )");
3600 FAIL("Junk on end of regexp"); /* "Can't happen". */
3608 - regbranch - one alternative of an | operator
3610 * Implements the concatenation operator.
3613 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3616 register regnode *ret;
3617 register regnode *chain = NULL;
3618 register regnode *latest;
3619 I32 flags = 0, c = 0;
3624 if (!SIZE_ONLY && RExC_extralen)
3625 ret = reganode(pRExC_state, BRANCHJ,0);
3627 ret = reg_node(pRExC_state, BRANCH);
3628 Set_Node_Length(ret, 1);
3632 if (!first && SIZE_ONLY)
3633 RExC_extralen += 1; /* BRANCHJ */
3635 *flagp = WORST; /* Tentatively. */
3638 nextchar(pRExC_state);
3639 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3641 latest = regpiece(pRExC_state, &flags);
3642 if (latest == NULL) {
3643 if (flags & TRYAGAIN)
3647 else if (ret == NULL)
3649 *flagp |= flags&HASWIDTH;
3650 if (chain == NULL) /* First piece. */
3651 *flagp |= flags&SPSTART;
3654 regtail(pRExC_state, chain, latest);
3659 if (chain == NULL) { /* Loop ran zero times. */
3660 chain = reg_node(pRExC_state, NOTHING);
3665 *flagp |= flags&SIMPLE;
3672 - regpiece - something followed by possible [*+?]
3674 * Note that the branching code sequences used for ? and the general cases
3675 * of * and + are somewhat optimized: they use the same NOTHING node as
3676 * both the endmarker for their branch list and the body of the last branch.
3677 * It might seem that this node could be dispensed with entirely, but the
3678 * endmarker role is not redundant.
3681 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3684 register regnode *ret;
3686 register char *next;
3688 const char * const origparse = RExC_parse;
3691 I32 max = REG_INFTY;
3694 ret = regatom(pRExC_state, &flags);
3696 if (flags & TRYAGAIN)
3703 if (op == '{' && regcurly(RExC_parse)) {
3704 parse_start = RExC_parse; /* MJD */
3705 next = RExC_parse + 1;
3707 while (isDIGIT(*next) || *next == ',') {
3716 if (*next == '}') { /* got one */
3720 min = atoi(RExC_parse);
3724 maxpos = RExC_parse;
3726 if (!max && *maxpos != '0')
3727 max = REG_INFTY; /* meaning "infinity" */
3728 else if (max >= REG_INFTY)
3729 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3731 nextchar(pRExC_state);
3734 if ((flags&SIMPLE)) {
3735 RExC_naughty += 2 + RExC_naughty / 2;
3736 reginsert(pRExC_state, CURLY, ret);
3737 Set_Node_Offset(ret, parse_start+1); /* MJD */
3738 Set_Node_Cur_Length(ret);
3741 regnode *w = reg_node(pRExC_state, WHILEM);
3744 regtail(pRExC_state, ret, w);
3745 if (!SIZE_ONLY && RExC_extralen) {
3746 reginsert(pRExC_state, LONGJMP,ret);
3747 reginsert(pRExC_state, NOTHING,ret);
3748 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3750 reginsert(pRExC_state, CURLYX,ret);
3752 Set_Node_Offset(ret, parse_start+1);
3753 Set_Node_Length(ret,
3754 op == '{' ? (RExC_parse - parse_start) : 1);
3756 if (!SIZE_ONLY && RExC_extralen)
3757 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3758 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3760 RExC_whilem_seen++, RExC_extralen += 3;
3761 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3769 if (max && max < min)
3770 vFAIL("Can't do {n,m} with n > m");
3772 ARG1_SET(ret, (U16)min);
3773 ARG2_SET(ret, (U16)max);
3785 #if 0 /* Now runtime fix should be reliable. */
3787 /* if this is reinstated, don't forget to put this back into perldiag:
3789 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3791 (F) The part of the regexp subject to either the * or + quantifier
3792 could match an empty string. The {#} shows in the regular
3793 expression about where the problem was discovered.
3797 if (!(flags&HASWIDTH) && op != '?')
3798 vFAIL("Regexp *+ operand could be empty");
3801 parse_start = RExC_parse;
3802 nextchar(pRExC_state);
3804 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3806 if (op == '*' && (flags&SIMPLE)) {
3807 reginsert(pRExC_state, STAR, ret);
3811 else if (op == '*') {
3815 else if (op == '+' && (flags&SIMPLE)) {
3816 reginsert(pRExC_state, PLUS, ret);
3820 else if (op == '+') {
3824 else if (op == '?') {
3829 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3831 "%.*s matches null string many times",
3832 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
3836 if (*RExC_parse == '?') {
3837 nextchar(pRExC_state);
3838 reginsert(pRExC_state, MINMOD, ret);
3839 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3841 if (ISMULT2(RExC_parse)) {
3843 vFAIL("Nested quantifiers");
3850 - regatom - the lowest level
3852 * Optimization: gobbles an entire sequence of ordinary characters so that
3853 * it can turn them into a single node, which is smaller to store and
3854 * faster to run. Backslashed characters are exceptions, each becoming a
3855 * separate node; the code is simpler that way and it's not worth fixing.
3857 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3859 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3862 register regnode *ret = NULL;
3864 char *parse_start = RExC_parse;
3866 *flagp = WORST; /* Tentatively. */
3869 switch (*RExC_parse) {
3871 RExC_seen_zerolen++;
3872 nextchar(pRExC_state);
3873 if (RExC_flags & PMf_MULTILINE)
3874 ret = reg_node(pRExC_state, MBOL);
3875 else if (RExC_flags & PMf_SINGLELINE)
3876 ret = reg_node(pRExC_state, SBOL);
3878 ret = reg_node(pRExC_state, BOL);
3879 Set_Node_Length(ret, 1); /* MJD */
3882 nextchar(pRExC_state);
3884 RExC_seen_zerolen++;
3885 if (RExC_flags & PMf_MULTILINE)
3886 ret = reg_node(pRExC_state, MEOL);
3887 else if (RExC_flags & PMf_SINGLELINE)
3888 ret = reg_node(pRExC_state, SEOL);
3890 ret = reg_node(pRExC_state, EOL);
3891 Set_Node_Length(ret, 1); /* MJD */
3894 nextchar(pRExC_state);
3895 if (RExC_flags & PMf_SINGLELINE)
3896 ret = reg_node(pRExC_state, SANY);
3898 ret = reg_node(pRExC_state, REG_ANY);
3899 *flagp |= HASWIDTH|SIMPLE;
3901 Set_Node_Length(ret, 1); /* MJD */
3905 char *oregcomp_parse = ++RExC_parse;
3906 ret = regclass(pRExC_state);
3907 if (*RExC_parse != ']') {
3908 RExC_parse = oregcomp_parse;
3909 vFAIL("Unmatched [");
3911 nextchar(pRExC_state);
3912 *flagp |= HASWIDTH|SIMPLE;
3913 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3917 nextchar(pRExC_state);
3918 ret = reg(pRExC_state, 1, &flags);
3920 if (flags & TRYAGAIN) {
3921 if (RExC_parse == RExC_end) {
3922 /* Make parent create an empty node if needed. */
3930 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3934 if (flags & TRYAGAIN) {
3938 vFAIL("Internal urp");
3939 /* Supposed to be caught earlier. */
3942 if (!regcurly(RExC_parse)) {
3951 vFAIL("Quantifier follows nothing");
3954 switch (*++RExC_parse) {
3956 RExC_seen_zerolen++;
3957 ret = reg_node(pRExC_state, SBOL);
3959 nextchar(pRExC_state);
3960 Set_Node_Length(ret, 2); /* MJD */
3963 ret = reg_node(pRExC_state, GPOS);
3964 RExC_seen |= REG_SEEN_GPOS;
3966 nextchar(pRExC_state);
3967 Set_Node_Length(ret, 2); /* MJD */
3970 ret = reg_node(pRExC_state, SEOL);
3972 RExC_seen_zerolen++; /* Do not optimize RE away */
3973 nextchar(pRExC_state);
3976 ret = reg_node(pRExC_state, EOS);
3978 RExC_seen_zerolen++; /* Do not optimize RE away */
3979 nextchar(pRExC_state);
3980 Set_Node_Length(ret, 2); /* MJD */
3983 ret = reg_node(pRExC_state, CANY);
3984 RExC_seen |= REG_SEEN_CANY;
3985 *flagp |= HASWIDTH|SIMPLE;
3986 nextchar(pRExC_state);
3987 Set_Node_Length(ret, 2); /* MJD */
3990 ret = reg_node(pRExC_state, CLUMP);
3992 nextchar(pRExC_state);
3993 Set_Node_Length(ret, 2); /* MJD */
3996 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
3997 *flagp |= HASWIDTH|SIMPLE;
3998 nextchar(pRExC_state);
3999 Set_Node_Length(ret, 2); /* MJD */
4002 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4003 *flagp |= HASWIDTH|SIMPLE;
4004 nextchar(pRExC_state);
4005 Set_Node_Length(ret, 2); /* MJD */
4008 RExC_seen_zerolen++;
4009 RExC_seen |= REG_SEEN_LOOKBEHIND;
4010 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4012 nextchar(pRExC_state);
4013 Set_Node_Length(ret, 2); /* MJD */
4016 RExC_seen_zerolen++;
4017 RExC_seen |= REG_SEEN_LOOKBEHIND;
4018 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4020 nextchar(pRExC_state);
4021 Set_Node_Length(ret, 2); /* MJD */
4024 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4025 *flagp |= HASWIDTH|SIMPLE;
4026 nextchar(pRExC_state);
4027 Set_Node_Length(ret, 2); /* MJD */
4030 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4031 *flagp |= HASWIDTH|SIMPLE;
4032 nextchar(pRExC_state);
4033 Set_Node_Length(ret, 2); /* MJD */
4036 ret = reg_node(pRExC_state, DIGIT);
4037 *flagp |= HASWIDTH|SIMPLE;
4038 nextchar(pRExC_state);
4039 Set_Node_Length(ret, 2); /* MJD */
4042 ret = reg_node(pRExC_state, NDIGIT);
4043 *flagp |= HASWIDTH|SIMPLE;
4044 nextchar(pRExC_state);
4045 Set_Node_Length(ret, 2); /* MJD */
4050 char* oldregxend = RExC_end;
4051 char* parse_start = RExC_parse - 2;
4053 if (RExC_parse[1] == '{') {
4054 /* a lovely hack--pretend we saw [\pX] instead */
4055 RExC_end = strchr(RExC_parse, '}');
4057 U8 c = (U8)*RExC_parse;
4059 RExC_end = oldregxend;
4060 vFAIL2("Missing right brace on \\%c{}", c);
4065 RExC_end = RExC_parse + 2;
4066 if (RExC_end > oldregxend)
4067 RExC_end = oldregxend;
4071 ret = regclass(pRExC_state);
4073 RExC_end = oldregxend;
4076 Set_Node_Offset(ret, parse_start + 2);
4077 Set_Node_Cur_Length(ret);
4078 nextchar(pRExC_state);
4079 *flagp |= HASWIDTH|SIMPLE;
4092 case '1': case '2': case '3': case '4':
4093 case '5': case '6': case '7': case '8': case '9':
4095 const I32 num = atoi(RExC_parse);
4097 if (num > 9 && num >= RExC_npar)
4100 char * parse_start = RExC_parse - 1; /* MJD */
4101 while (isDIGIT(*RExC_parse))
4104 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4105 vFAIL("Reference to nonexistent group");
4107 ret = reganode(pRExC_state,
4108 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4112 /* override incorrect value set in reganode MJD */
4113 Set_Node_Offset(ret, parse_start+1);
4114 Set_Node_Cur_Length(ret); /* MJD */
4116 nextchar(pRExC_state);
4121 if (RExC_parse >= RExC_end)
4122 FAIL("Trailing \\");
4125 /* Do not generate "unrecognized" warnings here, we fall
4126 back into the quick-grab loop below */
4133 if (RExC_flags & PMf_EXTENDED) {
4134 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4135 if (RExC_parse < RExC_end)
4141 register STRLEN len;
4146 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4148 parse_start = RExC_parse - 1;
4154 ret = reg_node(pRExC_state,
4155 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4157 for (len = 0, p = RExC_parse - 1;
4158 len < 127 && p < RExC_end;
4163 if (RExC_flags & PMf_EXTENDED)
4164 p = regwhite(p, RExC_end);
4211 ender = ASCII_TO_NATIVE('\033');
4215 ender = ASCII_TO_NATIVE('\007');
4220 char* const e = strchr(p, '}');
4224 vFAIL("Missing right brace on \\x{}");
4227 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4228 | PERL_SCAN_DISALLOW_PREFIX;
4229 STRLEN numlen = e - p - 1;
4230 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4237 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4239 ender = grok_hex(p, &numlen, &flags, NULL);
4245 ender = UCHARAT(p++);
4246 ender = toCTRL(ender);
4248 case '0': case '1': case '2': case '3':case '4':
4249 case '5': case '6': case '7': case '8':case '9':
4251 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4254 ender = grok_oct(p, &numlen, &flags, NULL);
4264 FAIL("Trailing \\");
4267 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4268 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4269 goto normal_default;
4274 if (UTF8_IS_START(*p) && UTF) {
4276 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4284 if (RExC_flags & PMf_EXTENDED)
4285 p = regwhite(p, RExC_end);
4287 /* Prime the casefolded buffer. */
4288 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4290 if (ISMULT2(p)) { /* Back off on ?+*. */
4297 /* Emit all the Unicode characters. */
4299 for (foldbuf = tmpbuf;
4301 foldlen -= numlen) {
4302 ender = utf8_to_uvchr(foldbuf, &numlen);
4304 reguni(pRExC_state, ender, s, &unilen);
4307 /* In EBCDIC the numlen
4308 * and unilen can differ. */
4310 if (numlen >= foldlen)
4314 break; /* "Can't happen." */
4318 reguni(pRExC_state, ender, s, &unilen);
4327 REGC((char)ender, s++);
4335 /* Emit all the Unicode characters. */
4337 for (foldbuf = tmpbuf;
4339 foldlen -= numlen) {
4340 ender = utf8_to_uvchr(foldbuf, &numlen);
4342 reguni(pRExC_state, ender, s, &unilen);
4345 /* In EBCDIC the numlen
4346 * and unilen can differ. */
4348 if (numlen >= foldlen)
4356 reguni(pRExC_state, ender, s, &unilen);
4365 REGC((char)ender, s++);
4369 Set_Node_Cur_Length(ret); /* MJD */
4370 nextchar(pRExC_state);
4372 /* len is STRLEN which is unsigned, need to copy to signed */
4375 vFAIL("Internal disaster");
4379 if (len == 1 && UNI_IS_INVARIANT(ender))
4384 RExC_size += STR_SZ(len);
4386 RExC_emit += STR_SZ(len);
4391 /* If the encoding pragma is in effect recode the text of
4392 * any EXACT-kind nodes. */
4393 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4394 STRLEN oldlen = STR_LEN(ret);
4395 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4399 if (sv_utf8_downgrade(sv, TRUE)) {
4400 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4401 const STRLEN newlen = SvCUR(sv);
4406 GET_RE_DEBUG_FLAGS_DECL;
4407 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4408 (int)oldlen, STRING(ret),
4410 Copy(s, STRING(ret), newlen, char);
4411 STR_LEN(ret) += newlen - oldlen;
4412 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4414 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4422 S_regwhite(char *p, const char *e)
4427 else if (*p == '#') {
4430 } while (p < e && *p != '\n');
4438 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4439 Character classes ([:foo:]) can also be negated ([:^foo:]).
4440 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4441 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4442 but trigger failures because they are currently unimplemented. */
4444 #define POSIXCC_DONE(c) ((c) == ':')
4445 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4446 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4449 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4452 I32 namedclass = OOB_NAMEDCLASS;
4454 if (value == '[' && RExC_parse + 1 < RExC_end &&
4455 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4456 POSIXCC(UCHARAT(RExC_parse))) {
4457 const char c = UCHARAT(RExC_parse);
4458 char* s = RExC_parse++;
4460 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4462 if (RExC_parse == RExC_end)
4463 /* Grandfather lone [:, [=, [. */
4466 const char* t = RExC_parse++; /* skip over the c */
4467 const char *posixcc;
4471 if (UCHARAT(RExC_parse) == ']') {
4472 RExC_parse++; /* skip over the ending ] */
4475 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4476 const I32 skip = t - posixcc;
4478 /* Initially switch on the length of the name. */
4481 if (memEQ(posixcc, "word", 4)) {
4482 /* this is not POSIX, this is the Perl \w */;
4484 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4488 /* Names all of length 5. */
4489 /* alnum alpha ascii blank cntrl digit graph lower
4490 print punct space upper */
4491 /* Offset 4 gives the best switch position. */
4492 switch (posixcc[4]) {
4494 if (memEQ(posixcc, "alph", 4)) {
4497 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4501 if (memEQ(posixcc, "spac", 4)) {
4504 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4508 if (memEQ(posixcc, "grap", 4)) {
4511 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4515 if (memEQ(posixcc, "asci", 4)) {
4518 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4522 if (memEQ(posixcc, "blan", 4)) {
4525 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4529 if (memEQ(posixcc, "cntr", 4)) {
4532 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4536 if (memEQ(posixcc, "alnu", 4)) {
4539 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4543 if (memEQ(posixcc, "lowe", 4)) {
4546 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4548 if (memEQ(posixcc, "uppe", 4)) {
4551 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4555 if (memEQ(posixcc, "digi", 4)) {
4558 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4560 if (memEQ(posixcc, "prin", 4)) {
4563 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4565 if (memEQ(posixcc, "punc", 4)) {
4568 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4574 if (memEQ(posixcc, "xdigit", 6)) {
4576 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4581 if (namedclass == OOB_NAMEDCLASS)
4583 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4586 assert (posixcc[skip] == ':');
4587 assert (posixcc[skip+1] == ']');
4588 } else if (!SIZE_ONLY) {
4589 /* [[=foo=]] and [[.foo.]] are still future. */
4591 /* adjust RExC_parse so the warning shows after
4593 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4595 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4598 /* Maternal grandfather:
4599 * "[:" ending in ":" but not in ":]" */
4609 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4612 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4613 const char *s = RExC_parse;
4614 const char c = *s++;
4616 while(*s && isALNUM(*s))
4618 if (*s && c == *s && s[1] == ']') {
4619 if (ckWARN(WARN_REGEXP))
4621 "POSIX syntax [%c %c] belongs inside character classes",
4624 /* [[=foo=]] and [[.foo.]] are still future. */
4625 if (POSIXCC_NOTYET(c)) {
4626 /* adjust RExC_parse so the error shows after
4628 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4630 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4637 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4641 register UV nextvalue;
4642 register IV prevvalue = OOB_UNICODE;
4643 register IV range = 0;
4644 register regnode *ret;
4647 char *rangebegin = NULL;
4648 bool need_class = 0;
4652 bool optimize_invert = TRUE;
4653 AV* unicode_alternate = NULL;
4655 UV literal_endpoint = 0;
4658 ret = reganode(pRExC_state, ANYOF, 0);
4661 ANYOF_FLAGS(ret) = 0;
4663 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4667 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4671 RExC_size += ANYOF_SKIP;
4673 RExC_emit += ANYOF_SKIP;
4675 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4677 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4678 ANYOF_BITMAP_ZERO(ret);
4679 listsv = newSVpvs("# comment\n");
4682 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4684 if (!SIZE_ONLY && POSIXCC(nextvalue))
4685 checkposixcc(pRExC_state);
4687 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4688 if (UCHARAT(RExC_parse) == ']')
4691 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4695 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4698 rangebegin = RExC_parse;
4700 value = utf8n_to_uvchr((U8*)RExC_parse,
4701 RExC_end - RExC_parse,
4703 RExC_parse += numlen;
4706 value = UCHARAT(RExC_parse++);
4707 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4708 if (value == '[' && POSIXCC(nextvalue))
4709 namedclass = regpposixcc(pRExC_state, value);
4710 else if (value == '\\') {
4712 value = utf8n_to_uvchr((U8*)RExC_parse,
4713 RExC_end - RExC_parse,
4715 RExC_parse += numlen;
4718 value = UCHARAT(RExC_parse++);
4719 /* Some compilers cannot handle switching on 64-bit integer
4720 * values, therefore value cannot be an UV. Yes, this will
4721 * be a problem later if we want switch on Unicode.
4722 * A similar issue a little bit later when switching on
4723 * namedclass. --jhi */
4724 switch ((I32)value) {
4725 case 'w': namedclass = ANYOF_ALNUM; break;
4726 case 'W': namedclass = ANYOF_NALNUM; break;
4727 case 's': namedclass = ANYOF_SPACE; break;
4728 case 'S': namedclass = ANYOF_NSPACE; break;
4729 case 'd': namedclass = ANYOF_DIGIT; break;
4730 case 'D': namedclass = ANYOF_NDIGIT; break;
4733 if (RExC_parse >= RExC_end)
4734 vFAIL2("Empty \\%c{}", (U8)value);
4735 if (*RExC_parse == '{') {
4736 const U8 c = (U8)value;
4737 e = strchr(RExC_parse++, '}');
4739 vFAIL2("Missing right brace on \\%c{}", c);
4740 while (isSPACE(UCHARAT(RExC_parse)))
4742 if (e == RExC_parse)
4743 vFAIL2("Empty \\%c{}", c);
4745 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4753 if (UCHARAT(RExC_parse) == '^') {
4756 value = value == 'p' ? 'P' : 'p'; /* toggle */
4757 while (isSPACE(UCHARAT(RExC_parse))) {
4763 Perl_sv_catpvf(aTHX_ listsv,
4764 "+utf8::%.*s\n", (int)n, RExC_parse);
4766 Perl_sv_catpvf(aTHX_ listsv,
4767 "!utf8::%.*s\n", (int)n, RExC_parse);
4770 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4771 namedclass = ANYOF_MAX; /* no official name, but it's named */
4773 case 'n': value = '\n'; break;
4774 case 'r': value = '\r'; break;
4775 case 't': value = '\t'; break;
4776 case 'f': value = '\f'; break;
4777 case 'b': value = '\b'; break;
4778 case 'e': value = ASCII_TO_NATIVE('\033');break;
4779 case 'a': value = ASCII_TO_NATIVE('\007');break;
4781 if (*RExC_parse == '{') {
4782 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4783 | PERL_SCAN_DISALLOW_PREFIX;
4784 e = strchr(RExC_parse++, '}');
4786 vFAIL("Missing right brace on \\x{}");
4788 numlen = e - RExC_parse;
4789 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4793 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4795 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4796 RExC_parse += numlen;
4800 value = UCHARAT(RExC_parse++);
4801 value = toCTRL(value);
4803 case '0': case '1': case '2': case '3': case '4':
4804 case '5': case '6': case '7': case '8': case '9':
4808 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4809 RExC_parse += numlen;
4813 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4815 "Unrecognized escape \\%c in character class passed through",
4819 } /* end of \blah */
4825 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4827 if (!SIZE_ONLY && !need_class)
4828 ANYOF_CLASS_ZERO(ret);
4832 /* a bad range like a-\d, a-[:digit:] ? */
4835 if (ckWARN(WARN_REGEXP)) {
4837 RExC_parse >= rangebegin ?
4838 RExC_parse - rangebegin : 0;
4840 "False [] range \"%*.*s\"",
4845 if (prevvalue < 256) {
4846 ANYOF_BITMAP_SET(ret, prevvalue);
4847 ANYOF_BITMAP_SET(ret, '-');
4850 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4851 Perl_sv_catpvf(aTHX_ listsv,
4852 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4856 range = 0; /* this was not a true range */
4860 const char *what = NULL;
4863 if (namedclass > OOB_NAMEDCLASS)
4864 optimize_invert = FALSE;
4865 /* Possible truncation here but in some 64-bit environments
4866 * the compiler gets heartburn about switch on 64-bit values.
4867 * A similar issue a little earlier when switching on value.
4869 switch ((I32)namedclass) {
4872 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4874 for (value = 0; value < 256; value++)
4876 ANYOF_BITMAP_SET(ret, value);
4883 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4885 for (value = 0; value < 256; value++)
4886 if (!isALNUM(value))
4887 ANYOF_BITMAP_SET(ret, value);
4894 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4896 for (value = 0; value < 256; value++)
4897 if (isALNUMC(value))
4898 ANYOF_BITMAP_SET(ret, value);
4905 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4907 for (value = 0; value < 256; value++)
4908 if (!isALNUMC(value))
4909 ANYOF_BITMAP_SET(ret, value);
4916 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4918 for (value = 0; value < 256; value++)
4920 ANYOF_BITMAP_SET(ret, value);
4927 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4929 for (value = 0; value < 256; value++)
4930 if (!isALPHA(value))
4931 ANYOF_BITMAP_SET(ret, value);
4938 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4941 for (value = 0; value < 128; value++)
4942 ANYOF_BITMAP_SET(ret, value);
4944 for (value = 0; value < 256; value++) {
4946 ANYOF_BITMAP_SET(ret, value);
4955 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4958 for (value = 128; value < 256; value++)
4959 ANYOF_BITMAP_SET(ret, value);
4961 for (value = 0; value < 256; value++) {
4962 if (!isASCII(value))
4963 ANYOF_BITMAP_SET(ret, value);
4972 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4974 for (value = 0; value < 256; value++)
4976 ANYOF_BITMAP_SET(ret, value);
4983 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4985 for (value = 0; value < 256; value++)
4986 if (!isBLANK(value))
4987 ANYOF_BITMAP_SET(ret, value);
4994 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
4996 for (value = 0; value < 256; value++)
4998 ANYOF_BITMAP_SET(ret, value);
5005 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5007 for (value = 0; value < 256; value++)
5008 if (!isCNTRL(value))
5009 ANYOF_BITMAP_SET(ret, value);
5016 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5018 /* consecutive digits assumed */
5019 for (value = '0'; value <= '9'; value++)
5020 ANYOF_BITMAP_SET(ret, value);
5027 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5029 /* consecutive digits assumed */
5030 for (value = 0; value < '0'; value++)
5031 ANYOF_BITMAP_SET(ret, value);
5032 for (value = '9' + 1; value < 256; value++)
5033 ANYOF_BITMAP_SET(ret, value);
5040 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5042 for (value = 0; value < 256; value++)
5044 ANYOF_BITMAP_SET(ret, value);
5051 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5053 for (value = 0; value < 256; value++)
5054 if (!isGRAPH(value))
5055 ANYOF_BITMAP_SET(ret, value);
5062 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5064 for (value = 0; value < 256; value++)
5066 ANYOF_BITMAP_SET(ret, value);
5073 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5075 for (value = 0; value < 256; value++)
5076 if (!isLOWER(value))
5077 ANYOF_BITMAP_SET(ret, value);
5084 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5086 for (value = 0; value < 256; value++)
5088 ANYOF_BITMAP_SET(ret, value);
5095 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5097 for (value = 0; value < 256; value++)
5098 if (!isPRINT(value))
5099 ANYOF_BITMAP_SET(ret, value);
5106 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5108 for (value = 0; value < 256; value++)
5109 if (isPSXSPC(value))
5110 ANYOF_BITMAP_SET(ret, value);
5117 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5119 for (value = 0; value < 256; value++)
5120 if (!isPSXSPC(value))
5121 ANYOF_BITMAP_SET(ret, value);
5128 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5130 for (value = 0; value < 256; value++)
5132 ANYOF_BITMAP_SET(ret, value);
5139 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5141 for (value = 0; value < 256; value++)
5142 if (!isPUNCT(value))
5143 ANYOF_BITMAP_SET(ret, value);
5150 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5152 for (value = 0; value < 256; value++)
5154 ANYOF_BITMAP_SET(ret, value);
5161 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5163 for (value = 0; value < 256; value++)
5164 if (!isSPACE(value))
5165 ANYOF_BITMAP_SET(ret, value);
5172 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5174 for (value = 0; value < 256; value++)
5176 ANYOF_BITMAP_SET(ret, value);
5183 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5185 for (value = 0; value < 256; value++)
5186 if (!isUPPER(value))
5187 ANYOF_BITMAP_SET(ret, value);
5194 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5196 for (value = 0; value < 256; value++)
5197 if (isXDIGIT(value))
5198 ANYOF_BITMAP_SET(ret, value);
5205 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5207 for (value = 0; value < 256; value++)
5208 if (!isXDIGIT(value))
5209 ANYOF_BITMAP_SET(ret, value);
5215 /* this is to handle \p and \P */
5218 vFAIL("Invalid [::] class");
5222 /* Strings such as "+utf8::isWord\n" */
5223 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5226 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5229 } /* end of namedclass \blah */
5232 if (prevvalue > (IV)value) /* b-a */ {
5233 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5234 RExC_parse - rangebegin,
5235 RExC_parse - rangebegin,
5237 range = 0; /* not a valid range */
5241 prevvalue = value; /* save the beginning of the range */
5242 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5243 RExC_parse[1] != ']') {
5246 /* a bad range like \w-, [:word:]- ? */
5247 if (namedclass > OOB_NAMEDCLASS) {
5248 if (ckWARN(WARN_REGEXP)) {
5250 RExC_parse >= rangebegin ?
5251 RExC_parse - rangebegin : 0;
5253 "False [] range \"%*.*s\"",
5259 ANYOF_BITMAP_SET(ret, '-');
5261 range = 1; /* yeah, it's a range! */
5262 continue; /* but do it the next time */
5266 /* now is the next time */
5270 if (prevvalue < 256) {
5271 const IV ceilvalue = value < 256 ? value : 255;
5274 /* In EBCDIC [\x89-\x91] should include
5275 * the \x8e but [i-j] should not. */
5276 if (literal_endpoint == 2 &&
5277 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5278 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5280 if (isLOWER(prevvalue)) {
5281 for (i = prevvalue; i <= ceilvalue; i++)
5283 ANYOF_BITMAP_SET(ret, i);
5285 for (i = prevvalue; i <= ceilvalue; i++)
5287 ANYOF_BITMAP_SET(ret, i);
5292 for (i = prevvalue; i <= ceilvalue; i++)
5293 ANYOF_BITMAP_SET(ret, i);
5295 if (value > 255 || UTF) {
5296 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5297 const UV natvalue = NATIVE_TO_UNI(value);
5299 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5300 if (prevnatvalue < natvalue) { /* what about > ? */
5301 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5302 prevnatvalue, natvalue);
5304 else if (prevnatvalue == natvalue) {
5305 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5307 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5309 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5311 /* If folding and foldable and a single
5312 * character, insert also the folded version
5313 * to the charclass. */
5315 if (foldlen == (STRLEN)UNISKIP(f))
5316 Perl_sv_catpvf(aTHX_ listsv,
5319 /* Any multicharacter foldings
5320 * require the following transform:
5321 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5322 * where E folds into "pq" and F folds
5323 * into "rst", all other characters
5324 * fold to single characters. We save
5325 * away these multicharacter foldings,
5326 * to be later saved as part of the
5327 * additional "s" data. */
5330 if (!unicode_alternate)
5331 unicode_alternate = newAV();
5332 sv = newSVpvn((char*)foldbuf, foldlen);
5334 av_push(unicode_alternate, sv);
5338 /* If folding and the value is one of the Greek
5339 * sigmas insert a few more sigmas to make the
5340 * folding rules of the sigmas to work right.
5341 * Note that not all the possible combinations
5342 * are handled here: some of them are handled
5343 * by the standard folding rules, and some of
5344 * them (literal or EXACTF cases) are handled
5345 * during runtime in regexec.c:S_find_byclass(). */
5346 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5347 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5348 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5349 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5350 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5352 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5353 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5354 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5359 literal_endpoint = 0;
5363 range = 0; /* this range (if it was one) is done now */
5367 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5369 RExC_size += ANYOF_CLASS_ADD_SKIP;
5371 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5374 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5376 /* If the only flag is folding (plus possibly inversion). */
5377 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5379 for (value = 0; value < 256; ++value) {
5380 if (ANYOF_BITMAP_TEST(ret, value)) {
5381 UV fold = PL_fold[value];
5384 ANYOF_BITMAP_SET(ret, fold);
5387 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5390 /* optimize inverted simple patterns (e.g. [^a-z]) */
5391 if (!SIZE_ONLY && optimize_invert &&
5392 /* If the only flag is inversion. */
5393 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5394 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5395 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5396 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5403 /* The 0th element stores the character class description
5404 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5405 * to initialize the appropriate swash (which gets stored in
5406 * the 1st element), and also useful for dumping the regnode.
5407 * The 2nd element stores the multicharacter foldings,
5408 * used later (regexec.c:S_reginclass()). */
5409 av_store(av, 0, listsv);
5410 av_store(av, 1, NULL);
5411 av_store(av, 2, (SV*)unicode_alternate);
5412 rv = newRV_noinc((SV*)av);
5413 n = add_data(pRExC_state, 1, "s");
5414 RExC_rx->data->data[n] = (void*)rv;
5422 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5424 char* retval = RExC_parse++;
5427 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5428 RExC_parse[2] == '#') {
5429 while (*RExC_parse != ')') {
5430 if (RExC_parse == RExC_end)
5431 FAIL("Sequence (?#... not terminated");
5437 if (RExC_flags & PMf_EXTENDED) {
5438 if (isSPACE(*RExC_parse)) {
5442 else if (*RExC_parse == '#') {
5443 while (RExC_parse < RExC_end)
5444 if (*RExC_parse++ == '\n') break;
5453 - reg_node - emit a node
5455 STATIC regnode * /* Location. */
5456 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5459 register regnode *ptr;
5460 regnode * const ret = RExC_emit;
5463 SIZE_ALIGN(RExC_size);
5468 NODE_ALIGN_FILL(ret);
5470 FILL_ADVANCE_NODE(ptr, op);
5471 if (RExC_offsets) { /* MJD */
5472 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5473 "reg_node", __LINE__,
5475 RExC_emit - RExC_emit_start > RExC_offsets[0]
5476 ? "Overwriting end of array!\n" : "OK",
5477 RExC_emit - RExC_emit_start,
5478 RExC_parse - RExC_start,
5480 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5489 - reganode - emit a node with an argument
5491 STATIC regnode * /* Location. */
5492 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5495 register regnode *ptr;
5496 regnode * const ret = RExC_emit;
5499 SIZE_ALIGN(RExC_size);
5504 NODE_ALIGN_FILL(ret);
5506 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5507 if (RExC_offsets) { /* MJD */
5508 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5512 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5513 "Overwriting end of array!\n" : "OK",
5514 RExC_emit - RExC_emit_start,
5515 RExC_parse - RExC_start,
5517 Set_Cur_Node_Offset;
5526 - reguni - emit (if appropriate) a Unicode character
5529 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5532 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5536 - reginsert - insert an operator in front of already-emitted operand
5538 * Means relocating the operand.
5541 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5544 register regnode *src;
5545 register regnode *dst;
5546 register regnode *place;
5547 const int offset = regarglen[(U8)op];
5549 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5552 RExC_size += NODE_STEP_REGNODE + offset;
5557 RExC_emit += NODE_STEP_REGNODE + offset;
5559 while (src > opnd) {
5560 StructCopy(--src, --dst, regnode);
5561 if (RExC_offsets) { /* MJD 20010112 */
5562 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5566 dst - RExC_emit_start > RExC_offsets[0]
5567 ? "Overwriting end of array!\n" : "OK",
5568 src - RExC_emit_start,
5569 dst - RExC_emit_start,
5571 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5572 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5577 place = opnd; /* Op node, where operand used to be. */
5578 if (RExC_offsets) { /* MJD */
5579 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5583 place - RExC_emit_start > RExC_offsets[0]
5584 ? "Overwriting end of array!\n" : "OK",
5585 place - RExC_emit_start,
5586 RExC_parse - RExC_start,
5588 Set_Node_Offset(place, RExC_parse);
5589 Set_Node_Length(place, 1);
5591 src = NEXTOPER(place);
5592 FILL_ADVANCE_NODE(place, op);
5593 Zero(src, offset, regnode);
5597 - regtail - set the next-pointer at the end of a node chain of p to val.
5600 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5603 register regnode *scan;
5608 /* Find last node. */
5611 regnode * const temp = regnext(scan);
5617 if (reg_off_by_arg[OP(scan)]) {
5618 ARG_SET(scan, val - scan);
5621 NEXT_OFF(scan) = val - scan;
5626 - regoptail - regtail on operand of first argument; nop if operandless
5629 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5632 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5633 if (p == NULL || SIZE_ONLY)
5635 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5636 regtail(pRExC_state, NEXTOPER(p), val);
5638 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5639 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5646 - regcurly - a little FSA that accepts {\d+,?\d*}
5649 S_regcurly(register const char *s)
5668 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5671 Perl_regdump(pTHX_ regexp *r)
5675 SV * const sv = sv_newmortal();
5677 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5679 /* Header fields of interest. */
5680 if (r->anchored_substr)
5681 PerlIO_printf(Perl_debug_log,
5682 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5684 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5685 SvPVX_const(r->anchored_substr),
5687 SvTAIL(r->anchored_substr) ? "$" : "",
5688 (IV)r->anchored_offset);
5689 else if (r->anchored_utf8)
5690 PerlIO_printf(Perl_debug_log,
5691 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5693 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5694 SvPVX_const(r->anchored_utf8),
5696 SvTAIL(r->anchored_utf8) ? "$" : "",
5697 (IV)r->anchored_offset);
5698 if (r->float_substr)
5699 PerlIO_printf(Perl_debug_log,
5700 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5702 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5703 SvPVX_const(r->float_substr),
5705 SvTAIL(r->float_substr) ? "$" : "",
5706 (IV)r->float_min_offset, (UV)r->float_max_offset);
5707 else if (r->float_utf8)
5708 PerlIO_printf(Perl_debug_log,
5709 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5711 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5712 SvPVX_const(r->float_utf8),
5714 SvTAIL(r->float_utf8) ? "$" : "",
5715 (IV)r->float_min_offset, (UV)r->float_max_offset);
5716 if (r->check_substr || r->check_utf8)
5717 PerlIO_printf(Perl_debug_log,
5718 r->check_substr == r->float_substr
5719 && r->check_utf8 == r->float_utf8
5720 ? "(checking floating" : "(checking anchored");
5721 if (r->reganch & ROPT_NOSCAN)
5722 PerlIO_printf(Perl_debug_log, " noscan");
5723 if (r->reganch & ROPT_CHECK_ALL)
5724 PerlIO_printf(Perl_debug_log, " isall");
5725 if (r->check_substr || r->check_utf8)
5726 PerlIO_printf(Perl_debug_log, ") ");
5728 if (r->regstclass) {
5729 regprop(sv, r->regstclass);
5730 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5732 if (r->reganch & ROPT_ANCH) {
5733 PerlIO_printf(Perl_debug_log, "anchored");
5734 if (r->reganch & ROPT_ANCH_BOL)
5735 PerlIO_printf(Perl_debug_log, "(BOL)");
5736 if (r->reganch & ROPT_ANCH_MBOL)
5737 PerlIO_printf(Perl_debug_log, "(MBOL)");
5738 if (r->reganch & ROPT_ANCH_SBOL)
5739 PerlIO_printf(Perl_debug_log, "(SBOL)");
5740 if (r->reganch & ROPT_ANCH_GPOS)
5741 PerlIO_printf(Perl_debug_log, "(GPOS)");
5742 PerlIO_putc(Perl_debug_log, ' ');
5744 if (r->reganch & ROPT_GPOS_SEEN)
5745 PerlIO_printf(Perl_debug_log, "GPOS ");
5746 if (r->reganch & ROPT_SKIP)
5747 PerlIO_printf(Perl_debug_log, "plus ");
5748 if (r->reganch & ROPT_IMPLICIT)
5749 PerlIO_printf(Perl_debug_log, "implicit ");
5750 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5751 if (r->reganch & ROPT_EVAL_SEEN)
5752 PerlIO_printf(Perl_debug_log, "with eval ");
5753 PerlIO_printf(Perl_debug_log, "\n");
5755 const U32 len = r->offsets[0];
5756 GET_RE_DEBUG_FLAGS_DECL;
5759 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5760 for (i = 1; i <= len; i++)
5761 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5762 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5763 PerlIO_printf(Perl_debug_log, "\n");
5767 PERL_UNUSED_CONTEXT;
5769 #endif /* DEBUGGING */
5773 - regprop - printable representation of opcode
5776 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5782 sv_setpvn(sv, "", 0);
5783 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5784 /* It would be nice to FAIL() here, but this may be called from
5785 regexec.c, and it would be hard to supply pRExC_state. */
5786 Perl_croak(aTHX_ "Corrupted regexp opcode");
5787 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5789 k = PL_regkind[(U8)OP(o)];
5792 SV * const dsv = sv_2mortal(newSVpvs(""));
5793 /* Using is_utf8_string() is a crude hack but it may
5794 * be the best for now since we have no flag "this EXACTish
5795 * node was UTF-8" --jhi */
5796 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5797 const char * const s = do_utf8 ?
5798 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5799 UNI_DISPLAY_REGEX) :
5801 const int len = do_utf8 ?
5804 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5808 } else if (k == TRIE) {
5811 this isn't always safe, as Pl_regdata may not be for this regex yet
5812 (depending on where its called from) so its being moved to dumpuntil
5814 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5815 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5818 trie->uniquecharcount,
5821 } else if (k == CURLY) {
5822 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5823 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5824 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5826 else if (k == WHILEM && o->flags) /* Ordinal/of */
5827 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5828 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5829 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5830 else if (k == LOGICAL)
5831 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5832 else if (k == ANYOF) {
5833 int i, rangestart = -1;
5834 const U8 flags = ANYOF_FLAGS(o);
5836 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5837 static const char * const anyofs[] = {
5870 if (flags & ANYOF_LOCALE)
5871 sv_catpvs(sv, "{loc}");
5872 if (flags & ANYOF_FOLD)
5873 sv_catpvs(sv, "{i}");
5874 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5875 if (flags & ANYOF_INVERT)
5877 for (i = 0; i <= 256; i++) {
5878 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5879 if (rangestart == -1)
5881 } else if (rangestart != -1) {
5882 if (i <= rangestart + 3)
5883 for (; rangestart < i; rangestart++)
5884 put_byte(sv, rangestart);
5886 put_byte(sv, rangestart);
5888 put_byte(sv, i - 1);
5894 if (o->flags & ANYOF_CLASS)
5895 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5896 if (ANYOF_CLASS_TEST(o,i))
5897 sv_catpv(sv, anyofs[i]);
5899 if (flags & ANYOF_UNICODE)
5900 sv_catpvs(sv, "{unicode}");
5901 else if (flags & ANYOF_UNICODE_ALL)
5902 sv_catpvs(sv, "{unicode_all}");
5906 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
5910 U8 s[UTF8_MAXBYTES_CASE+1];
5912 for (i = 0; i <= 256; i++) { /* just the first 256 */
5913 uvchr_to_utf8(s, i);
5915 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5916 if (rangestart == -1)
5918 } else if (rangestart != -1) {
5919 if (i <= rangestart + 3)
5920 for (; rangestart < i; rangestart++) {
5921 const U8 * const e = uvchr_to_utf8(s,rangestart);
5923 for(p = s; p < e; p++)
5927 const U8 *e = uvchr_to_utf8(s,rangestart);
5929 for (p = s; p < e; p++)
5932 e = uvchr_to_utf8(s, i-1);
5933 for (p = s; p < e; p++)
5940 sv_catpvs(sv, "..."); /* et cetera */
5944 char *s = savesvpv(lv);
5945 char * const origs = s;
5947 while(*s && *s != '\n') s++;
5950 const char * const t = ++s;
5968 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5970 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5971 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5973 PERL_UNUSED_CONTEXT;
5974 PERL_UNUSED_ARG(sv);
5976 #endif /* DEBUGGING */
5980 Perl_re_intuit_string(pTHX_ regexp *prog)
5981 { /* Assume that RE_INTUIT is set */
5983 GET_RE_DEBUG_FLAGS_DECL;
5984 PERL_UNUSED_CONTEXT;
5988 const char * const s = SvPV_nolen_const(prog->check_substr
5989 ? prog->check_substr : prog->check_utf8);
5991 if (!PL_colorset) reginitcolors();
5992 PerlIO_printf(Perl_debug_log,
5993 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5995 prog->check_substr ? "" : "utf8 ",
5996 PL_colors[5],PL_colors[0],
5999 (strlen(s) > 60 ? "..." : ""));
6002 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6006 Perl_pregfree(pTHX_ struct regexp *r)
6010 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6011 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6015 if (!r || (--r->refcnt > 0))
6017 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6018 const char * const s = (r->reganch & ROPT_UTF8)
6019 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6020 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6021 const int len = SvCUR(dsv);
6024 PerlIO_printf(Perl_debug_log,
6025 "%sFreeing REx:%s %s%*.*s%s%s\n",
6026 PL_colors[4],PL_colors[5],PL_colors[0],
6029 len > 60 ? "..." : "");
6032 /* gcov results gave these as non-null 100% of the time, so there's no
6033 optimisation in checking them before calling Safefree */
6034 Safefree(r->precomp);
6035 Safefree(r->offsets); /* 20010421 MJD */
6036 RX_MATCH_COPY_FREE(r);
6037 #ifdef PERL_OLD_COPY_ON_WRITE
6039 SvREFCNT_dec(r->saved_copy);
6042 if (r->anchored_substr)
6043 SvREFCNT_dec(r->anchored_substr);
6044 if (r->anchored_utf8)
6045 SvREFCNT_dec(r->anchored_utf8);
6046 if (r->float_substr)
6047 SvREFCNT_dec(r->float_substr);
6049 SvREFCNT_dec(r->float_utf8);
6050 Safefree(r->substrs);
6053 int n = r->data->count;
6054 PAD* new_comppad = NULL;
6059 /* If you add a ->what type here, update the comment in regcomp.h */
6060 switch (r->data->what[n]) {
6062 SvREFCNT_dec((SV*)r->data->data[n]);
6065 Safefree(r->data->data[n]);
6068 new_comppad = (AV*)r->data->data[n];
6071 if (new_comppad == NULL)
6072 Perl_croak(aTHX_ "panic: pregfree comppad");
6073 PAD_SAVE_LOCAL(old_comppad,
6074 /* Watch out for global destruction's random ordering. */
6075 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6078 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6081 op_free((OP_4tree*)r->data->data[n]);
6083 PAD_RESTORE_LOCAL(old_comppad);
6084 SvREFCNT_dec((SV*)new_comppad);
6091 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6094 refcount = --trie->refcount;
6097 Safefree(trie->charmap);
6098 if (trie->widecharmap)
6099 SvREFCNT_dec((SV*)trie->widecharmap);
6100 Safefree(trie->states);
6101 Safefree(trie->trans);
6104 SvREFCNT_dec((SV*)trie->words);
6105 if (trie->revcharmap)
6106 SvREFCNT_dec((SV*)trie->revcharmap);
6108 Safefree(r->data->data[n]); /* do this last!!!! */
6113 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6116 Safefree(r->data->what);
6119 Safefree(r->startp);
6125 - regnext - dig the "next" pointer out of a node
6128 Perl_regnext(pTHX_ register regnode *p)
6131 register I32 offset;
6133 if (p == &PL_regdummy)
6136 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6144 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6147 STRLEN l1 = strlen(pat1);
6148 STRLEN l2 = strlen(pat2);
6151 const char *message;
6157 Copy(pat1, buf, l1 , char);
6158 Copy(pat2, buf + l1, l2 , char);
6159 buf[l1 + l2] = '\n';
6160 buf[l1 + l2 + 1] = '\0';
6162 /* ANSI variant takes additional second argument */
6163 va_start(args, pat2);
6167 msv = vmess(buf, &args);
6169 message = SvPV_const(msv,l1);
6172 Copy(message, buf, l1 , char);
6173 buf[l1-1] = '\0'; /* Overwrite \n */
6174 Perl_croak(aTHX_ "%s", buf);
6177 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6180 Perl_save_re_context(pTHX)
6183 SAVEI32(PL_reg_flags); /* from regexec.c */
6185 SAVEPPTR(PL_reginput); /* String-input pointer. */
6186 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6187 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6188 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6189 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6190 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6191 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6192 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6193 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6194 PL_reg_start_tmp = 0;
6195 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6196 PL_reg_start_tmpl = 0;
6197 SAVEVPTR(PL_regdata);
6198 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6199 SAVEI32(PL_regnarrate); /* from regexec.c */
6200 SAVEVPTR(PL_regprogram); /* from regexec.c */
6201 SAVEINT(PL_regindent); /* from regexec.c */
6202 SAVEVPTR(PL_regcc); /* from regexec.c */
6203 SAVEVPTR(PL_curcop);
6204 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6205 SAVEVPTR(PL_reg_re); /* from regexec.c */
6206 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6207 SAVESPTR(PL_reg_sv); /* from regexec.c */
6208 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6209 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6210 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6211 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6212 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6213 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6214 PL_reg_oldsaved = NULL;
6215 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6216 PL_reg_oldsavedlen = 0;
6217 #ifdef PERL_OLD_COPY_ON_WRITE
6221 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6223 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6224 PL_reg_leftiter = 0;
6225 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6226 PL_reg_poscache = NULL;
6227 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6228 PL_reg_poscache_size = 0;
6229 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6230 SAVEI32(PL_regnpar); /* () count. */
6231 SAVEI32(PL_regsize); /* from regexec.c */
6233 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6235 const REGEXP * const rx = PM_GETRE(PL_curpm);
6238 for (i = 1; i <= rx->nparens; i++) {
6239 char digits[TYPE_CHARS(long)];
6240 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6241 GV *const *const gvp
6242 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6245 GV * const gv = *gvp;
6246 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6254 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6259 clear_re(pTHX_ void *r)
6262 ReREFCNT_dec((regexp *)r);
6268 S_put_byte(pTHX_ SV *sv, int c)
6270 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6271 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6272 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6273 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6275 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6280 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6283 register U8 op = EXACT; /* Arbitrary non-END op. */
6284 register regnode *next;
6286 while (op != END && (!last || node < last)) {
6287 /* While that wasn't END last time... */
6293 next = regnext(node);
6295 if (OP(node) == OPTIMIZED)
6298 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6299 (int)(2*l + 1), "", SvPVX_const(sv));
6300 if (next == NULL) /* Next ptr. */
6301 PerlIO_printf(Perl_debug_log, "(0)");
6303 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6304 (void)PerlIO_putc(Perl_debug_log, '\n');
6306 if (PL_regkind[(U8)op] == BRANCHJ) {
6307 register regnode *nnode = (OP(next) == LONGJMP
6310 if (last && nnode > last)
6312 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6314 else if (PL_regkind[(U8)op] == BRANCH) {
6315 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6317 else if ( PL_regkind[(U8)op] == TRIE ) {
6318 const I32 n = ARG(node);
6319 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6320 const I32 arry_len = av_len(trie->words)+1;
6322 PerlIO_printf(Perl_debug_log,
6323 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6327 (int)trie->charcount,
6328 trie->uniquecharcount,
6329 (IV)trie->laststate-1,
6330 node->flags ? " EVAL mode" : "");
6332 for (word_idx=0; word_idx < arry_len; word_idx++) {
6333 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6335 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6338 SvPV_nolen_const(*elem_ptr),
6343 PerlIO_printf(Perl_debug_log, "(0)\n");
6345 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6351 node = NEXTOPER(node);
6352 node += regarglen[(U8)op];
6355 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6356 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6357 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6359 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6360 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6363 else if ( op == PLUS || op == STAR) {
6364 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6366 else if (op == ANYOF) {
6367 /* arglen 1 + class block */
6368 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6369 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6370 node = NEXTOPER(node);
6372 else if (PL_regkind[(U8)op] == EXACT) {
6373 /* Literal string, where present. */
6374 node += NODE_SZ_STR(node) - 1;
6375 node = NEXTOPER(node);
6378 node = NEXTOPER(node);
6379 node += regarglen[(U8)op];
6381 if (op == CURLYX || op == OPEN)
6383 else if (op == WHILEM)
6389 #endif /* DEBUGGING */
6393 * c-indentation-style: bsd
6395 * indent-tabs-mode: t
6398 * ex: set ts=8 sts=4 sw=4 noet: