5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
287 #define Simple_vFAIL(m) STMT_START { \
288 const IV offset = RExC_parse - RExC_precomp; \
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
296 #define vFAIL(m) STMT_START { \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
303 * Like Simple_vFAIL(), but accepts two arguments.
305 #define Simple_vFAIL2(m,a1) STMT_START { \
306 const IV offset = RExC_parse - RExC_precomp; \
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
314 #define vFAIL2(m,a1) STMT_START { \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
322 * Like Simple_vFAIL(), but accepts three arguments.
324 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
325 const IV offset = RExC_parse - RExC_precomp; \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
333 #define vFAIL3(m,a1,a2) STMT_START { \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
340 * Like Simple_vFAIL(), but accepts four arguments.
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
343 const IV offset = RExC_parse - RExC_precomp; \
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 #define vWARN(loc,m) STMT_START { \
349 const IV offset = loc - RExC_precomp; \
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
354 #define vWARNdep(loc,m) STMT_START { \
355 const IV offset = loc - RExC_precomp; \
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
362 #define vWARN2(loc, m, a1) STMT_START { \
363 const IV offset = loc - RExC_precomp; \
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
368 #define vWARN3(loc, m, a1, a2) STMT_START { \
369 const IV offset = loc - RExC_precomp; \
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
375 const IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
381 const IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
392 /* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
402 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
409 RExC_offsets[2*(node)-1] = (byte); \
414 #define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
418 #define Set_Node_Length_To_R(node,len) STMT_START { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
421 __LINE__, (int)(node), (int)(len))); \
423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
425 RExC_offsets[2*(node)] = (len); \
430 #define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
440 static void clear_re(pTHX_ void *r);
442 /* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
447 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453 SvSetMagicSV(*data->longest, data->last_found);
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
460 data->flags &= ~SF_FIX_BEFORE_EOL;
463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
464 data->offset_float_max = (l
465 ? data->last_start_max
466 : data->pos_min + data->pos_delta);
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
469 if (data->flags & SF_BEFORE_EOL)
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
473 data->flags &= ~SF_FL_BEFORE_EOL;
476 SvCUR_set(data->last_found, 0);
478 SV * const sv = data->last_found;
480 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
485 data->flags &= ~SF_BEFORE_EOL;
488 /* Can match anything (initialization) */
490 S_cl_anything(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
492 ANYOF_CLASS_ZERO(cl);
493 ANYOF_BITMAP_SETALL(cl);
494 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
496 cl->flags |= ANYOF_LOCALE;
499 /* Can match anything (initialization) */
501 S_cl_is_anything(const struct regnode_charclass_class *cl)
505 for (value = 0; value <= ANYOF_MAX; value += 2)
506 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
508 if (!(cl->flags & ANYOF_UNICODE_ALL))
510 if (!ANYOF_BITMAP_TESTALLSET(cl))
515 /* Can match anything (initialization) */
517 S_cl_init(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
519 Zero(cl, 1, struct regnode_charclass_class);
521 cl_anything(pRExC_state, cl);
525 S_cl_init_zero(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
527 Zero(cl, 1, struct regnode_charclass_class);
529 cl_anything(pRExC_state, cl);
531 cl->flags |= ANYOF_LOCALE;
534 /* 'And' a given class with another one. Can create false positives */
535 /* We assume that cl is not inverted */
537 S_cl_and(struct regnode_charclass_class *cl,
538 const struct regnode_charclass_class *and_with)
540 if (!(and_with->flags & ANYOF_CLASS)
541 && !(cl->flags & ANYOF_CLASS)
542 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
543 && !(and_with->flags & ANYOF_FOLD)
544 && !(cl->flags & ANYOF_FOLD)) {
547 if (and_with->flags & ANYOF_INVERT)
548 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
549 cl->bitmap[i] &= ~and_with->bitmap[i];
551 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
552 cl->bitmap[i] &= and_with->bitmap[i];
553 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
554 if (!(and_with->flags & ANYOF_EOS))
555 cl->flags &= ~ANYOF_EOS;
557 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
558 !(and_with->flags & ANYOF_INVERT)) {
559 cl->flags &= ~ANYOF_UNICODE_ALL;
560 cl->flags |= ANYOF_UNICODE;
561 ARG_SET(cl, ARG(and_with));
563 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
564 !(and_with->flags & ANYOF_INVERT))
565 cl->flags &= ~ANYOF_UNICODE_ALL;
566 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
567 !(and_with->flags & ANYOF_INVERT))
568 cl->flags &= ~ANYOF_UNICODE;
571 /* 'OR' a given class with another one. Can create false positives */
572 /* We assume that cl is not inverted */
574 S_cl_or(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
576 if (or_with->flags & ANYOF_INVERT) {
578 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
579 * <= (B1 | !B2) | (CL1 | !CL2)
580 * which is wasteful if CL2 is small, but we ignore CL2:
581 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
582 * XXXX Can we handle case-fold? Unclear:
583 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
584 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
586 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
587 && !(or_with->flags & ANYOF_FOLD)
588 && !(cl->flags & ANYOF_FOLD) ) {
591 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
592 cl->bitmap[i] |= ~or_with->bitmap[i];
593 } /* XXXX: logic is complicated otherwise */
595 cl_anything(pRExC_state, cl);
598 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
599 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
600 && (!(or_with->flags & ANYOF_FOLD)
601 || (cl->flags & ANYOF_FOLD)) ) {
604 /* OR char bitmap and class bitmap separately */
605 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
606 cl->bitmap[i] |= or_with->bitmap[i];
607 if (or_with->flags & ANYOF_CLASS) {
608 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
609 cl->classflags[i] |= or_with->classflags[i];
610 cl->flags |= ANYOF_CLASS;
613 else { /* XXXX: logic is complicated, leave it along for a moment. */
614 cl_anything(pRExC_state, cl);
617 if (or_with->flags & ANYOF_EOS)
618 cl->flags |= ANYOF_EOS;
620 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
621 ARG(cl) != ARG(or_with)) {
622 cl->flags |= ANYOF_UNICODE_ALL;
623 cl->flags &= ~ANYOF_UNICODE;
625 if (or_with->flags & ANYOF_UNICODE_ALL) {
626 cl->flags |= ANYOF_UNICODE_ALL;
627 cl->flags &= ~ANYOF_UNICODE;
633 make_trie(startbranch,first,last,tail,flags)
634 startbranch: the first branch in the whole branch sequence
635 first : start branch of sequence of branch-exact nodes.
636 May be the same as startbranch
637 last : Thing following the last branch.
638 May be the same as tail.
639 tail : item following the branch sequence
640 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
642 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
644 A trie is an N'ary tree where the branches are determined by digital
645 decomposition of the key. IE, at the root node you look up the 1st character and
646 follow that branch repeat until you find the end of the branches. Nodes can be
647 marked as "accepting" meaning they represent a complete word. Eg:
651 would convert into the following structure. Numbers represent states, letters
652 following numbers represent valid transitions on the letter from that state, if
653 the number is in square brackets it represents an accepting state, otherwise it
654 will be in parenthesis.
656 +-h->+-e->[3]-+-r->(8)-+-s->[9]
660 (1) +-i->(6)-+-s->[7]
662 +-s->(3)-+-h->(4)-+-e->[5]
664 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
666 This shows that when matching against the string 'hers' we will begin at state 1
667 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
668 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
669 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
670 single traverse. We store a mapping from accepting to state to which word was
671 matched, and then when we have multiple possibilities we try to complete the
672 rest of the regex in the order in which they occured in the alternation.
674 The only prior NFA like behaviour that would be changed by the TRIE support is
675 the silent ignoring of duplicate alternations which are of the form:
677 / (DUPE|DUPE) X? (?{ ... }) Y /x
679 Thus EVAL blocks follwing a trie may be called a different number of times with
680 and without the optimisation. With the optimisations dupes will be silently
681 ignored. This inconsistant behaviour of EVAL type nodes is well established as
682 the following demonstrates:
684 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
686 which prints out 'word' three times, but
688 'words'=~/(word|word|word)(?{ print $1 })S/
690 which doesnt print it out at all. This is due to other optimisations kicking in.
692 Example of what happens on a structural level:
694 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
696 1: CURLYM[1] {1,32767}(18)
707 This would be optimizable with startbranch=5, first=5, last=16, tail=16
708 and should turn into:
710 1: CURLYM[1] {1,32767}(18)
712 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
720 Cases where tail != last would be like /(?foo|bar)baz/:
730 which would be optimizable with startbranch=1, first=1, last=7, tail=8
731 and would end up looking like:
734 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
743 #define TRIE_DEBUG_CHAR \
744 DEBUG_TRIE_COMPILE_r({ \
747 tmp = newSVpvs( "" ); \
748 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
750 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
752 av_push( trie->revcharmap, tmp ); \
755 #define TRIE_READ_CHAR STMT_START { \
758 if ( foldlen > 0 ) { \
759 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
764 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
765 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
766 foldlen -= UNISKIP( uvc ); \
767 scan = foldbuf + UNISKIP( uvc ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
779 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
780 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
781 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
782 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
785 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
786 TRIE_LIST_LEN( state ) *= 2; \
787 Renew( trie->states[ state ].trans.list, \
788 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
790 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
792 TRIE_LIST_CUR( state )++; \
795 #define TRIE_LIST_NEW(state) STMT_START { \
796 Newxz( trie->states[ state ].trans.list, \
797 4, reg_trie_trans_le ); \
798 TRIE_LIST_CUR( state ) = 1; \
799 TRIE_LIST_LEN( state ) = 4; \
803 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
806 /* first pass, loop through and scan words */
809 const U32 uniflags = UTF8_ALLOW_DEFAULT;
814 /* we just use folder as a flag in utf8 */
815 const U8 * const folder = ( flags == EXACTF
823 const U32 data_slot = add_data( pRExC_state, 1, "t" );
826 GET_RE_DEBUG_FLAGS_DECL;
828 Newxz( trie, 1, reg_trie_data );
830 RExC_rx->data->data[ data_slot ] = (void*)trie;
831 Newxz( trie->charmap, 256, U16 );
833 trie->words = newAV();
834 trie->revcharmap = newAV();
838 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
839 if (!SvIOK(re_trie_maxbuff)) {
840 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
843 /* -- First loop and Setup --
845 We first traverse the branches and scan each word to determine if it
846 contains widechars, and how many unique chars there are, this is
847 important as we have to build a table with at least as many columns as we
850 We use an array of integers to represent the character codes 0..255
851 (trie->charmap) and we use a an HV* to store unicode characters. We use the
852 native representation of the character value as the key and IV's for the
855 *TODO* If we keep track of how many times each character is used we can
856 remap the columns so that the table compression later on is more
857 efficient in terms of memory by ensuring most common value is in the
858 middle and the least common are on the outside. IMO this would be better
859 than a most to least common mapping as theres a decent chance the most
860 common letter will share a node with the least common, meaning the node
861 will not be compressable. With a middle is most common approach the worst
862 case is when we have the least common nodes twice.
867 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
868 regnode * const noper = NEXTOPER( cur );
869 const U8 *uc = (U8*)STRING( noper );
870 const U8 * const e = uc + STR_LEN( noper );
872 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
873 const U8 *scan = (U8*)NULL;
875 for ( ; uc < e ; uc += len ) {
879 if ( !trie->charmap[ uvc ] ) {
880 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
882 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
887 if ( !trie->widecharmap )
888 trie->widecharmap = newHV();
890 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
893 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
895 if ( !SvTRUE( *svpp ) ) {
896 sv_setiv( *svpp, ++trie->uniquecharcount );
902 } /* end first pass */
903 DEBUG_TRIE_COMPILE_r(
904 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
905 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
906 (int)trie->charcount, trie->uniquecharcount )
911 We now know what we are dealing with in terms of unique chars and
912 string sizes so we can calculate how much memory a naive
913 representation using a flat table will take. If it's over a reasonable
914 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
915 conservative but potentially much slower representation using an array
918 At the end we convert both representations into the same compressed
919 form that will be used in regexec.c for matching with. The latter
920 is a form that cannot be used to construct with but has memory
921 properties similar to the list form and access properties similar
922 to the table form making it both suitable for fast searches and
923 small enough that its feasable to store for the duration of a program.
925 See the comment in the code where the compressed table is produced
926 inplace from the flat tabe representation for an explanation of how
927 the compression works.
932 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
934 Second Pass -- Array Of Lists Representation
936 Each state will be represented by a list of charid:state records
937 (reg_trie_trans_le) the first such element holds the CUR and LEN
938 points of the allocated array. (See defines above).
940 We build the initial structure using the lists, and then convert
941 it into the compressed table form which allows faster lookups
942 (but cant be modified once converted).
948 STRLEN transcount = 1;
950 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
954 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
956 regnode * const noper = NEXTOPER( cur );
957 U8 *uc = (U8*)STRING( noper );
958 const U8 * const e = uc + STR_LEN( noper );
959 U32 state = 1; /* required init */
960 U16 charid = 0; /* sanity init */
961 U8 *scan = (U8*)NULL; /* sanity init */
962 STRLEN foldlen = 0; /* required init */
963 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
965 for ( ; uc < e ; uc += len ) {
970 charid = trie->charmap[ uvc ];
972 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
976 charid=(U16)SvIV( *svpp );
985 if ( !trie->states[ state ].trans.list ) {
986 TRIE_LIST_NEW( state );
988 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
989 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
990 newstate = TRIE_LIST_ITEM( state, check ).newstate;
995 newstate = next_alloc++;
996 TRIE_LIST_PUSH( state, charid, newstate );
1001 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1003 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1006 if ( !trie->states[ state ].wordnum ) {
1007 /* we havent inserted this word into the structure yet. */
1008 trie->states[ state ].wordnum = ++curword;
1011 /* store the word for dumping */
1012 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1013 if ( UTF ) SvUTF8_on( tmp );
1014 av_push( trie->words, tmp );
1018 /*EMPTY*/; /* It's a dupe. So ignore it. */
1021 } /* end second pass */
1023 trie->laststate = next_alloc;
1024 Renew( trie->states, next_alloc, reg_trie_state );
1026 DEBUG_TRIE_COMPILE_MORE_r({
1029 /* print out the table precompression. */
1031 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1032 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1034 for( state=1 ; state < next_alloc ; state ++ ) {
1037 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1038 if ( ! trie->states[ state ].wordnum ) {
1039 PerlIO_printf( Perl_debug_log, "%5s| ","");
1041 PerlIO_printf( Perl_debug_log, "W%04x| ",
1042 trie->states[ state ].wordnum
1045 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1046 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1047 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1048 SvPV_nolen_const( *tmp ),
1049 TRIE_LIST_ITEM(state,charid).forid,
1050 (UV)TRIE_LIST_ITEM(state,charid).newstate
1055 PerlIO_printf( Perl_debug_log, "\n\n" );
1058 Newxz( trie->trans, transcount ,reg_trie_trans );
1065 for( state=1 ; state < next_alloc ; state ++ ) {
1069 DEBUG_TRIE_COMPILE_MORE_r(
1070 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1074 if (trie->states[state].trans.list) {
1075 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1079 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1080 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1081 if ( forid < minid ) {
1083 } else if ( forid > maxid ) {
1087 if ( transcount < tp + maxid - minid + 1) {
1089 Renew( trie->trans, transcount, reg_trie_trans );
1090 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1092 base = trie->uniquecharcount + tp - minid;
1093 if ( maxid == minid ) {
1095 for ( ; zp < tp ; zp++ ) {
1096 if ( ! trie->trans[ zp ].next ) {
1097 base = trie->uniquecharcount + zp - minid;
1098 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1099 trie->trans[ zp ].check = state;
1105 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1106 trie->trans[ tp ].check = state;
1111 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1112 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1113 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1114 trie->trans[ tid ].check = state;
1116 tp += ( maxid - minid + 1 );
1118 Safefree(trie->states[ state ].trans.list);
1121 DEBUG_TRIE_COMPILE_MORE_r(
1122 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1125 trie->states[ state ].trans.base=base;
1127 trie->lasttrans = tp + 1;
1131 Second Pass -- Flat Table Representation.
1133 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1134 We know that we will need Charcount+1 trans at most to store the data
1135 (one row per char at worst case) So we preallocate both structures
1136 assuming worst case.
1138 We then construct the trie using only the .next slots of the entry
1141 We use the .check field of the first entry of the node temporarily to
1142 make compression both faster and easier by keeping track of how many non
1143 zero fields are in the node.
1145 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1148 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1149 number representing the first entry of the node, and state as a
1150 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1151 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1152 are 2 entrys per node. eg:
1160 The table is internally in the right hand, idx form. However as we also
1161 have to deal with the states array which is indexed by nodenum we have to
1162 use TRIE_NODENUM() to convert.
1166 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1168 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1169 next_alloc = trie->uniquecharcount + 1;
1171 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1173 regnode * const noper = NEXTOPER( cur );
1174 const U8 *uc = (U8*)STRING( noper );
1175 const U8 * const e = uc + STR_LEN( noper );
1177 U32 state = 1; /* required init */
1179 U16 charid = 0; /* sanity init */
1180 U32 accept_state = 0; /* sanity init */
1181 U8 *scan = (U8*)NULL; /* sanity init */
1183 STRLEN foldlen = 0; /* required init */
1184 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1187 for ( ; uc < e ; uc += len ) {
1192 charid = trie->charmap[ uvc ];
1194 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1195 charid = svpp ? (U16)SvIV(*svpp) : 0;
1199 if ( !trie->trans[ state + charid ].next ) {
1200 trie->trans[ state + charid ].next = next_alloc;
1201 trie->trans[ state ].check++;
1202 next_alloc += trie->uniquecharcount;
1204 state = trie->trans[ state + charid ].next;
1206 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1208 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1211 accept_state = TRIE_NODENUM( state );
1212 if ( !trie->states[ accept_state ].wordnum ) {
1213 /* we havent inserted this word into the structure yet. */
1214 trie->states[ accept_state ].wordnum = ++curword;
1217 /* store the word for dumping */
1218 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1219 if ( UTF ) SvUTF8_on( tmp );
1220 av_push( trie->words, tmp );
1224 /*EMPTY*/; /* Its a dupe. So ignore it. */
1227 } /* end second pass */
1229 DEBUG_TRIE_COMPILE_MORE_r({
1231 print out the table precompression so that we can do a visual check
1232 that they are identical.
1236 PerlIO_printf( Perl_debug_log, "\nChar : " );
1238 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1239 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1241 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1245 PerlIO_printf( Perl_debug_log, "\nState+-" );
1247 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1248 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1251 PerlIO_printf( Perl_debug_log, "\n" );
1253 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1255 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1257 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1258 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1259 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1261 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1262 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1264 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1265 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1268 PerlIO_printf( Perl_debug_log, "\n\n" );
1272 * Inplace compress the table.*
1274 For sparse data sets the table constructed by the trie algorithm will
1275 be mostly 0/FAIL transitions or to put it another way mostly empty.
1276 (Note that leaf nodes will not contain any transitions.)
1278 This algorithm compresses the tables by eliminating most such
1279 transitions, at the cost of a modest bit of extra work during lookup:
1281 - Each states[] entry contains a .base field which indicates the
1282 index in the state[] array wheres its transition data is stored.
1284 - If .base is 0 there are no valid transitions from that node.
1286 - If .base is nonzero then charid is added to it to find an entry in
1289 -If trans[states[state].base+charid].check!=state then the
1290 transition is taken to be a 0/Fail transition. Thus if there are fail
1291 transitions at the front of the node then the .base offset will point
1292 somewhere inside the previous nodes data (or maybe even into a node
1293 even earlier), but the .check field determines if the transition is
1296 The following process inplace converts the table to the compressed
1297 table: We first do not compress the root node 1,and mark its all its
1298 .check pointers as 1 and set its .base pointer as 1 as well. This
1299 allows to do a DFA construction from the compressed table later, and
1300 ensures that any .base pointers we calculate later are greater than
1303 - We set 'pos' to indicate the first entry of the second node.
1305 - We then iterate over the columns of the node, finding the first and
1306 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1307 and set the .check pointers accordingly, and advance pos
1308 appropriately and repreat for the next node. Note that when we copy
1309 the next pointers we have to convert them from the original
1310 NODEIDX form to NODENUM form as the former is not valid post
1313 - If a node has no transitions used we mark its base as 0 and do not
1314 advance the pos pointer.
1316 - If a node only has one transition we use a second pointer into the
1317 structure to fill in allocated fail transitions from other states.
1318 This pointer is independent of the main pointer and scans forward
1319 looking for null transitions that are allocated to a state. When it
1320 finds one it writes the single transition into the "hole". If the
1321 pointer doesnt find one the single transition is appeneded as normal.
1323 - Once compressed we can Renew/realloc the structures to release the
1326 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1327 specifically Fig 3.47 and the associated pseudocode.
1331 const U32 laststate = TRIE_NODENUM( next_alloc );
1334 trie->laststate = laststate;
1336 for ( state = 1 ; state < laststate ; state++ ) {
1338 const U32 stateidx = TRIE_NODEIDX( state );
1339 const U32 o_used = trie->trans[ stateidx ].check;
1340 U32 used = trie->trans[ stateidx ].check;
1341 trie->trans[ stateidx ].check = 0;
1343 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1344 if ( flag || trie->trans[ stateidx + charid ].next ) {
1345 if ( trie->trans[ stateidx + charid ].next ) {
1347 for ( ; zp < pos ; zp++ ) {
1348 if ( ! trie->trans[ zp ].next ) {
1352 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1353 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1354 trie->trans[ zp ].check = state;
1355 if ( ++zp > pos ) pos = zp;
1362 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1364 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1365 trie->trans[ pos ].check = state;
1370 trie->lasttrans = pos + 1;
1371 Renew( trie->states, laststate + 1, reg_trie_state);
1372 DEBUG_TRIE_COMPILE_MORE_r(
1373 PerlIO_printf( Perl_debug_log,
1374 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1375 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1378 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1381 } /* end table compress */
1383 /* resize the trans array to remove unused space */
1384 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1386 DEBUG_TRIE_COMPILE_r({
1389 Now we print it out again, in a slightly different form as there is additional
1390 info we want to be able to see when its compressed. They are close enough for
1391 visual comparison though.
1393 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1395 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1396 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1398 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1401 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1403 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1404 PerlIO_printf( Perl_debug_log, "-----");
1405 PerlIO_printf( Perl_debug_log, "\n");
1407 for( state = 1 ; state < trie->laststate ; state++ ) {
1408 const U32 base = trie->states[ state ].trans.base;
1410 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1412 if ( trie->states[ state ].wordnum ) {
1413 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1415 PerlIO_printf( Perl_debug_log, "%6s", "" );
1418 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1423 while( ( base + ofs < trie->uniquecharcount ) ||
1424 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1425 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1428 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1430 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1431 if ( ( base + ofs >= trie->uniquecharcount ) &&
1432 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1433 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1435 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1436 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1438 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1442 PerlIO_printf( Perl_debug_log, "]");
1445 PerlIO_printf( Perl_debug_log, "\n" );
1450 /* now finally we "stitch in" the new TRIE node
1451 This means we convert either the first branch or the first Exact,
1452 depending on whether the thing following (in 'last') is a branch
1453 or not and whther first is the startbranch (ie is it a sub part of
1454 the alternation or is it the whole thing.)
1455 Assuming its a sub part we conver the EXACT otherwise we convert
1456 the whole branch sequence, including the first.
1463 if ( first == startbranch && OP( last ) != BRANCH ) {
1466 convert = NEXTOPER( first );
1467 NEXT_OFF( first ) = (U16)(last - first);
1470 OP( convert ) = TRIE + (U8)( flags - EXACT );
1471 NEXT_OFF( convert ) = (U16)(tail - convert);
1472 ARG_SET( convert, data_slot );
1474 /* tells us if we need to handle accept buffers specially */
1475 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1478 /* needed for dumping*/
1480 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1481 /* We now need to mark all of the space originally used by the
1482 branches as optimized away. This keeps the dumpuntil from
1483 throwing a wobbly as it doesnt use regnext() to traverse the
1486 while( optimize < last ) {
1487 OP( optimize ) = OPTIMIZED;
1491 } /* end node insert */
1498 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1499 * These need to be revisited when a newer toolchain becomes available.
1501 #if defined(__sparc64__) && defined(__GNUC__)
1502 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1503 # undef SPARC64_GCC_WORKAROUND
1504 # define SPARC64_GCC_WORKAROUND 1
1508 /* REx optimizer. Converts nodes into quickier variants "in place".
1509 Finds fixed substrings. */
1511 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1512 to the position after last scanned or to NULL. */
1516 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1517 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1518 /* scanp: Start here (read-write). */
1519 /* deltap: Write maxlen-minlen here. */
1520 /* last: Stop before this one. */
1523 I32 min = 0, pars = 0, code;
1524 regnode *scan = *scanp, *next;
1526 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1527 int is_inf_internal = 0; /* The studied chunk is infinite */
1528 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1529 scan_data_t data_fake;
1530 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1531 SV *re_trie_maxbuff = NULL;
1533 GET_RE_DEBUG_FLAGS_DECL;
1535 while (scan && OP(scan) != END && scan < last) {
1536 /* Peephole optimizer: */
1538 SV * const mysv=sv_newmortal();
1539 regprop( mysv, scan);
1540 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1541 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1544 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1545 /* Merge several consecutive EXACTish nodes into one. */
1546 regnode *n = regnext(scan);
1549 regnode *stop = scan;
1552 next = scan + NODE_SZ_STR(scan);
1553 /* Skip NOTHING, merge EXACT*. */
1555 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1556 (stringok && (OP(n) == OP(scan))))
1558 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1559 if (OP(n) == TAIL || n > next)
1561 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1562 NEXT_OFF(scan) += NEXT_OFF(n);
1563 next = n + NODE_STEP_REGNODE;
1570 else if (stringok) {
1571 const int oldl = STR_LEN(scan);
1572 regnode * const nnext = regnext(n);
1574 if (oldl + STR_LEN(n) > U8_MAX)
1576 NEXT_OFF(scan) += NEXT_OFF(n);
1577 STR_LEN(scan) += STR_LEN(n);
1578 next = n + NODE_SZ_STR(n);
1579 /* Now we can overwrite *n : */
1580 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1588 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1590 Two problematic code points in Unicode casefolding of EXACT nodes:
1592 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1593 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1599 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1600 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1602 This means that in case-insensitive matching (or "loose matching",
1603 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1604 length of the above casefolded versions) can match a target string
1605 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1606 This would rather mess up the minimum length computation.
1608 What we'll do is to look for the tail four bytes, and then peek
1609 at the preceding two bytes to see whether we need to decrease
1610 the minimum length by four (six minus two).
1612 Thanks to the design of UTF-8, there cannot be false matches:
1613 A sequence of valid UTF-8 bytes cannot be a subsequence of
1614 another valid sequence of UTF-8 bytes.
1617 char * const s0 = STRING(scan), *s, *t;
1618 char * const s1 = s0 + STR_LEN(scan) - 1;
1619 char * const s2 = s1 - 4;
1620 const char t0[] = "\xcc\x88\xcc\x81";
1621 const char * const t1 = t0 + 3;
1624 s < s2 && (t = ninstr(s, s1, t0, t1));
1626 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1627 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1634 n = scan + NODE_SZ_STR(scan);
1636 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1647 /* Follow the next-chain of the current node and optimize
1648 away all the NOTHINGs from it. */
1649 if (OP(scan) != CURLYX) {
1650 const int max = (reg_off_by_arg[OP(scan)]
1652 /* I32 may be smaller than U16 on CRAYs! */
1653 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1654 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1658 /* Skip NOTHING and LONGJMP. */
1659 while ((n = regnext(n))
1660 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1661 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1662 && off + noff < max)
1664 if (reg_off_by_arg[OP(scan)])
1667 NEXT_OFF(scan) = off;
1670 /* The principal pseudo-switch. Cannot be a switch, since we
1671 look into several different things. */
1672 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1673 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1674 next = regnext(scan);
1676 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1678 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1679 I32 max1 = 0, min1 = I32_MAX, num = 0;
1680 struct regnode_charclass_class accum;
1681 regnode * const startbranch=scan;
1683 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1684 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1685 if (flags & SCF_DO_STCLASS)
1686 cl_init_zero(pRExC_state, &accum);
1688 while (OP(scan) == code) {
1689 I32 deltanext, minnext, f = 0, fake;
1690 struct regnode_charclass_class this_class;
1693 data_fake.flags = 0;
1695 data_fake.whilem_c = data->whilem_c;
1696 data_fake.last_closep = data->last_closep;
1699 data_fake.last_closep = &fake;
1700 next = regnext(scan);
1701 scan = NEXTOPER(scan);
1703 scan = NEXTOPER(scan);
1704 if (flags & SCF_DO_STCLASS) {
1705 cl_init(pRExC_state, &this_class);
1706 data_fake.start_class = &this_class;
1707 f = SCF_DO_STCLASS_AND;
1709 if (flags & SCF_WHILEM_VISITED_POS)
1710 f |= SCF_WHILEM_VISITED_POS;
1712 /* we suppose the run is continuous, last=next...*/
1713 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1714 next, &data_fake, f,depth+1);
1717 if (max1 < minnext + deltanext)
1718 max1 = minnext + deltanext;
1719 if (deltanext == I32_MAX)
1720 is_inf = is_inf_internal = 1;
1722 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1724 if (data && (data_fake.flags & SF_HAS_EVAL))
1725 data->flags |= SF_HAS_EVAL;
1727 data->whilem_c = data_fake.whilem_c;
1728 if (flags & SCF_DO_STCLASS)
1729 cl_or(pRExC_state, &accum, &this_class);
1730 if (code == SUSPEND)
1733 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1735 if (flags & SCF_DO_SUBSTR) {
1736 data->pos_min += min1;
1737 data->pos_delta += max1 - min1;
1738 if (max1 != min1 || is_inf)
1739 data->longest = &(data->longest_float);
1742 delta += max1 - min1;
1743 if (flags & SCF_DO_STCLASS_OR) {
1744 cl_or(pRExC_state, data->start_class, &accum);
1746 cl_and(data->start_class, &and_with);
1747 flags &= ~SCF_DO_STCLASS;
1750 else if (flags & SCF_DO_STCLASS_AND) {
1752 cl_and(data->start_class, &accum);
1753 flags &= ~SCF_DO_STCLASS;
1756 /* Switch to OR mode: cache the old value of
1757 * data->start_class */
1758 StructCopy(data->start_class, &and_with,
1759 struct regnode_charclass_class);
1760 flags &= ~SCF_DO_STCLASS_AND;
1761 StructCopy(&accum, data->start_class,
1762 struct regnode_charclass_class);
1763 flags |= SCF_DO_STCLASS_OR;
1764 data->start_class->flags |= ANYOF_EOS;
1770 Assuming this was/is a branch we are dealing with: 'scan' now
1771 points at the item that follows the branch sequence, whatever
1772 it is. We now start at the beginning of the sequence and look
1778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1780 If we can find such a subseqence we need to turn the first
1781 element into a trie and then add the subsequent branch exact
1782 strings to the trie.
1786 1. patterns where the whole set of branch can be converted to a trie,
1788 2. patterns where only a subset of the alternations can be
1789 converted to a trie.
1791 In case 1 we can replace the whole set with a single regop
1792 for the trie. In case 2 we need to keep the start and end
1795 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1796 becomes BRANCH TRIE; BRANCH X;
1798 Hypthetically when we know the regex isnt anchored we can
1799 turn a case 1 into a DFA and let it rip... Every time it finds a match
1800 it would just call its tail, no WHILEM/CURLY needed.
1804 if (!re_trie_maxbuff) {
1805 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1806 if (!SvIOK(re_trie_maxbuff))
1807 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1809 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1811 regnode *first = (regnode *)NULL;
1812 regnode *last = (regnode *)NULL;
1813 regnode *tail = scan;
1818 SV * const mysv = sv_newmortal(); /* for dumping */
1820 /* var tail is used because there may be a TAIL
1821 regop in the way. Ie, the exacts will point to the
1822 thing following the TAIL, but the last branch will
1823 point at the TAIL. So we advance tail. If we
1824 have nested (?:) we may have to move through several
1828 while ( OP( tail ) == TAIL ) {
1829 /* this is the TAIL generated by (?:) */
1830 tail = regnext( tail );
1834 regprop( mysv, tail );
1835 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1836 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1837 (RExC_seen_evals) ? "[EVAL]" : ""
1842 step through the branches, cur represents each
1843 branch, noper is the first thing to be matched
1844 as part of that branch and noper_next is the
1845 regnext() of that node. if noper is an EXACT
1846 and noper_next is the same as scan (our current
1847 position in the regex) then the EXACT branch is
1848 a possible optimization target. Once we have
1849 two or more consequetive such branches we can
1850 create a trie of the EXACT's contents and stich
1851 it in place. If the sequence represents all of
1852 the branches we eliminate the whole thing and
1853 replace it with a single TRIE. If it is a
1854 subsequence then we need to stitch it in. This
1855 means the first branch has to remain, and needs
1856 to be repointed at the item on the branch chain
1857 following the last branch optimized. This could
1858 be either a BRANCH, in which case the
1859 subsequence is internal, or it could be the
1860 item following the branch sequence in which
1861 case the subsequence is at the end.
1865 /* dont use tail as the end marker for this traverse */
1866 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1867 regnode * const noper = NEXTOPER( cur );
1868 regnode * const noper_next = regnext( noper );
1871 regprop( mysv, cur);
1872 PerlIO_printf( Perl_debug_log, "%*s%s",
1873 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1875 regprop( mysv, noper);
1876 PerlIO_printf( Perl_debug_log, " -> %s",
1877 SvPV_nolen_const(mysv));
1880 regprop( mysv, noper_next );
1881 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1882 SvPV_nolen_const(mysv));
1884 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1887 if ( ( first ? OP( noper ) == optype
1888 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1889 && noper_next == tail && count<U16_MAX)
1894 optype = OP( noper );
1898 regprop( mysv, first);
1899 PerlIO_printf( Perl_debug_log, "%*s%s",
1900 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1901 regprop( mysv, NEXTOPER(first) );
1902 PerlIO_printf( Perl_debug_log, " -> %s\n",
1903 SvPV_nolen_const( mysv ) );
1908 regprop( mysv, cur);
1909 PerlIO_printf( Perl_debug_log, "%*s%s",
1910 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1911 regprop( mysv, noper );
1912 PerlIO_printf( Perl_debug_log, " -> %s\n",
1913 SvPV_nolen_const( mysv ) );
1919 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1920 (int)depth * 2 + 2, "E:", "**END**" );
1922 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1924 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1925 && noper_next == tail )
1929 optype = OP( noper );
1939 regprop( mysv, cur);
1940 PerlIO_printf( Perl_debug_log,
1941 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1942 " ", SvPV_nolen_const( mysv ), first, last, cur);
1947 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1948 (int)depth * 2 + 2, "E:", "==END==" );
1950 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1956 scan = NEXTOPER(NEXTOPER(scan));
1957 } else /* single branch is optimized. */
1958 scan = NEXTOPER(scan);
1961 else if (OP(scan) == EXACT) {
1962 I32 l = STR_LEN(scan);
1965 const U8 * const s = (U8*)STRING(scan);
1966 l = utf8_length(s, s + l);
1967 uc = utf8_to_uvchr(s, NULL);
1969 uc = *((U8*)STRING(scan));
1972 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1973 /* The code below prefers earlier match for fixed
1974 offset, later match for variable offset. */
1975 if (data->last_end == -1) { /* Update the start info. */
1976 data->last_start_min = data->pos_min;
1977 data->last_start_max = is_inf
1978 ? I32_MAX : data->pos_min + data->pos_delta;
1980 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
1982 SvUTF8_on(data->last_found);
1984 SV * const sv = data->last_found;
1985 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1986 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1987 if (mg && mg->mg_len >= 0)
1988 mg->mg_len += utf8_length((U8*)STRING(scan),
1989 (U8*)STRING(scan)+STR_LEN(scan));
1991 data->last_end = data->pos_min + l;
1992 data->pos_min += l; /* As in the first entry. */
1993 data->flags &= ~SF_BEFORE_EOL;
1995 if (flags & SCF_DO_STCLASS_AND) {
1996 /* Check whether it is compatible with what we know already! */
2000 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2001 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2002 && (!(data->start_class->flags & ANYOF_FOLD)
2003 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2006 ANYOF_CLASS_ZERO(data->start_class);
2007 ANYOF_BITMAP_ZERO(data->start_class);
2009 ANYOF_BITMAP_SET(data->start_class, uc);
2010 data->start_class->flags &= ~ANYOF_EOS;
2012 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2014 else if (flags & SCF_DO_STCLASS_OR) {
2015 /* false positive possible if the class is case-folded */
2017 ANYOF_BITMAP_SET(data->start_class, uc);
2019 data->start_class->flags |= ANYOF_UNICODE_ALL;
2020 data->start_class->flags &= ~ANYOF_EOS;
2021 cl_and(data->start_class, &and_with);
2023 flags &= ~SCF_DO_STCLASS;
2025 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2026 I32 l = STR_LEN(scan);
2027 UV uc = *((U8*)STRING(scan));
2029 /* Search for fixed substrings supports EXACT only. */
2030 if (flags & SCF_DO_SUBSTR)
2031 scan_commit(pRExC_state, data);
2033 const U8 * const s = (U8 *)STRING(scan);
2034 l = utf8_length(s, s + l);
2035 uc = utf8_to_uvchr(s, NULL);
2038 if (data && (flags & SCF_DO_SUBSTR))
2040 if (flags & SCF_DO_STCLASS_AND) {
2041 /* Check whether it is compatible with what we know already! */
2045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2049 ANYOF_CLASS_ZERO(data->start_class);
2050 ANYOF_BITMAP_ZERO(data->start_class);
2052 ANYOF_BITMAP_SET(data->start_class, uc);
2053 data->start_class->flags &= ~ANYOF_EOS;
2054 data->start_class->flags |= ANYOF_FOLD;
2055 if (OP(scan) == EXACTFL)
2056 data->start_class->flags |= ANYOF_LOCALE;
2059 else if (flags & SCF_DO_STCLASS_OR) {
2060 if (data->start_class->flags & ANYOF_FOLD) {
2061 /* false positive possible if the class is case-folded.
2062 Assume that the locale settings are the same... */
2064 ANYOF_BITMAP_SET(data->start_class, uc);
2065 data->start_class->flags &= ~ANYOF_EOS;
2067 cl_and(data->start_class, &and_with);
2069 flags &= ~SCF_DO_STCLASS;
2071 else if (strchr((const char*)PL_varies,OP(scan))) {
2072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2073 I32 f = flags, pos_before = 0;
2074 regnode * const oscan = scan;
2075 struct regnode_charclass_class this_class;
2076 struct regnode_charclass_class *oclass = NULL;
2077 I32 next_is_eval = 0;
2079 switch (PL_regkind[(U8)OP(scan)]) {
2080 case WHILEM: /* End of (?:...)* . */
2081 scan = NEXTOPER(scan);
2084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2085 next = NEXTOPER(scan);
2086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2088 maxcount = REG_INFTY;
2089 next = regnext(scan);
2090 scan = NEXTOPER(scan);
2094 if (flags & SCF_DO_SUBSTR)
2099 if (flags & SCF_DO_STCLASS) {
2101 maxcount = REG_INFTY;
2102 next = regnext(scan);
2103 scan = NEXTOPER(scan);
2106 is_inf = is_inf_internal = 1;
2107 scan = regnext(scan);
2108 if (flags & SCF_DO_SUBSTR) {
2109 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2110 data->longest = &(data->longest_float);
2112 goto optimize_curly_tail;
2114 mincount = ARG1(scan);
2115 maxcount = ARG2(scan);
2116 next = regnext(scan);
2117 if (OP(scan) == CURLYX) {
2118 I32 lp = (data ? *(data->last_closep) : 0);
2119 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2121 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2122 next_is_eval = (OP(scan) == EVAL);
2124 if (flags & SCF_DO_SUBSTR) {
2125 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2126 pos_before = data->pos_min;
2130 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2132 data->flags |= SF_IS_INF;
2134 if (flags & SCF_DO_STCLASS) {
2135 cl_init(pRExC_state, &this_class);
2136 oclass = data->start_class;
2137 data->start_class = &this_class;
2138 f |= SCF_DO_STCLASS_AND;
2139 f &= ~SCF_DO_STCLASS_OR;
2141 /* These are the cases when once a subexpression
2142 fails at a particular position, it cannot succeed
2143 even after backtracking at the enclosing scope.
2145 XXXX what if minimal match and we are at the
2146 initial run of {n,m}? */
2147 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2148 f &= ~SCF_WHILEM_VISITED_POS;
2150 /* This will finish on WHILEM, setting scan, or on NULL: */
2151 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2153 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2155 if (flags & SCF_DO_STCLASS)
2156 data->start_class = oclass;
2157 if (mincount == 0 || minnext == 0) {
2158 if (flags & SCF_DO_STCLASS_OR) {
2159 cl_or(pRExC_state, data->start_class, &this_class);
2161 else if (flags & SCF_DO_STCLASS_AND) {
2162 /* Switch to OR mode: cache the old value of
2163 * data->start_class */
2164 StructCopy(data->start_class, &and_with,
2165 struct regnode_charclass_class);
2166 flags &= ~SCF_DO_STCLASS_AND;
2167 StructCopy(&this_class, data->start_class,
2168 struct regnode_charclass_class);
2169 flags |= SCF_DO_STCLASS_OR;
2170 data->start_class->flags |= ANYOF_EOS;
2172 } else { /* Non-zero len */
2173 if (flags & SCF_DO_STCLASS_OR) {
2174 cl_or(pRExC_state, data->start_class, &this_class);
2175 cl_and(data->start_class, &and_with);
2177 else if (flags & SCF_DO_STCLASS_AND)
2178 cl_and(data->start_class, &this_class);
2179 flags &= ~SCF_DO_STCLASS;
2181 if (!scan) /* It was not CURLYX, but CURLY. */
2183 if ( /* ? quantifier ok, except for (?{ ... }) */
2184 (next_is_eval || !(mincount == 0 && maxcount == 1))
2185 && (minnext == 0) && (deltanext == 0)
2186 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2187 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2188 && ckWARN(WARN_REGEXP))
2191 "Quantifier unexpected on zero-length expression");
2194 min += minnext * mincount;
2195 is_inf_internal |= ((maxcount == REG_INFTY
2196 && (minnext + deltanext) > 0)
2197 || deltanext == I32_MAX);
2198 is_inf |= is_inf_internal;
2199 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2201 /* Try powerful optimization CURLYX => CURLYN. */
2202 if ( OP(oscan) == CURLYX && data
2203 && data->flags & SF_IN_PAR
2204 && !(data->flags & SF_HAS_EVAL)
2205 && !deltanext && minnext == 1 ) {
2206 /* Try to optimize to CURLYN. */
2207 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2208 regnode * const nxt1 = nxt;
2215 if (!strchr((const char*)PL_simple,OP(nxt))
2216 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2217 && STR_LEN(nxt) == 1))
2223 if (OP(nxt) != CLOSE)
2225 /* Now we know that nxt2 is the only contents: */
2226 oscan->flags = (U8)ARG(nxt);
2228 OP(nxt1) = NOTHING; /* was OPEN. */
2230 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2231 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2232 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2233 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2234 OP(nxt + 1) = OPTIMIZED; /* was count. */
2235 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2240 /* Try optimization CURLYX => CURLYM. */
2241 if ( OP(oscan) == CURLYX && data
2242 && !(data->flags & SF_HAS_PAR)
2243 && !(data->flags & SF_HAS_EVAL)
2244 && !deltanext /* atom is fixed width */
2245 && minnext != 0 /* CURLYM can't handle zero width */
2247 /* XXXX How to optimize if data == 0? */
2248 /* Optimize to a simpler form. */
2249 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2253 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2254 && (OP(nxt2) != WHILEM))
2256 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2257 /* Need to optimize away parenths. */
2258 if (data->flags & SF_IN_PAR) {
2259 /* Set the parenth number. */
2260 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2262 if (OP(nxt) != CLOSE)
2263 FAIL("Panic opt close");
2264 oscan->flags = (U8)ARG(nxt);
2265 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2266 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2268 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2271 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2274 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2275 regnode *nnxt = regnext(nxt1);
2278 if (reg_off_by_arg[OP(nxt1)])
2279 ARG_SET(nxt1, nxt2 - nxt1);
2280 else if (nxt2 - nxt1 < U16_MAX)
2281 NEXT_OFF(nxt1) = nxt2 - nxt1;
2283 OP(nxt) = NOTHING; /* Cannot beautify */
2288 /* Optimize again: */
2289 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2295 else if ((OP(oscan) == CURLYX)
2296 && (flags & SCF_WHILEM_VISITED_POS)
2297 /* See the comment on a similar expression above.
2298 However, this time it not a subexpression
2299 we care about, but the expression itself. */
2300 && (maxcount == REG_INFTY)
2301 && data && ++data->whilem_c < 16) {
2302 /* This stays as CURLYX, we can put the count/of pair. */
2303 /* Find WHILEM (as in regexec.c) */
2304 regnode *nxt = oscan + NEXT_OFF(oscan);
2306 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2308 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2309 | (RExC_whilem_seen << 4)); /* On WHILEM */
2311 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2313 if (flags & SCF_DO_SUBSTR) {
2314 SV *last_str = NULL;
2315 int counted = mincount != 0;
2317 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2318 #if defined(SPARC64_GCC_WORKAROUND)
2321 const char *s = NULL;
2324 if (pos_before >= data->last_start_min)
2327 b = data->last_start_min;
2330 s = SvPV_const(data->last_found, l);
2331 old = b - data->last_start_min;
2334 I32 b = pos_before >= data->last_start_min
2335 ? pos_before : data->last_start_min;
2337 const char * const s = SvPV_const(data->last_found, l);
2338 I32 old = b - data->last_start_min;
2342 old = utf8_hop((U8*)s, old) - (U8*)s;
2345 /* Get the added string: */
2346 last_str = newSVpvn(s + old, l);
2348 SvUTF8_on(last_str);
2349 if (deltanext == 0 && pos_before == b) {
2350 /* What was added is a constant string */
2352 SvGROW(last_str, (mincount * l) + 1);
2353 repeatcpy(SvPVX(last_str) + l,
2354 SvPVX_const(last_str), l, mincount - 1);
2355 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2356 /* Add additional parts. */
2357 SvCUR_set(data->last_found,
2358 SvCUR(data->last_found) - l);
2359 sv_catsv(data->last_found, last_str);
2361 SV * sv = data->last_found;
2363 SvUTF8(sv) && SvMAGICAL(sv) ?
2364 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2365 if (mg && mg->mg_len >= 0)
2366 mg->mg_len += CHR_SVLEN(last_str);
2368 data->last_end += l * (mincount - 1);
2371 /* start offset must point into the last copy */
2372 data->last_start_min += minnext * (mincount - 1);
2373 data->last_start_max += is_inf ? I32_MAX
2374 : (maxcount - 1) * (minnext + data->pos_delta);
2377 /* It is counted once already... */
2378 data->pos_min += minnext * (mincount - counted);
2379 data->pos_delta += - counted * deltanext +
2380 (minnext + deltanext) * maxcount - minnext * mincount;
2381 if (mincount != maxcount) {
2382 /* Cannot extend fixed substrings found inside
2384 scan_commit(pRExC_state,data);
2385 if (mincount && last_str) {
2386 SV * const sv = data->last_found;
2387 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2388 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2392 sv_setsv(sv, last_str);
2393 data->last_end = data->pos_min;
2394 data->last_start_min =
2395 data->pos_min - CHR_SVLEN(last_str);
2396 data->last_start_max = is_inf
2398 : data->pos_min + data->pos_delta
2399 - CHR_SVLEN(last_str);
2401 data->longest = &(data->longest_float);
2403 SvREFCNT_dec(last_str);
2405 if (data && (fl & SF_HAS_EVAL))
2406 data->flags |= SF_HAS_EVAL;
2407 optimize_curly_tail:
2408 if (OP(oscan) != CURLYX) {
2409 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2411 NEXT_OFF(oscan) += NEXT_OFF(next);
2414 default: /* REF and CLUMP only? */
2415 if (flags & SCF_DO_SUBSTR) {
2416 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2417 data->longest = &(data->longest_float);
2419 is_inf = is_inf_internal = 1;
2420 if (flags & SCF_DO_STCLASS_OR)
2421 cl_anything(pRExC_state, data->start_class);
2422 flags &= ~SCF_DO_STCLASS;
2426 else if (strchr((const char*)PL_simple,OP(scan))) {
2429 if (flags & SCF_DO_SUBSTR) {
2430 scan_commit(pRExC_state,data);
2434 if (flags & SCF_DO_STCLASS) {
2435 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2437 /* Some of the logic below assumes that switching
2438 locale on will only add false positives. */
2439 switch (PL_regkind[(U8)OP(scan)]) {
2443 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2444 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2445 cl_anything(pRExC_state, data->start_class);
2448 if (OP(scan) == SANY)
2450 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2451 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2452 || (data->start_class->flags & ANYOF_CLASS));
2453 cl_anything(pRExC_state, data->start_class);
2455 if (flags & SCF_DO_STCLASS_AND || !value)
2456 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2459 if (flags & SCF_DO_STCLASS_AND)
2460 cl_and(data->start_class,
2461 (struct regnode_charclass_class*)scan);
2463 cl_or(pRExC_state, data->start_class,
2464 (struct regnode_charclass_class*)scan);
2467 if (flags & SCF_DO_STCLASS_AND) {
2468 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2469 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2470 for (value = 0; value < 256; value++)
2471 if (!isALNUM(value))
2472 ANYOF_BITMAP_CLEAR(data->start_class, value);
2476 if (data->start_class->flags & ANYOF_LOCALE)
2477 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2479 for (value = 0; value < 256; value++)
2481 ANYOF_BITMAP_SET(data->start_class, value);
2486 if (flags & SCF_DO_STCLASS_AND) {
2487 if (data->start_class->flags & ANYOF_LOCALE)
2488 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2492 data->start_class->flags |= ANYOF_LOCALE;
2496 if (flags & SCF_DO_STCLASS_AND) {
2497 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2499 for (value = 0; value < 256; value++)
2501 ANYOF_BITMAP_CLEAR(data->start_class, value);
2505 if (data->start_class->flags & ANYOF_LOCALE)
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2508 for (value = 0; value < 256; value++)
2509 if (!isALNUM(value))
2510 ANYOF_BITMAP_SET(data->start_class, value);
2515 if (flags & SCF_DO_STCLASS_AND) {
2516 if (data->start_class->flags & ANYOF_LOCALE)
2517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2520 data->start_class->flags |= ANYOF_LOCALE;
2521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2525 if (flags & SCF_DO_STCLASS_AND) {
2526 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2528 for (value = 0; value < 256; value++)
2529 if (!isSPACE(value))
2530 ANYOF_BITMAP_CLEAR(data->start_class, value);
2534 if (data->start_class->flags & ANYOF_LOCALE)
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2537 for (value = 0; value < 256; value++)
2539 ANYOF_BITMAP_SET(data->start_class, value);
2544 if (flags & SCF_DO_STCLASS_AND) {
2545 if (data->start_class->flags & ANYOF_LOCALE)
2546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2549 data->start_class->flags |= ANYOF_LOCALE;
2550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2554 if (flags & SCF_DO_STCLASS_AND) {
2555 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2557 for (value = 0; value < 256; value++)
2559 ANYOF_BITMAP_CLEAR(data->start_class, value);
2563 if (data->start_class->flags & ANYOF_LOCALE)
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2566 for (value = 0; value < 256; value++)
2567 if (!isSPACE(value))
2568 ANYOF_BITMAP_SET(data->start_class, value);
2573 if (flags & SCF_DO_STCLASS_AND) {
2574 if (data->start_class->flags & ANYOF_LOCALE) {
2575 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2576 for (value = 0; value < 256; value++)
2577 if (!isSPACE(value))
2578 ANYOF_BITMAP_CLEAR(data->start_class, value);
2582 data->start_class->flags |= ANYOF_LOCALE;
2583 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2587 if (flags & SCF_DO_STCLASS_AND) {
2588 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2589 for (value = 0; value < 256; value++)
2590 if (!isDIGIT(value))
2591 ANYOF_BITMAP_CLEAR(data->start_class, value);
2594 if (data->start_class->flags & ANYOF_LOCALE)
2595 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2597 for (value = 0; value < 256; value++)
2599 ANYOF_BITMAP_SET(data->start_class, value);
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2606 for (value = 0; value < 256; value++)
2608 ANYOF_BITMAP_CLEAR(data->start_class, value);
2611 if (data->start_class->flags & ANYOF_LOCALE)
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2614 for (value = 0; value < 256; value++)
2615 if (!isDIGIT(value))
2616 ANYOF_BITMAP_SET(data->start_class, value);
2621 if (flags & SCF_DO_STCLASS_OR)
2622 cl_and(data->start_class, &and_with);
2623 flags &= ~SCF_DO_STCLASS;
2626 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2627 data->flags |= (OP(scan) == MEOL
2631 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2632 /* Lookbehind, or need to calculate parens/evals/stclass: */
2633 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2634 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2635 /* Lookahead/lookbehind */
2636 I32 deltanext, minnext, fake = 0;
2638 struct regnode_charclass_class intrnl;
2641 data_fake.flags = 0;
2643 data_fake.whilem_c = data->whilem_c;
2644 data_fake.last_closep = data->last_closep;
2647 data_fake.last_closep = &fake;
2648 if ( flags & SCF_DO_STCLASS && !scan->flags
2649 && OP(scan) == IFMATCH ) { /* Lookahead */
2650 cl_init(pRExC_state, &intrnl);
2651 data_fake.start_class = &intrnl;
2652 f |= SCF_DO_STCLASS_AND;
2654 if (flags & SCF_WHILEM_VISITED_POS)
2655 f |= SCF_WHILEM_VISITED_POS;
2656 next = regnext(scan);
2657 nscan = NEXTOPER(NEXTOPER(scan));
2658 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2661 vFAIL("Variable length lookbehind not implemented");
2663 else if (minnext > U8_MAX) {
2664 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2666 scan->flags = (U8)minnext;
2668 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2670 if (data && (data_fake.flags & SF_HAS_EVAL))
2671 data->flags |= SF_HAS_EVAL;
2673 data->whilem_c = data_fake.whilem_c;
2674 if (f & SCF_DO_STCLASS_AND) {
2675 const int was = (data->start_class->flags & ANYOF_EOS);
2677 cl_and(data->start_class, &intrnl);
2679 data->start_class->flags |= ANYOF_EOS;
2682 else if (OP(scan) == OPEN) {
2685 else if (OP(scan) == CLOSE) {
2686 if ((I32)ARG(scan) == is_par) {
2687 next = regnext(scan);
2689 if ( next && (OP(next) != WHILEM) && next < last)
2690 is_par = 0; /* Disable optimization */
2693 *(data->last_closep) = ARG(scan);
2695 else if (OP(scan) == EVAL) {
2697 data->flags |= SF_HAS_EVAL;
2699 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2700 if (flags & SCF_DO_SUBSTR) {
2701 scan_commit(pRExC_state,data);
2702 data->longest = &(data->longest_float);
2704 is_inf = is_inf_internal = 1;
2705 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2706 cl_anything(pRExC_state, data->start_class);
2707 flags &= ~SCF_DO_STCLASS;
2709 /* Else: zero-length, ignore. */
2710 scan = regnext(scan);
2715 *deltap = is_inf_internal ? I32_MAX : delta;
2716 if (flags & SCF_DO_SUBSTR && is_inf)
2717 data->pos_delta = I32_MAX - data->pos_min;
2718 if (is_par > U8_MAX)
2720 if (is_par && pars==1 && data) {
2721 data->flags |= SF_IN_PAR;
2722 data->flags &= ~SF_HAS_PAR;
2724 else if (pars && data) {
2725 data->flags |= SF_HAS_PAR;
2726 data->flags &= ~SF_IN_PAR;
2728 if (flags & SCF_DO_STCLASS_OR)
2729 cl_and(data->start_class, &and_with);
2734 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2736 if (RExC_rx->data) {
2737 Renewc(RExC_rx->data,
2738 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2739 char, struct reg_data);
2740 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2741 RExC_rx->data->count += n;
2744 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2745 char, struct reg_data);
2746 Newx(RExC_rx->data->what, n, U8);
2747 RExC_rx->data->count = n;
2749 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2750 return RExC_rx->data->count - n;
2754 Perl_reginitcolors(pTHX)
2757 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2759 char *t = savepv(s);
2763 t = strchr(t, '\t');
2769 PL_colors[i] = t = (char *)"";
2774 PL_colors[i++] = (char *)"";
2781 - pregcomp - compile a regular expression into internal code
2783 * We can't allocate space until we know how big the compiled form will be,
2784 * but we can't compile it (and thus know how big it is) until we've got a
2785 * place to put the code. So we cheat: we compile it twice, once with code
2786 * generation turned off and size counting turned on, and once "for real".
2787 * This also means that we don't allocate space until we are sure that the
2788 * thing really will compile successfully, and we never have to move the
2789 * code and thus invalidate pointers into it. (Note that it has to be in
2790 * one piece because free() must be able to free it all.) [NB: not true in perl]
2792 * Beware that the optimization-preparation code in here knows about some
2793 * of the structure of the compiled regexp. [I'll say.]
2796 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2807 RExC_state_t RExC_state;
2808 RExC_state_t *pRExC_state = &RExC_state;
2810 GET_RE_DEBUG_FLAGS_DECL;
2813 FAIL("NULL regexp argument");
2815 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2818 DEBUG_r(if (!PL_colorset) reginitcolors());
2820 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2821 PL_colors[4],PL_colors[5],PL_colors[0],
2822 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2824 RExC_flags = pm->op_pmflags;
2828 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2829 RExC_seen_evals = 0;
2832 /* First pass: determine size, legality. */
2839 RExC_emit = &PL_regdummy;
2840 RExC_whilem_seen = 0;
2841 #if 0 /* REGC() is (currently) a NOP at the first pass.
2842 * Clever compilers notice this and complain. --jhi */
2843 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2845 if (reg(pRExC_state, 0, &flags) == NULL) {
2846 RExC_precomp = NULL;
2849 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2851 /* Small enough for pointer-storage convention?
2852 If extralen==0, this means that we will not need long jumps. */
2853 if (RExC_size >= 0x10000L && RExC_extralen)
2854 RExC_size += RExC_extralen;
2857 if (RExC_whilem_seen > 15)
2858 RExC_whilem_seen = 15;
2860 /* Allocate space and initialize. */
2861 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2864 FAIL("Regexp out of space");
2867 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2868 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2871 r->prelen = xend - exp;
2872 r->precomp = savepvn(RExC_precomp, r->prelen);
2874 #ifdef PERL_OLD_COPY_ON_WRITE
2875 r->saved_copy = NULL;
2877 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2878 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2879 r->lastparen = 0; /* mg.c reads this. */
2881 r->substrs = 0; /* Useful during FAIL. */
2882 r->startp = 0; /* Useful during FAIL. */
2883 r->endp = 0; /* Useful during FAIL. */
2885 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2887 r->offsets[0] = RExC_size;
2889 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2890 "%s %"UVuf" bytes for offset annotations.\n",
2891 r->offsets ? "Got" : "Couldn't get",
2892 (UV)((2*RExC_size+1) * sizeof(U32))));
2896 /* Second pass: emit code. */
2897 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2902 RExC_emit_start = r->program;
2903 RExC_emit = r->program;
2904 /* Store the count of eval-groups for security checks: */
2905 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2906 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2908 if (reg(pRExC_state, 0, &flags) == NULL)
2912 /* Dig out information for optimizations. */
2913 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2914 pm->op_pmflags = RExC_flags;
2916 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2917 r->regstclass = NULL;
2918 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2919 r->reganch |= ROPT_NAUGHTY;
2920 scan = r->program + 1; /* First BRANCH. */
2922 /* XXXX To minimize changes to RE engine we always allocate
2923 3-units-long substrs field. */
2924 Newxz(r->substrs, 1, struct reg_substr_data);
2926 StructCopy(&zero_scan_data, &data, scan_data_t);
2927 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2928 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2930 STRLEN longest_float_length, longest_fixed_length;
2931 struct regnode_charclass_class ch_class;
2936 /* Skip introductions and multiplicators >= 1. */
2937 while ((OP(first) == OPEN && (sawopen = 1)) ||
2938 /* An OR of *one* alternative - should not happen now. */
2939 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2940 (OP(first) == PLUS) ||
2941 (OP(first) == MINMOD) ||
2942 /* An {n,m} with n>0 */
2943 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2944 if (OP(first) == PLUS)
2947 first += regarglen[(U8)OP(first)];
2948 first = NEXTOPER(first);
2951 /* Starting-point info. */
2953 if (PL_regkind[(U8)OP(first)] == EXACT) {
2954 if (OP(first) == EXACT)
2955 /*EMPTY*/; /* Empty, get anchored substr later. */
2956 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2957 r->regstclass = first;
2959 else if (strchr((const char*)PL_simple,OP(first)))
2960 r->regstclass = first;
2961 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2962 PL_regkind[(U8)OP(first)] == NBOUND)
2963 r->regstclass = first;
2964 else if (PL_regkind[(U8)OP(first)] == BOL) {
2965 r->reganch |= (OP(first) == MBOL
2967 : (OP(first) == SBOL
2970 first = NEXTOPER(first);
2973 else if (OP(first) == GPOS) {
2974 r->reganch |= ROPT_ANCH_GPOS;
2975 first = NEXTOPER(first);
2978 else if (!sawopen && (OP(first) == STAR &&
2979 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2980 !(r->reganch & ROPT_ANCH) )
2982 /* turn .* into ^.* with an implied $*=1 */
2984 (OP(NEXTOPER(first)) == REG_ANY)
2987 r->reganch |= type | ROPT_IMPLICIT;
2988 first = NEXTOPER(first);
2991 if (sawplus && (!sawopen || !RExC_sawback)
2992 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
2993 /* x+ must match at the 1st pos of run of x's */
2994 r->reganch |= ROPT_SKIP;
2996 /* Scan is after the zeroth branch, first is atomic matcher. */
2997 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
2998 (IV)(first - scan + 1)));
3000 * If there's something expensive in the r.e., find the
3001 * longest literal string that must appear and make it the
3002 * regmust. Resolve ties in favor of later strings, since
3003 * the regstart check works with the beginning of the r.e.
3004 * and avoiding duplication strengthens checking. Not a
3005 * strong reason, but sufficient in the absence of others.
3006 * [Now we resolve ties in favor of the earlier string if
3007 * it happens that c_offset_min has been invalidated, since the
3008 * earlier string may buy us something the later one won't.]
3012 data.longest_fixed = newSVpvs("");
3013 data.longest_float = newSVpvs("");
3014 data.last_found = newSVpvs("");
3015 data.longest = &(data.longest_fixed);
3017 if (!r->regstclass) {
3018 cl_init(pRExC_state, &ch_class);
3019 data.start_class = &ch_class;
3020 stclass_flag = SCF_DO_STCLASS_AND;
3021 } else /* XXXX Check for BOUND? */
3023 data.last_closep = &last_close;
3025 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3026 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3027 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3028 && data.last_start_min == 0 && data.last_end > 0
3029 && !RExC_seen_zerolen
3030 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3031 r->reganch |= ROPT_CHECK_ALL;
3032 scan_commit(pRExC_state, &data);
3033 SvREFCNT_dec(data.last_found);
3035 longest_float_length = CHR_SVLEN(data.longest_float);
3036 if (longest_float_length
3037 || (data.flags & SF_FL_BEFORE_EOL
3038 && (!(data.flags & SF_FL_BEFORE_MEOL)
3039 || (RExC_flags & PMf_MULTILINE)))) {
3042 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3043 && data.offset_fixed == data.offset_float_min
3044 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3045 goto remove_float; /* As in (a)+. */
3047 if (SvUTF8(data.longest_float)) {
3048 r->float_utf8 = data.longest_float;
3049 r->float_substr = NULL;
3051 r->float_substr = data.longest_float;
3052 r->float_utf8 = NULL;
3054 r->float_min_offset = data.offset_float_min;
3055 r->float_max_offset = data.offset_float_max;
3056 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3057 && (!(data.flags & SF_FL_BEFORE_MEOL)
3058 || (RExC_flags & PMf_MULTILINE)));
3059 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3063 r->float_substr = r->float_utf8 = NULL;
3064 SvREFCNT_dec(data.longest_float);
3065 longest_float_length = 0;
3068 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3069 if (longest_fixed_length
3070 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3071 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3072 || (RExC_flags & PMf_MULTILINE)))) {
3075 if (SvUTF8(data.longest_fixed)) {
3076 r->anchored_utf8 = data.longest_fixed;
3077 r->anchored_substr = NULL;
3079 r->anchored_substr = data.longest_fixed;
3080 r->anchored_utf8 = NULL;
3082 r->anchored_offset = data.offset_fixed;
3083 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3084 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3085 || (RExC_flags & PMf_MULTILINE)));
3086 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3089 r->anchored_substr = r->anchored_utf8 = NULL;
3090 SvREFCNT_dec(data.longest_fixed);
3091 longest_fixed_length = 0;
3094 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3095 r->regstclass = NULL;
3096 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3098 && !(data.start_class->flags & ANYOF_EOS)
3099 && !cl_is_anything(data.start_class))
3101 const I32 n = add_data(pRExC_state, 1, "f");
3103 Newx(RExC_rx->data->data[n], 1,
3104 struct regnode_charclass_class);
3105 StructCopy(data.start_class,
3106 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3107 struct regnode_charclass_class);
3108 r->regstclass = (regnode*)RExC_rx->data->data[n];
3109 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3110 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3111 regprop(sv, (regnode*)data.start_class);
3112 PerlIO_printf(Perl_debug_log,
3113 "synthetic stclass \"%s\".\n",
3114 SvPVX_const(sv));});
3117 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3118 if (longest_fixed_length > longest_float_length) {
3119 r->check_substr = r->anchored_substr;
3120 r->check_utf8 = r->anchored_utf8;
3121 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3122 if (r->reganch & ROPT_ANCH_SINGLE)
3123 r->reganch |= ROPT_NOSCAN;
3126 r->check_substr = r->float_substr;
3127 r->check_utf8 = r->float_utf8;
3128 r->check_offset_min = data.offset_float_min;
3129 r->check_offset_max = data.offset_float_max;
3131 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3132 This should be changed ASAP! */
3133 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3134 r->reganch |= RE_USE_INTUIT;
3135 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3136 r->reganch |= RE_INTUIT_TAIL;
3140 /* Several toplevels. Best we can is to set minlen. */
3142 struct regnode_charclass_class ch_class;
3145 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3146 scan = r->program + 1;
3147 cl_init(pRExC_state, &ch_class);
3148 data.start_class = &ch_class;
3149 data.last_closep = &last_close;
3150 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3151 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3152 = r->float_substr = r->float_utf8 = NULL;
3153 if (!(data.start_class->flags & ANYOF_EOS)
3154 && !cl_is_anything(data.start_class))
3156 const I32 n = add_data(pRExC_state, 1, "f");
3158 Newx(RExC_rx->data->data[n], 1,
3159 struct regnode_charclass_class);
3160 StructCopy(data.start_class,
3161 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3162 struct regnode_charclass_class);
3163 r->regstclass = (regnode*)RExC_rx->data->data[n];
3164 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3165 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3166 regprop(sv, (regnode*)data.start_class);
3167 PerlIO_printf(Perl_debug_log,
3168 "synthetic stclass \"%s\".\n",
3169 SvPVX_const(sv));});
3174 if (RExC_seen & REG_SEEN_GPOS)
3175 r->reganch |= ROPT_GPOS_SEEN;
3176 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3177 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3178 if (RExC_seen & REG_SEEN_EVAL)
3179 r->reganch |= ROPT_EVAL_SEEN;
3180 if (RExC_seen & REG_SEEN_CANY)
3181 r->reganch |= ROPT_CANY_SEEN;
3182 Newxz(r->startp, RExC_npar, I32);
3183 Newxz(r->endp, RExC_npar, I32);
3184 DEBUG_COMPILE_r(regdump(r));
3189 - reg - regular expression, i.e. main body or parenthesized thing
3191 * Caller must absorb opening parenthesis.
3193 * Combining parenthesis handling with the base level of regular expression
3194 * is a trifle forced, but the need to tie the tails of the branches to what
3195 * follows makes it hard to avoid.
3198 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3199 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3202 register regnode *ret; /* Will be the head of the group. */
3203 register regnode *br;
3204 register regnode *lastbr;
3205 register regnode *ender = NULL;
3206 register I32 parno = 0;
3208 const I32 oregflags = RExC_flags;
3209 bool have_branch = 0;
3212 /* for (?g), (?gc), and (?o) warnings; warning
3213 about (?c) will warn about (?g) -- japhy */
3215 #define WASTED_O 0x01
3216 #define WASTED_G 0x02
3217 #define WASTED_C 0x04
3218 #define WASTED_GC (0x02|0x04)
3219 I32 wastedflags = 0x00;
3221 char * parse_start = RExC_parse; /* MJD */
3222 char * const oregcomp_parse = RExC_parse;
3224 *flagp = 0; /* Tentatively. */
3227 /* Make an OPEN node, if parenthesized. */
3229 if (*RExC_parse == '?') { /* (?...) */
3230 U32 posflags = 0, negflags = 0;
3231 U32 *flagsp = &posflags;
3232 bool is_logical = 0;
3233 const char * const seqstart = RExC_parse;
3236 paren = *RExC_parse++;
3237 ret = NULL; /* For look-ahead/behind. */
3239 case '<': /* (?<...) */
3240 RExC_seen |= REG_SEEN_LOOKBEHIND;
3241 if (*RExC_parse == '!')
3243 if (*RExC_parse != '=' && *RExC_parse != '!')
3246 case '=': /* (?=...) */
3247 case '!': /* (?!...) */
3248 RExC_seen_zerolen++;
3249 case ':': /* (?:...) */
3250 case '>': /* (?>...) */
3252 case '$': /* (?$...) */
3253 case '@': /* (?@...) */
3254 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3256 case '#': /* (?#...) */
3257 while (*RExC_parse && *RExC_parse != ')')
3259 if (*RExC_parse != ')')
3260 FAIL("Sequence (?#... not terminated");
3261 nextchar(pRExC_state);
3264 case 'p': /* (?p...) */
3265 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3266 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3268 case '?': /* (??...) */
3270 if (*RExC_parse != '{')
3272 paren = *RExC_parse++;
3274 case '{': /* (?{...}) */
3276 I32 count = 1, n = 0;
3278 char *s = RExC_parse;
3280 RExC_seen_zerolen++;
3281 RExC_seen |= REG_SEEN_EVAL;
3282 while (count && (c = *RExC_parse)) {
3293 if (*RExC_parse != ')') {
3295 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3299 OP_4tree *sop, *rop;
3300 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3303 Perl_save_re_context(aTHX);
3304 rop = sv_compile_2op(sv, &sop, "re", &pad);
3305 sop->op_private |= OPpREFCOUNTED;
3306 /* re_dup will OpREFCNT_inc */
3307 OpREFCNT_set(sop, 1);
3310 n = add_data(pRExC_state, 3, "nop");
3311 RExC_rx->data->data[n] = (void*)rop;
3312 RExC_rx->data->data[n+1] = (void*)sop;
3313 RExC_rx->data->data[n+2] = (void*)pad;
3316 else { /* First pass */
3317 if (PL_reginterp_cnt < ++RExC_seen_evals
3319 /* No compiled RE interpolated, has runtime
3320 components ===> unsafe. */
3321 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3322 if (PL_tainting && PL_tainted)
3323 FAIL("Eval-group in insecure regular expression");
3324 if (IN_PERL_COMPILETIME)
3328 nextchar(pRExC_state);
3330 ret = reg_node(pRExC_state, LOGICAL);
3333 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3334 /* deal with the length of this later - MJD */
3337 ret = reganode(pRExC_state, EVAL, n);
3338 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3339 Set_Node_Offset(ret, parse_start);
3342 case '(': /* (?(?{...})...) and (?(?=...)...) */
3344 if (RExC_parse[0] == '?') { /* (?(?...)) */
3345 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3346 || RExC_parse[1] == '<'
3347 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3350 ret = reg_node(pRExC_state, LOGICAL);
3353 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3357 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3360 parno = atoi(RExC_parse++);
3362 while (isDIGIT(*RExC_parse))
3364 ret = reganode(pRExC_state, GROUPP, parno);
3366 if ((c = *nextchar(pRExC_state)) != ')')
3367 vFAIL("Switch condition not recognized");
3369 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3370 br = regbranch(pRExC_state, &flags, 1);
3372 br = reganode(pRExC_state, LONGJMP, 0);
3374 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3375 c = *nextchar(pRExC_state);
3379 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3380 regbranch(pRExC_state, &flags, 1);
3381 regtail(pRExC_state, ret, lastbr);
3384 c = *nextchar(pRExC_state);
3389 vFAIL("Switch (?(condition)... contains too many branches");
3390 ender = reg_node(pRExC_state, TAIL);
3391 regtail(pRExC_state, br, ender);
3393 regtail(pRExC_state, lastbr, ender);
3394 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3397 regtail(pRExC_state, ret, ender);
3401 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3405 RExC_parse--; /* for vFAIL to print correctly */
3406 vFAIL("Sequence (? incomplete");
3410 parse_flags: /* (?i) */
3411 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3412 /* (?g), (?gc) and (?o) are useless here
3413 and must be globally applied -- japhy */
3415 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3416 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3417 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3418 if (! (wastedflags & wflagbit) ) {
3419 wastedflags |= wflagbit;
3422 "Useless (%s%c) - %suse /%c modifier",
3423 flagsp == &negflags ? "?-" : "?",
3425 flagsp == &negflags ? "don't " : "",
3431 else if (*RExC_parse == 'c') {
3432 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3433 if (! (wastedflags & WASTED_C) ) {
3434 wastedflags |= WASTED_GC;
3437 "Useless (%sc) - %suse /gc modifier",
3438 flagsp == &negflags ? "?-" : "?",
3439 flagsp == &negflags ? "don't " : ""
3444 else { pmflag(flagsp, *RExC_parse); }
3448 if (*RExC_parse == '-') {
3450 wastedflags = 0; /* reset so (?g-c) warns twice */
3454 RExC_flags |= posflags;
3455 RExC_flags &= ~negflags;
3456 if (*RExC_parse == ':') {
3462 if (*RExC_parse != ')') {
3464 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3466 nextchar(pRExC_state);
3474 ret = reganode(pRExC_state, OPEN, parno);
3475 Set_Node_Length(ret, 1); /* MJD */
3476 Set_Node_Offset(ret, RExC_parse); /* MJD */
3483 /* Pick up the branches, linking them together. */
3484 parse_start = RExC_parse; /* MJD */
3485 br = regbranch(pRExC_state, &flags, 1);
3486 /* branch_len = (paren != 0); */
3490 if (*RExC_parse == '|') {
3491 if (!SIZE_ONLY && RExC_extralen) {
3492 reginsert(pRExC_state, BRANCHJ, br);
3495 reginsert(pRExC_state, BRANCH, br);
3496 Set_Node_Length(br, paren != 0);
3497 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3501 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3503 else if (paren == ':') {
3504 *flagp |= flags&SIMPLE;
3506 if (is_open) { /* Starts with OPEN. */
3507 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3509 else if (paren != '?') /* Not Conditional */
3511 *flagp |= flags & (SPSTART | HASWIDTH);
3513 while (*RExC_parse == '|') {
3514 if (!SIZE_ONLY && RExC_extralen) {
3515 ender = reganode(pRExC_state, LONGJMP,0);
3516 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3519 RExC_extralen += 2; /* Account for LONGJMP. */
3520 nextchar(pRExC_state);
3521 br = regbranch(pRExC_state, &flags, 0);
3525 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3529 *flagp |= flags&SPSTART;
3532 if (have_branch || paren != ':') {
3533 /* Make a closing node, and hook it on the end. */
3536 ender = reg_node(pRExC_state, TAIL);
3539 ender = reganode(pRExC_state, CLOSE, parno);
3540 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3541 Set_Node_Length(ender,1); /* MJD */
3547 *flagp &= ~HASWIDTH;
3550 ender = reg_node(pRExC_state, SUCCEED);
3553 ender = reg_node(pRExC_state, END);
3556 regtail(pRExC_state, lastbr, ender);
3559 /* Hook the tails of the branches to the closing node. */
3560 for (br = ret; br != NULL; br = regnext(br)) {
3561 regoptail(pRExC_state, br, ender);
3568 static const char parens[] = "=!<,>";
3570 if (paren && (p = strchr(parens, paren))) {
3571 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3572 int flag = (p - parens) > 1;
3575 node = SUSPEND, flag = 0;
3576 reginsert(pRExC_state, node,ret);
3577 Set_Node_Cur_Length(ret);
3578 Set_Node_Offset(ret, parse_start + 1);
3580 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3584 /* Check for proper termination. */
3586 RExC_flags = oregflags;
3587 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3588 RExC_parse = oregcomp_parse;
3589 vFAIL("Unmatched (");
3592 else if (!paren && RExC_parse < RExC_end) {
3593 if (*RExC_parse == ')') {
3595 vFAIL("Unmatched )");
3598 FAIL("Junk on end of regexp"); /* "Can't happen". */
3606 - regbranch - one alternative of an | operator
3608 * Implements the concatenation operator.
3611 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3614 register regnode *ret;
3615 register regnode *chain = NULL;
3616 register regnode *latest;
3617 I32 flags = 0, c = 0;
3622 if (!SIZE_ONLY && RExC_extralen)
3623 ret = reganode(pRExC_state, BRANCHJ,0);
3625 ret = reg_node(pRExC_state, BRANCH);
3626 Set_Node_Length(ret, 1);
3630 if (!first && SIZE_ONLY)
3631 RExC_extralen += 1; /* BRANCHJ */
3633 *flagp = WORST; /* Tentatively. */
3636 nextchar(pRExC_state);
3637 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3639 latest = regpiece(pRExC_state, &flags);
3640 if (latest == NULL) {
3641 if (flags & TRYAGAIN)
3645 else if (ret == NULL)
3647 *flagp |= flags&HASWIDTH;
3648 if (chain == NULL) /* First piece. */
3649 *flagp |= flags&SPSTART;
3652 regtail(pRExC_state, chain, latest);
3657 if (chain == NULL) { /* Loop ran zero times. */
3658 chain = reg_node(pRExC_state, NOTHING);
3663 *flagp |= flags&SIMPLE;
3670 - regpiece - something followed by possible [*+?]
3672 * Note that the branching code sequences used for ? and the general cases
3673 * of * and + are somewhat optimized: they use the same NOTHING node as
3674 * both the endmarker for their branch list and the body of the last branch.
3675 * It might seem that this node could be dispensed with entirely, but the
3676 * endmarker role is not redundant.
3679 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3682 register regnode *ret;
3684 register char *next;
3686 const char * const origparse = RExC_parse;
3689 I32 max = REG_INFTY;
3692 ret = regatom(pRExC_state, &flags);
3694 if (flags & TRYAGAIN)
3701 if (op == '{' && regcurly(RExC_parse)) {
3702 parse_start = RExC_parse; /* MJD */
3703 next = RExC_parse + 1;
3705 while (isDIGIT(*next) || *next == ',') {
3714 if (*next == '}') { /* got one */
3718 min = atoi(RExC_parse);
3722 maxpos = RExC_parse;
3724 if (!max && *maxpos != '0')
3725 max = REG_INFTY; /* meaning "infinity" */
3726 else if (max >= REG_INFTY)
3727 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3729 nextchar(pRExC_state);
3732 if ((flags&SIMPLE)) {
3733 RExC_naughty += 2 + RExC_naughty / 2;
3734 reginsert(pRExC_state, CURLY, ret);
3735 Set_Node_Offset(ret, parse_start+1); /* MJD */
3736 Set_Node_Cur_Length(ret);
3739 regnode *w = reg_node(pRExC_state, WHILEM);
3742 regtail(pRExC_state, ret, w);
3743 if (!SIZE_ONLY && RExC_extralen) {
3744 reginsert(pRExC_state, LONGJMP,ret);
3745 reginsert(pRExC_state, NOTHING,ret);
3746 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3748 reginsert(pRExC_state, CURLYX,ret);
3750 Set_Node_Offset(ret, parse_start+1);
3751 Set_Node_Length(ret,
3752 op == '{' ? (RExC_parse - parse_start) : 1);
3754 if (!SIZE_ONLY && RExC_extralen)
3755 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3756 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3758 RExC_whilem_seen++, RExC_extralen += 3;
3759 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3767 if (max && max < min)
3768 vFAIL("Can't do {n,m} with n > m");
3770 ARG1_SET(ret, (U16)min);
3771 ARG2_SET(ret, (U16)max);
3783 #if 0 /* Now runtime fix should be reliable. */
3785 /* if this is reinstated, don't forget to put this back into perldiag:
3787 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3789 (F) The part of the regexp subject to either the * or + quantifier
3790 could match an empty string. The {#} shows in the regular
3791 expression about where the problem was discovered.
3795 if (!(flags&HASWIDTH) && op != '?')
3796 vFAIL("Regexp *+ operand could be empty");
3799 parse_start = RExC_parse;
3800 nextchar(pRExC_state);
3802 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3804 if (op == '*' && (flags&SIMPLE)) {
3805 reginsert(pRExC_state, STAR, ret);
3809 else if (op == '*') {
3813 else if (op == '+' && (flags&SIMPLE)) {
3814 reginsert(pRExC_state, PLUS, ret);
3818 else if (op == '+') {
3822 else if (op == '?') {
3827 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3829 "%.*s matches null string many times",
3830 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
3834 if (*RExC_parse == '?') {
3835 nextchar(pRExC_state);
3836 reginsert(pRExC_state, MINMOD, ret);
3837 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3839 if (ISMULT2(RExC_parse)) {
3841 vFAIL("Nested quantifiers");
3848 - regatom - the lowest level
3850 * Optimization: gobbles an entire sequence of ordinary characters so that
3851 * it can turn them into a single node, which is smaller to store and
3852 * faster to run. Backslashed characters are exceptions, each becoming a
3853 * separate node; the code is simpler that way and it's not worth fixing.
3855 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3857 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3860 register regnode *ret = NULL;
3862 char *parse_start = RExC_parse;
3864 *flagp = WORST; /* Tentatively. */
3867 switch (*RExC_parse) {
3869 RExC_seen_zerolen++;
3870 nextchar(pRExC_state);
3871 if (RExC_flags & PMf_MULTILINE)
3872 ret = reg_node(pRExC_state, MBOL);
3873 else if (RExC_flags & PMf_SINGLELINE)
3874 ret = reg_node(pRExC_state, SBOL);
3876 ret = reg_node(pRExC_state, BOL);
3877 Set_Node_Length(ret, 1); /* MJD */
3880 nextchar(pRExC_state);
3882 RExC_seen_zerolen++;
3883 if (RExC_flags & PMf_MULTILINE)
3884 ret = reg_node(pRExC_state, MEOL);
3885 else if (RExC_flags & PMf_SINGLELINE)
3886 ret = reg_node(pRExC_state, SEOL);
3888 ret = reg_node(pRExC_state, EOL);
3889 Set_Node_Length(ret, 1); /* MJD */
3892 nextchar(pRExC_state);
3893 if (RExC_flags & PMf_SINGLELINE)
3894 ret = reg_node(pRExC_state, SANY);
3896 ret = reg_node(pRExC_state, REG_ANY);
3897 *flagp |= HASWIDTH|SIMPLE;
3899 Set_Node_Length(ret, 1); /* MJD */
3903 char *oregcomp_parse = ++RExC_parse;
3904 ret = regclass(pRExC_state);
3905 if (*RExC_parse != ']') {
3906 RExC_parse = oregcomp_parse;
3907 vFAIL("Unmatched [");
3909 nextchar(pRExC_state);
3910 *flagp |= HASWIDTH|SIMPLE;
3911 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3915 nextchar(pRExC_state);
3916 ret = reg(pRExC_state, 1, &flags);
3918 if (flags & TRYAGAIN) {
3919 if (RExC_parse == RExC_end) {
3920 /* Make parent create an empty node if needed. */
3928 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3932 if (flags & TRYAGAIN) {
3936 vFAIL("Internal urp");
3937 /* Supposed to be caught earlier. */
3940 if (!regcurly(RExC_parse)) {
3949 vFAIL("Quantifier follows nothing");
3952 switch (*++RExC_parse) {
3954 RExC_seen_zerolen++;
3955 ret = reg_node(pRExC_state, SBOL);
3957 nextchar(pRExC_state);
3958 Set_Node_Length(ret, 2); /* MJD */
3961 ret = reg_node(pRExC_state, GPOS);
3962 RExC_seen |= REG_SEEN_GPOS;
3964 nextchar(pRExC_state);
3965 Set_Node_Length(ret, 2); /* MJD */
3968 ret = reg_node(pRExC_state, SEOL);
3970 RExC_seen_zerolen++; /* Do not optimize RE away */
3971 nextchar(pRExC_state);
3974 ret = reg_node(pRExC_state, EOS);
3976 RExC_seen_zerolen++; /* Do not optimize RE away */
3977 nextchar(pRExC_state);
3978 Set_Node_Length(ret, 2); /* MJD */
3981 ret = reg_node(pRExC_state, CANY);
3982 RExC_seen |= REG_SEEN_CANY;
3983 *flagp |= HASWIDTH|SIMPLE;
3984 nextchar(pRExC_state);
3985 Set_Node_Length(ret, 2); /* MJD */
3988 ret = reg_node(pRExC_state, CLUMP);
3990 nextchar(pRExC_state);
3991 Set_Node_Length(ret, 2); /* MJD */
3994 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
3995 *flagp |= HASWIDTH|SIMPLE;
3996 nextchar(pRExC_state);
3997 Set_Node_Length(ret, 2); /* MJD */
4000 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4001 *flagp |= HASWIDTH|SIMPLE;
4002 nextchar(pRExC_state);
4003 Set_Node_Length(ret, 2); /* MJD */
4006 RExC_seen_zerolen++;
4007 RExC_seen |= REG_SEEN_LOOKBEHIND;
4008 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4010 nextchar(pRExC_state);
4011 Set_Node_Length(ret, 2); /* MJD */
4014 RExC_seen_zerolen++;
4015 RExC_seen |= REG_SEEN_LOOKBEHIND;
4016 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4018 nextchar(pRExC_state);
4019 Set_Node_Length(ret, 2); /* MJD */
4022 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4023 *flagp |= HASWIDTH|SIMPLE;
4024 nextchar(pRExC_state);
4025 Set_Node_Length(ret, 2); /* MJD */
4028 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4029 *flagp |= HASWIDTH|SIMPLE;
4030 nextchar(pRExC_state);
4031 Set_Node_Length(ret, 2); /* MJD */
4034 ret = reg_node(pRExC_state, DIGIT);
4035 *flagp |= HASWIDTH|SIMPLE;
4036 nextchar(pRExC_state);
4037 Set_Node_Length(ret, 2); /* MJD */
4040 ret = reg_node(pRExC_state, NDIGIT);
4041 *flagp |= HASWIDTH|SIMPLE;
4042 nextchar(pRExC_state);
4043 Set_Node_Length(ret, 2); /* MJD */
4048 char* oldregxend = RExC_end;
4049 char* parse_start = RExC_parse - 2;
4051 if (RExC_parse[1] == '{') {
4052 /* a lovely hack--pretend we saw [\pX] instead */
4053 RExC_end = strchr(RExC_parse, '}');
4055 U8 c = (U8)*RExC_parse;
4057 RExC_end = oldregxend;
4058 vFAIL2("Missing right brace on \\%c{}", c);
4063 RExC_end = RExC_parse + 2;
4064 if (RExC_end > oldregxend)
4065 RExC_end = oldregxend;
4069 ret = regclass(pRExC_state);
4071 RExC_end = oldregxend;
4074 Set_Node_Offset(ret, parse_start + 2);
4075 Set_Node_Cur_Length(ret);
4076 nextchar(pRExC_state);
4077 *flagp |= HASWIDTH|SIMPLE;
4090 case '1': case '2': case '3': case '4':
4091 case '5': case '6': case '7': case '8': case '9':
4093 const I32 num = atoi(RExC_parse);
4095 if (num > 9 && num >= RExC_npar)
4098 char * parse_start = RExC_parse - 1; /* MJD */
4099 while (isDIGIT(*RExC_parse))
4102 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4103 vFAIL("Reference to nonexistent group");
4105 ret = reganode(pRExC_state,
4106 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4110 /* override incorrect value set in reganode MJD */
4111 Set_Node_Offset(ret, parse_start+1);
4112 Set_Node_Cur_Length(ret); /* MJD */
4114 nextchar(pRExC_state);
4119 if (RExC_parse >= RExC_end)
4120 FAIL("Trailing \\");
4123 /* Do not generate "unrecognized" warnings here, we fall
4124 back into the quick-grab loop below */
4131 if (RExC_flags & PMf_EXTENDED) {
4132 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4133 if (RExC_parse < RExC_end)
4139 register STRLEN len;
4144 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4146 parse_start = RExC_parse - 1;
4152 ret = reg_node(pRExC_state,
4153 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4155 for (len = 0, p = RExC_parse - 1;
4156 len < 127 && p < RExC_end;
4161 if (RExC_flags & PMf_EXTENDED)
4162 p = regwhite(p, RExC_end);
4209 ender = ASCII_TO_NATIVE('\033');
4213 ender = ASCII_TO_NATIVE('\007');
4218 char* const e = strchr(p, '}');
4222 vFAIL("Missing right brace on \\x{}");
4225 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4226 | PERL_SCAN_DISALLOW_PREFIX;
4227 STRLEN numlen = e - p - 1;
4228 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4235 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4237 ender = grok_hex(p, &numlen, &flags, NULL);
4243 ender = UCHARAT(p++);
4244 ender = toCTRL(ender);
4246 case '0': case '1': case '2': case '3':case '4':
4247 case '5': case '6': case '7': case '8':case '9':
4249 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4252 ender = grok_oct(p, &numlen, &flags, NULL);
4262 FAIL("Trailing \\");
4265 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4266 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4267 goto normal_default;
4272 if (UTF8_IS_START(*p) && UTF) {
4274 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4275 &numlen, UTF8_ALLOW_DEFAULT);
4282 if (RExC_flags & PMf_EXTENDED)
4283 p = regwhite(p, RExC_end);
4285 /* Prime the casefolded buffer. */
4286 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4288 if (ISMULT2(p)) { /* Back off on ?+*. */
4295 /* Emit all the Unicode characters. */
4297 for (foldbuf = tmpbuf;
4299 foldlen -= numlen) {
4300 ender = utf8_to_uvchr(foldbuf, &numlen);
4302 reguni(pRExC_state, ender, s, &unilen);
4305 /* In EBCDIC the numlen
4306 * and unilen can differ. */
4308 if (numlen >= foldlen)
4312 break; /* "Can't happen." */
4316 reguni(pRExC_state, ender, s, &unilen);
4325 REGC((char)ender, s++);
4333 /* Emit all the Unicode characters. */
4335 for (foldbuf = tmpbuf;
4337 foldlen -= numlen) {
4338 ender = utf8_to_uvchr(foldbuf, &numlen);
4340 reguni(pRExC_state, ender, s, &unilen);
4343 /* In EBCDIC the numlen
4344 * and unilen can differ. */
4346 if (numlen >= foldlen)
4354 reguni(pRExC_state, ender, s, &unilen);
4363 REGC((char)ender, s++);
4367 Set_Node_Cur_Length(ret); /* MJD */
4368 nextchar(pRExC_state);
4370 /* len is STRLEN which is unsigned, need to copy to signed */
4373 vFAIL("Internal disaster");
4377 if (len == 1 && UNI_IS_INVARIANT(ender))
4382 RExC_size += STR_SZ(len);
4384 RExC_emit += STR_SZ(len);
4389 /* If the encoding pragma is in effect recode the text of
4390 * any EXACT-kind nodes. */
4391 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4392 STRLEN oldlen = STR_LEN(ret);
4393 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4397 if (sv_utf8_downgrade(sv, TRUE)) {
4398 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4399 const STRLEN newlen = SvCUR(sv);
4404 GET_RE_DEBUG_FLAGS_DECL;
4405 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4406 (int)oldlen, STRING(ret),
4408 Copy(s, STRING(ret), newlen, char);
4409 STR_LEN(ret) += newlen - oldlen;
4410 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4412 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4420 S_regwhite(char *p, const char *e)
4425 else if (*p == '#') {
4428 } while (p < e && *p != '\n');
4436 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4437 Character classes ([:foo:]) can also be negated ([:^foo:]).
4438 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4439 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4440 but trigger failures because they are currently unimplemented. */
4442 #define POSIXCC_DONE(c) ((c) == ':')
4443 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4444 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4447 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4450 I32 namedclass = OOB_NAMEDCLASS;
4452 if (value == '[' && RExC_parse + 1 < RExC_end &&
4453 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4454 POSIXCC(UCHARAT(RExC_parse))) {
4455 const char c = UCHARAT(RExC_parse);
4456 char* s = RExC_parse++;
4458 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4460 if (RExC_parse == RExC_end)
4461 /* Grandfather lone [:, [=, [. */
4464 const char* t = RExC_parse++; /* skip over the c */
4465 const char *posixcc;
4469 if (UCHARAT(RExC_parse) == ']') {
4470 RExC_parse++; /* skip over the ending ] */
4473 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4474 const I32 skip = t - posixcc;
4476 /* Initially switch on the length of the name. */
4479 if (memEQ(posixcc, "word", 4)) {
4480 /* this is not POSIX, this is the Perl \w */;
4482 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4486 /* Names all of length 5. */
4487 /* alnum alpha ascii blank cntrl digit graph lower
4488 print punct space upper */
4489 /* Offset 4 gives the best switch position. */
4490 switch (posixcc[4]) {
4492 if (memEQ(posixcc, "alph", 4)) {
4495 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4499 if (memEQ(posixcc, "spac", 4)) {
4502 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4506 if (memEQ(posixcc, "grap", 4)) {
4509 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4513 if (memEQ(posixcc, "asci", 4)) {
4516 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4520 if (memEQ(posixcc, "blan", 4)) {
4523 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4527 if (memEQ(posixcc, "cntr", 4)) {
4530 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4534 if (memEQ(posixcc, "alnu", 4)) {
4537 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4541 if (memEQ(posixcc, "lowe", 4)) {
4544 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4546 if (memEQ(posixcc, "uppe", 4)) {
4549 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4553 if (memEQ(posixcc, "digi", 4)) {
4556 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4558 if (memEQ(posixcc, "prin", 4)) {
4561 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4563 if (memEQ(posixcc, "punc", 4)) {
4566 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4572 if (memEQ(posixcc, "xdigit", 6)) {
4574 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4579 if (namedclass == OOB_NAMEDCLASS)
4581 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4584 assert (posixcc[skip] == ':');
4585 assert (posixcc[skip+1] == ']');
4586 } else if (!SIZE_ONLY) {
4587 /* [[=foo=]] and [[.foo.]] are still future. */
4589 /* adjust RExC_parse so the warning shows after
4591 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4593 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4596 /* Maternal grandfather:
4597 * "[:" ending in ":" but not in ":]" */
4607 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4610 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4611 const char *s = RExC_parse;
4612 const char c = *s++;
4614 while(*s && isALNUM(*s))
4616 if (*s && c == *s && s[1] == ']') {
4617 if (ckWARN(WARN_REGEXP))
4619 "POSIX syntax [%c %c] belongs inside character classes",
4622 /* [[=foo=]] and [[.foo.]] are still future. */
4623 if (POSIXCC_NOTYET(c)) {
4624 /* adjust RExC_parse so the error shows after
4626 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4628 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4635 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4639 register UV nextvalue;
4640 register IV prevvalue = OOB_UNICODE;
4641 register IV range = 0;
4642 register regnode *ret;
4645 char *rangebegin = NULL;
4646 bool need_class = 0;
4650 bool optimize_invert = TRUE;
4651 AV* unicode_alternate = NULL;
4653 UV literal_endpoint = 0;
4656 ret = reganode(pRExC_state, ANYOF, 0);
4659 ANYOF_FLAGS(ret) = 0;
4661 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4665 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4669 RExC_size += ANYOF_SKIP;
4671 RExC_emit += ANYOF_SKIP;
4673 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4675 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4676 ANYOF_BITMAP_ZERO(ret);
4677 listsv = newSVpvs("# comment\n");
4680 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4682 if (!SIZE_ONLY && POSIXCC(nextvalue))
4683 checkposixcc(pRExC_state);
4685 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4686 if (UCHARAT(RExC_parse) == ']')
4689 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4693 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4696 rangebegin = RExC_parse;
4698 value = utf8n_to_uvchr((U8*)RExC_parse,
4699 RExC_end - RExC_parse,
4700 &numlen, UTF8_ALLOW_DEFAULT);
4701 RExC_parse += numlen;
4704 value = UCHARAT(RExC_parse++);
4705 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4706 if (value == '[' && POSIXCC(nextvalue))
4707 namedclass = regpposixcc(pRExC_state, value);
4708 else if (value == '\\') {
4710 value = utf8n_to_uvchr((U8*)RExC_parse,
4711 RExC_end - RExC_parse,
4712 &numlen, UTF8_ALLOW_DEFAULT);
4713 RExC_parse += numlen;
4716 value = UCHARAT(RExC_parse++);
4717 /* Some compilers cannot handle switching on 64-bit integer
4718 * values, therefore value cannot be an UV. Yes, this will
4719 * be a problem later if we want switch on Unicode.
4720 * A similar issue a little bit later when switching on
4721 * namedclass. --jhi */
4722 switch ((I32)value) {
4723 case 'w': namedclass = ANYOF_ALNUM; break;
4724 case 'W': namedclass = ANYOF_NALNUM; break;
4725 case 's': namedclass = ANYOF_SPACE; break;
4726 case 'S': namedclass = ANYOF_NSPACE; break;
4727 case 'd': namedclass = ANYOF_DIGIT; break;
4728 case 'D': namedclass = ANYOF_NDIGIT; break;
4731 if (RExC_parse >= RExC_end)
4732 vFAIL2("Empty \\%c{}", (U8)value);
4733 if (*RExC_parse == '{') {
4734 const U8 c = (U8)value;
4735 e = strchr(RExC_parse++, '}');
4737 vFAIL2("Missing right brace on \\%c{}", c);
4738 while (isSPACE(UCHARAT(RExC_parse)))
4740 if (e == RExC_parse)
4741 vFAIL2("Empty \\%c{}", c);
4743 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4751 if (UCHARAT(RExC_parse) == '^') {
4754 value = value == 'p' ? 'P' : 'p'; /* toggle */
4755 while (isSPACE(UCHARAT(RExC_parse))) {
4761 Perl_sv_catpvf(aTHX_ listsv,
4762 "+utf8::%.*s\n", (int)n, RExC_parse);
4764 Perl_sv_catpvf(aTHX_ listsv,
4765 "!utf8::%.*s\n", (int)n, RExC_parse);
4768 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4769 namedclass = ANYOF_MAX; /* no official name, but it's named */
4771 case 'n': value = '\n'; break;
4772 case 'r': value = '\r'; break;
4773 case 't': value = '\t'; break;
4774 case 'f': value = '\f'; break;
4775 case 'b': value = '\b'; break;
4776 case 'e': value = ASCII_TO_NATIVE('\033');break;
4777 case 'a': value = ASCII_TO_NATIVE('\007');break;
4779 if (*RExC_parse == '{') {
4780 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4781 | PERL_SCAN_DISALLOW_PREFIX;
4782 e = strchr(RExC_parse++, '}');
4784 vFAIL("Missing right brace on \\x{}");
4786 numlen = e - RExC_parse;
4787 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4791 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4793 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4794 RExC_parse += numlen;
4798 value = UCHARAT(RExC_parse++);
4799 value = toCTRL(value);
4801 case '0': case '1': case '2': case '3': case '4':
4802 case '5': case '6': case '7': case '8': case '9':
4806 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4807 RExC_parse += numlen;
4811 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4813 "Unrecognized escape \\%c in character class passed through",
4817 } /* end of \blah */
4823 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4825 if (!SIZE_ONLY && !need_class)
4826 ANYOF_CLASS_ZERO(ret);
4830 /* a bad range like a-\d, a-[:digit:] ? */
4833 if (ckWARN(WARN_REGEXP)) {
4835 RExC_parse >= rangebegin ?
4836 RExC_parse - rangebegin : 0;
4838 "False [] range \"%*.*s\"",
4843 if (prevvalue < 256) {
4844 ANYOF_BITMAP_SET(ret, prevvalue);
4845 ANYOF_BITMAP_SET(ret, '-');
4848 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4849 Perl_sv_catpvf(aTHX_ listsv,
4850 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4854 range = 0; /* this was not a true range */
4858 const char *what = NULL;
4861 if (namedclass > OOB_NAMEDCLASS)
4862 optimize_invert = FALSE;
4863 /* Possible truncation here but in some 64-bit environments
4864 * the compiler gets heartburn about switch on 64-bit values.
4865 * A similar issue a little earlier when switching on value.
4867 switch ((I32)namedclass) {
4870 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4872 for (value = 0; value < 256; value++)
4874 ANYOF_BITMAP_SET(ret, value);
4881 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4883 for (value = 0; value < 256; value++)
4884 if (!isALNUM(value))
4885 ANYOF_BITMAP_SET(ret, value);
4892 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4894 for (value = 0; value < 256; value++)
4895 if (isALNUMC(value))
4896 ANYOF_BITMAP_SET(ret, value);
4903 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4905 for (value = 0; value < 256; value++)
4906 if (!isALNUMC(value))
4907 ANYOF_BITMAP_SET(ret, value);
4914 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4916 for (value = 0; value < 256; value++)
4918 ANYOF_BITMAP_SET(ret, value);
4925 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4927 for (value = 0; value < 256; value++)
4928 if (!isALPHA(value))
4929 ANYOF_BITMAP_SET(ret, value);
4936 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4939 for (value = 0; value < 128; value++)
4940 ANYOF_BITMAP_SET(ret, value);
4942 for (value = 0; value < 256; value++) {
4944 ANYOF_BITMAP_SET(ret, value);
4953 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4956 for (value = 128; value < 256; value++)
4957 ANYOF_BITMAP_SET(ret, value);
4959 for (value = 0; value < 256; value++) {
4960 if (!isASCII(value))
4961 ANYOF_BITMAP_SET(ret, value);
4970 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4972 for (value = 0; value < 256; value++)
4974 ANYOF_BITMAP_SET(ret, value);
4981 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4983 for (value = 0; value < 256; value++)
4984 if (!isBLANK(value))
4985 ANYOF_BITMAP_SET(ret, value);
4992 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
4994 for (value = 0; value < 256; value++)
4996 ANYOF_BITMAP_SET(ret, value);
5003 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5005 for (value = 0; value < 256; value++)
5006 if (!isCNTRL(value))
5007 ANYOF_BITMAP_SET(ret, value);
5014 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5016 /* consecutive digits assumed */
5017 for (value = '0'; value <= '9'; value++)
5018 ANYOF_BITMAP_SET(ret, value);
5025 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5027 /* consecutive digits assumed */
5028 for (value = 0; value < '0'; value++)
5029 ANYOF_BITMAP_SET(ret, value);
5030 for (value = '9' + 1; value < 256; value++)
5031 ANYOF_BITMAP_SET(ret, value);
5038 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5040 for (value = 0; value < 256; value++)
5042 ANYOF_BITMAP_SET(ret, value);
5049 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5051 for (value = 0; value < 256; value++)
5052 if (!isGRAPH(value))
5053 ANYOF_BITMAP_SET(ret, value);
5060 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5062 for (value = 0; value < 256; value++)
5064 ANYOF_BITMAP_SET(ret, value);
5071 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5073 for (value = 0; value < 256; value++)
5074 if (!isLOWER(value))
5075 ANYOF_BITMAP_SET(ret, value);
5082 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5084 for (value = 0; value < 256; value++)
5086 ANYOF_BITMAP_SET(ret, value);
5093 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5095 for (value = 0; value < 256; value++)
5096 if (!isPRINT(value))
5097 ANYOF_BITMAP_SET(ret, value);
5104 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5106 for (value = 0; value < 256; value++)
5107 if (isPSXSPC(value))
5108 ANYOF_BITMAP_SET(ret, value);
5115 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5117 for (value = 0; value < 256; value++)
5118 if (!isPSXSPC(value))
5119 ANYOF_BITMAP_SET(ret, value);
5126 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5128 for (value = 0; value < 256; value++)
5130 ANYOF_BITMAP_SET(ret, value);
5137 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5139 for (value = 0; value < 256; value++)
5140 if (!isPUNCT(value))
5141 ANYOF_BITMAP_SET(ret, value);
5148 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5150 for (value = 0; value < 256; value++)
5152 ANYOF_BITMAP_SET(ret, value);
5159 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5161 for (value = 0; value < 256; value++)
5162 if (!isSPACE(value))
5163 ANYOF_BITMAP_SET(ret, value);
5170 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5172 for (value = 0; value < 256; value++)
5174 ANYOF_BITMAP_SET(ret, value);
5181 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5183 for (value = 0; value < 256; value++)
5184 if (!isUPPER(value))
5185 ANYOF_BITMAP_SET(ret, value);
5192 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5194 for (value = 0; value < 256; value++)
5195 if (isXDIGIT(value))
5196 ANYOF_BITMAP_SET(ret, value);
5203 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5205 for (value = 0; value < 256; value++)
5206 if (!isXDIGIT(value))
5207 ANYOF_BITMAP_SET(ret, value);
5213 /* this is to handle \p and \P */
5216 vFAIL("Invalid [::] class");
5220 /* Strings such as "+utf8::isWord\n" */
5221 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5224 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5227 } /* end of namedclass \blah */
5230 if (prevvalue > (IV)value) /* b-a */ {
5231 const int w = RExC_parse - rangebegin;
5232 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5233 range = 0; /* not a valid range */
5237 prevvalue = value; /* save the beginning of the range */
5238 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5239 RExC_parse[1] != ']') {
5242 /* a bad range like \w-, [:word:]- ? */
5243 if (namedclass > OOB_NAMEDCLASS) {
5244 if (ckWARN(WARN_REGEXP)) {
5246 RExC_parse >= rangebegin ?
5247 RExC_parse - rangebegin : 0;
5249 "False [] range \"%*.*s\"",
5255 ANYOF_BITMAP_SET(ret, '-');
5257 range = 1; /* yeah, it's a range! */
5258 continue; /* but do it the next time */
5262 /* now is the next time */
5266 if (prevvalue < 256) {
5267 const IV ceilvalue = value < 256 ? value : 255;
5270 /* In EBCDIC [\x89-\x91] should include
5271 * the \x8e but [i-j] should not. */
5272 if (literal_endpoint == 2 &&
5273 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5274 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5276 if (isLOWER(prevvalue)) {
5277 for (i = prevvalue; i <= ceilvalue; i++)
5279 ANYOF_BITMAP_SET(ret, i);
5281 for (i = prevvalue; i <= ceilvalue; i++)
5283 ANYOF_BITMAP_SET(ret, i);
5288 for (i = prevvalue; i <= ceilvalue; i++)
5289 ANYOF_BITMAP_SET(ret, i);
5291 if (value > 255 || UTF) {
5292 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5293 const UV natvalue = NATIVE_TO_UNI(value);
5295 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5296 if (prevnatvalue < natvalue) { /* what about > ? */
5297 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5298 prevnatvalue, natvalue);
5300 else if (prevnatvalue == natvalue) {
5301 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5303 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5305 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5307 /* If folding and foldable and a single
5308 * character, insert also the folded version
5309 * to the charclass. */
5311 if (foldlen == (STRLEN)UNISKIP(f))
5312 Perl_sv_catpvf(aTHX_ listsv,
5315 /* Any multicharacter foldings
5316 * require the following transform:
5317 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5318 * where E folds into "pq" and F folds
5319 * into "rst", all other characters
5320 * fold to single characters. We save
5321 * away these multicharacter foldings,
5322 * to be later saved as part of the
5323 * additional "s" data. */
5326 if (!unicode_alternate)
5327 unicode_alternate = newAV();
5328 sv = newSVpvn((char*)foldbuf, foldlen);
5330 av_push(unicode_alternate, sv);
5334 /* If folding and the value is one of the Greek
5335 * sigmas insert a few more sigmas to make the
5336 * folding rules of the sigmas to work right.
5337 * Note that not all the possible combinations
5338 * are handled here: some of them are handled
5339 * by the standard folding rules, and some of
5340 * them (literal or EXACTF cases) are handled
5341 * during runtime in regexec.c:S_find_byclass(). */
5342 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5343 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5344 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5345 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5346 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5348 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5349 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5350 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5355 literal_endpoint = 0;
5359 range = 0; /* this range (if it was one) is done now */
5363 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5365 RExC_size += ANYOF_CLASS_ADD_SKIP;
5367 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5370 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5372 /* If the only flag is folding (plus possibly inversion). */
5373 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5375 for (value = 0; value < 256; ++value) {
5376 if (ANYOF_BITMAP_TEST(ret, value)) {
5377 UV fold = PL_fold[value];
5380 ANYOF_BITMAP_SET(ret, fold);
5383 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5386 /* optimize inverted simple patterns (e.g. [^a-z]) */
5387 if (!SIZE_ONLY && optimize_invert &&
5388 /* If the only flag is inversion. */
5389 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5390 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5391 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5392 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5399 /* The 0th element stores the character class description
5400 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5401 * to initialize the appropriate swash (which gets stored in
5402 * the 1st element), and also useful for dumping the regnode.
5403 * The 2nd element stores the multicharacter foldings,
5404 * used later (regexec.c:S_reginclass()). */
5405 av_store(av, 0, listsv);
5406 av_store(av, 1, NULL);
5407 av_store(av, 2, (SV*)unicode_alternate);
5408 rv = newRV_noinc((SV*)av);
5409 n = add_data(pRExC_state, 1, "s");
5410 RExC_rx->data->data[n] = (void*)rv;
5418 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5420 char* retval = RExC_parse++;
5423 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5424 RExC_parse[2] == '#') {
5425 while (*RExC_parse != ')') {
5426 if (RExC_parse == RExC_end)
5427 FAIL("Sequence (?#... not terminated");
5433 if (RExC_flags & PMf_EXTENDED) {
5434 if (isSPACE(*RExC_parse)) {
5438 else if (*RExC_parse == '#') {
5439 while (RExC_parse < RExC_end)
5440 if (*RExC_parse++ == '\n') break;
5449 - reg_node - emit a node
5451 STATIC regnode * /* Location. */
5452 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5455 register regnode *ptr;
5456 regnode * const ret = RExC_emit;
5459 SIZE_ALIGN(RExC_size);
5464 NODE_ALIGN_FILL(ret);
5466 FILL_ADVANCE_NODE(ptr, op);
5467 if (RExC_offsets) { /* MJD */
5468 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5469 "reg_node", __LINE__,
5471 RExC_emit - RExC_emit_start > RExC_offsets[0]
5472 ? "Overwriting end of array!\n" : "OK",
5473 RExC_emit - RExC_emit_start,
5474 RExC_parse - RExC_start,
5476 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5485 - reganode - emit a node with an argument
5487 STATIC regnode * /* Location. */
5488 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5491 register regnode *ptr;
5492 regnode * const ret = RExC_emit;
5495 SIZE_ALIGN(RExC_size);
5500 NODE_ALIGN_FILL(ret);
5502 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5503 if (RExC_offsets) { /* MJD */
5504 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5508 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5509 "Overwriting end of array!\n" : "OK",
5510 RExC_emit - RExC_emit_start,
5511 RExC_parse - RExC_start,
5513 Set_Cur_Node_Offset;
5522 - reguni - emit (if appropriate) a Unicode character
5525 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5528 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5532 - reginsert - insert an operator in front of already-emitted operand
5534 * Means relocating the operand.
5537 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5540 register regnode *src;
5541 register regnode *dst;
5542 register regnode *place;
5543 const int offset = regarglen[(U8)op];
5545 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5548 RExC_size += NODE_STEP_REGNODE + offset;
5553 RExC_emit += NODE_STEP_REGNODE + offset;
5555 while (src > opnd) {
5556 StructCopy(--src, --dst, regnode);
5557 if (RExC_offsets) { /* MJD 20010112 */
5558 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5562 dst - RExC_emit_start > RExC_offsets[0]
5563 ? "Overwriting end of array!\n" : "OK",
5564 src - RExC_emit_start,
5565 dst - RExC_emit_start,
5567 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5568 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5573 place = opnd; /* Op node, where operand used to be. */
5574 if (RExC_offsets) { /* MJD */
5575 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5579 place - RExC_emit_start > RExC_offsets[0]
5580 ? "Overwriting end of array!\n" : "OK",
5581 place - RExC_emit_start,
5582 RExC_parse - RExC_start,
5584 Set_Node_Offset(place, RExC_parse);
5585 Set_Node_Length(place, 1);
5587 src = NEXTOPER(place);
5588 FILL_ADVANCE_NODE(place, op);
5589 Zero(src, offset, regnode);
5593 - regtail - set the next-pointer at the end of a node chain of p to val.
5596 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5599 register regnode *scan;
5604 /* Find last node. */
5607 regnode * const temp = regnext(scan);
5613 if (reg_off_by_arg[OP(scan)]) {
5614 ARG_SET(scan, val - scan);
5617 NEXT_OFF(scan) = val - scan;
5622 - regoptail - regtail on operand of first argument; nop if operandless
5625 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5628 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5629 if (p == NULL || SIZE_ONLY)
5631 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5632 regtail(pRExC_state, NEXTOPER(p), val);
5634 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5635 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5642 - regcurly - a little FSA that accepts {\d+,?\d*}
5645 S_regcurly(register const char *s)
5664 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5667 Perl_regdump(pTHX_ regexp *r)
5671 SV * const sv = sv_newmortal();
5673 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
5675 /* Header fields of interest. */
5676 if (r->anchored_substr)
5677 PerlIO_printf(Perl_debug_log,
5678 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5680 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5681 SvPVX_const(r->anchored_substr),
5683 SvTAIL(r->anchored_substr) ? "$" : "",
5684 (IV)r->anchored_offset);
5685 else if (r->anchored_utf8)
5686 PerlIO_printf(Perl_debug_log,
5687 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5689 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5690 SvPVX_const(r->anchored_utf8),
5692 SvTAIL(r->anchored_utf8) ? "$" : "",
5693 (IV)r->anchored_offset);
5694 if (r->float_substr)
5695 PerlIO_printf(Perl_debug_log,
5696 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5698 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5699 SvPVX_const(r->float_substr),
5701 SvTAIL(r->float_substr) ? "$" : "",
5702 (IV)r->float_min_offset, (UV)r->float_max_offset);
5703 else if (r->float_utf8)
5704 PerlIO_printf(Perl_debug_log,
5705 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5707 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5708 SvPVX_const(r->float_utf8),
5710 SvTAIL(r->float_utf8) ? "$" : "",
5711 (IV)r->float_min_offset, (UV)r->float_max_offset);
5712 if (r->check_substr || r->check_utf8)
5713 PerlIO_printf(Perl_debug_log,
5714 r->check_substr == r->float_substr
5715 && r->check_utf8 == r->float_utf8
5716 ? "(checking floating" : "(checking anchored");
5717 if (r->reganch & ROPT_NOSCAN)
5718 PerlIO_printf(Perl_debug_log, " noscan");
5719 if (r->reganch & ROPT_CHECK_ALL)
5720 PerlIO_printf(Perl_debug_log, " isall");
5721 if (r->check_substr || r->check_utf8)
5722 PerlIO_printf(Perl_debug_log, ") ");
5724 if (r->regstclass) {
5725 regprop(sv, r->regstclass);
5726 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5728 if (r->reganch & ROPT_ANCH) {
5729 PerlIO_printf(Perl_debug_log, "anchored");
5730 if (r->reganch & ROPT_ANCH_BOL)
5731 PerlIO_printf(Perl_debug_log, "(BOL)");
5732 if (r->reganch & ROPT_ANCH_MBOL)
5733 PerlIO_printf(Perl_debug_log, "(MBOL)");
5734 if (r->reganch & ROPT_ANCH_SBOL)
5735 PerlIO_printf(Perl_debug_log, "(SBOL)");
5736 if (r->reganch & ROPT_ANCH_GPOS)
5737 PerlIO_printf(Perl_debug_log, "(GPOS)");
5738 PerlIO_putc(Perl_debug_log, ' ');
5740 if (r->reganch & ROPT_GPOS_SEEN)
5741 PerlIO_printf(Perl_debug_log, "GPOS ");
5742 if (r->reganch & ROPT_SKIP)
5743 PerlIO_printf(Perl_debug_log, "plus ");
5744 if (r->reganch & ROPT_IMPLICIT)
5745 PerlIO_printf(Perl_debug_log, "implicit ");
5746 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5747 if (r->reganch & ROPT_EVAL_SEEN)
5748 PerlIO_printf(Perl_debug_log, "with eval ");
5749 PerlIO_printf(Perl_debug_log, "\n");
5751 const U32 len = r->offsets[0];
5752 GET_RE_DEBUG_FLAGS_DECL;
5755 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5756 for (i = 1; i <= len; i++)
5757 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5758 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5759 PerlIO_printf(Perl_debug_log, "\n");
5763 PERL_UNUSED_CONTEXT;
5765 #endif /* DEBUGGING */
5769 - regprop - printable representation of opcode
5772 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5778 sv_setpvn(sv, "", 0);
5779 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5780 /* It would be nice to FAIL() here, but this may be called from
5781 regexec.c, and it would be hard to supply pRExC_state. */
5782 Perl_croak(aTHX_ "Corrupted regexp opcode");
5783 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5785 k = PL_regkind[(U8)OP(o)];
5788 SV * const dsv = sv_2mortal(newSVpvs(""));
5789 /* Using is_utf8_string() is a crude hack but it may
5790 * be the best for now since we have no flag "this EXACTish
5791 * node was UTF-8" --jhi */
5792 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5793 const char * const s = do_utf8 ?
5794 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5795 UNI_DISPLAY_REGEX) :
5797 const int len = do_utf8 ?
5800 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5804 } else if (k == TRIE) {
5806 /* print the details od the trie in dumpuntil instead, as
5807 * prog->data isn't available here */
5808 } else if (k == CURLY) {
5809 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5810 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5811 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5813 else if (k == WHILEM && o->flags) /* Ordinal/of */
5814 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5815 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5816 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5817 else if (k == LOGICAL)
5818 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5819 else if (k == ANYOF) {
5820 int i, rangestart = -1;
5821 const U8 flags = ANYOF_FLAGS(o);
5823 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5824 static const char * const anyofs[] = {
5857 if (flags & ANYOF_LOCALE)
5858 sv_catpvs(sv, "{loc}");
5859 if (flags & ANYOF_FOLD)
5860 sv_catpvs(sv, "{i}");
5861 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5862 if (flags & ANYOF_INVERT)
5864 for (i = 0; i <= 256; i++) {
5865 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5866 if (rangestart == -1)
5868 } else if (rangestart != -1) {
5869 if (i <= rangestart + 3)
5870 for (; rangestart < i; rangestart++)
5871 put_byte(sv, rangestart);
5873 put_byte(sv, rangestart);
5875 put_byte(sv, i - 1);
5881 if (o->flags & ANYOF_CLASS)
5882 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5883 if (ANYOF_CLASS_TEST(o,i))
5884 sv_catpv(sv, anyofs[i]);
5886 if (flags & ANYOF_UNICODE)
5887 sv_catpvs(sv, "{unicode}");
5888 else if (flags & ANYOF_UNICODE_ALL)
5889 sv_catpvs(sv, "{unicode_all}");
5893 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
5897 U8 s[UTF8_MAXBYTES_CASE+1];
5899 for (i = 0; i <= 256; i++) { /* just the first 256 */
5900 uvchr_to_utf8(s, i);
5902 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5903 if (rangestart == -1)
5905 } else if (rangestart != -1) {
5906 if (i <= rangestart + 3)
5907 for (; rangestart < i; rangestart++) {
5908 const U8 * const e = uvchr_to_utf8(s,rangestart);
5910 for(p = s; p < e; p++)
5914 const U8 *e = uvchr_to_utf8(s,rangestart);
5916 for (p = s; p < e; p++)
5919 e = uvchr_to_utf8(s, i-1);
5920 for (p = s; p < e; p++)
5927 sv_catpvs(sv, "..."); /* et cetera */
5931 char *s = savesvpv(lv);
5932 char * const origs = s;
5934 while(*s && *s != '\n') s++;
5937 const char * const t = ++s;
5955 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5957 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5958 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5960 PERL_UNUSED_CONTEXT;
5961 PERL_UNUSED_ARG(sv);
5963 #endif /* DEBUGGING */
5967 Perl_re_intuit_string(pTHX_ regexp *prog)
5968 { /* Assume that RE_INTUIT is set */
5970 GET_RE_DEBUG_FLAGS_DECL;
5971 PERL_UNUSED_CONTEXT;
5975 const char * const s = SvPV_nolen_const(prog->check_substr
5976 ? prog->check_substr : prog->check_utf8);
5978 if (!PL_colorset) reginitcolors();
5979 PerlIO_printf(Perl_debug_log,
5980 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5982 prog->check_substr ? "" : "utf8 ",
5983 PL_colors[5],PL_colors[0],
5986 (strlen(s) > 60 ? "..." : ""));
5989 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5993 Perl_pregfree(pTHX_ struct regexp *r)
5997 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5998 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6002 if (!r || (--r->refcnt > 0))
6004 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6005 const char * const s = (r->reganch & ROPT_UTF8)
6006 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6007 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6008 const int len = SvCUR(dsv);
6011 PerlIO_printf(Perl_debug_log,
6012 "%sFreeing REx:%s %s%*.*s%s%s\n",
6013 PL_colors[4],PL_colors[5],PL_colors[0],
6016 len > 60 ? "..." : "");
6019 /* gcov results gave these as non-null 100% of the time, so there's no
6020 optimisation in checking them before calling Safefree */
6021 Safefree(r->precomp);
6022 Safefree(r->offsets); /* 20010421 MJD */
6023 RX_MATCH_COPY_FREE(r);
6024 #ifdef PERL_OLD_COPY_ON_WRITE
6026 SvREFCNT_dec(r->saved_copy);
6029 if (r->anchored_substr)
6030 SvREFCNT_dec(r->anchored_substr);
6031 if (r->anchored_utf8)
6032 SvREFCNT_dec(r->anchored_utf8);
6033 if (r->float_substr)
6034 SvREFCNT_dec(r->float_substr);
6036 SvREFCNT_dec(r->float_utf8);
6037 Safefree(r->substrs);
6040 int n = r->data->count;
6041 PAD* new_comppad = NULL;
6046 /* If you add a ->what type here, update the comment in regcomp.h */
6047 switch (r->data->what[n]) {
6049 SvREFCNT_dec((SV*)r->data->data[n]);
6052 Safefree(r->data->data[n]);
6055 new_comppad = (AV*)r->data->data[n];
6058 if (new_comppad == NULL)
6059 Perl_croak(aTHX_ "panic: pregfree comppad");
6060 PAD_SAVE_LOCAL(old_comppad,
6061 /* Watch out for global destruction's random ordering. */
6062 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6065 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6068 op_free((OP_4tree*)r->data->data[n]);
6070 PAD_RESTORE_LOCAL(old_comppad);
6071 SvREFCNT_dec((SV*)new_comppad);
6078 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6081 refcount = --trie->refcount;
6084 Safefree(trie->charmap);
6085 if (trie->widecharmap)
6086 SvREFCNT_dec((SV*)trie->widecharmap);
6087 Safefree(trie->states);
6088 Safefree(trie->trans);
6091 SvREFCNT_dec((SV*)trie->words);
6092 if (trie->revcharmap)
6093 SvREFCNT_dec((SV*)trie->revcharmap);
6095 Safefree(r->data->data[n]); /* do this last!!!! */
6100 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6103 Safefree(r->data->what);
6106 Safefree(r->startp);
6112 - regnext - dig the "next" pointer out of a node
6115 Perl_regnext(pTHX_ register regnode *p)
6118 register I32 offset;
6120 if (p == &PL_regdummy)
6123 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6131 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6134 STRLEN l1 = strlen(pat1);
6135 STRLEN l2 = strlen(pat2);
6138 const char *message;
6144 Copy(pat1, buf, l1 , char);
6145 Copy(pat2, buf + l1, l2 , char);
6146 buf[l1 + l2] = '\n';
6147 buf[l1 + l2 + 1] = '\0';
6149 /* ANSI variant takes additional second argument */
6150 va_start(args, pat2);
6154 msv = vmess(buf, &args);
6156 message = SvPV_const(msv,l1);
6159 Copy(message, buf, l1 , char);
6160 buf[l1-1] = '\0'; /* Overwrite \n */
6161 Perl_croak(aTHX_ "%s", buf);
6164 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6167 Perl_save_re_context(pTHX)
6171 struct re_save_state *state;
6173 SAVEVPTR(PL_curcop);
6174 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6176 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6177 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6178 SSPUSHINT(SAVEt_RE_STATE);
6180 Copy(&PL_reg_state, state, 1, struct re_save_state);
6182 PL_reg_start_tmp = 0;
6183 PL_reg_start_tmpl = 0;
6184 PL_reg_oldsaved = NULL;
6185 PL_reg_oldsavedlen = 0;
6187 PL_reg_leftiter = 0;
6188 PL_reg_poscache = NULL;
6189 PL_reg_poscache_size = 0;
6190 #ifdef PERL_OLD_COPY_ON_WRITE
6194 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6196 const REGEXP * const rx = PM_GETRE(PL_curpm);
6199 for (i = 1; i <= rx->nparens; i++) {
6200 char digits[TYPE_CHARS(long)];
6201 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6202 GV *const *const gvp
6203 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6206 GV * const gv = *gvp;
6207 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6216 clear_re(pTHX_ void *r)
6219 ReREFCNT_dec((regexp *)r);
6225 S_put_byte(pTHX_ SV *sv, int c)
6227 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6228 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6229 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6230 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6232 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6237 S_dumpuntil(pTHX_ regexp *r, regnode *start, regnode *node, regnode *last,
6241 register U8 op = EXACT; /* Arbitrary non-END op. */
6242 register regnode *next;
6244 while (op != END && (!last || node < last)) {
6245 /* While that wasn't END last time... */
6251 next = regnext(node);
6253 if (OP(node) == OPTIMIZED)
6256 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6257 (int)(2*l + 1), "", SvPVX_const(sv));
6258 if (next == NULL) /* Next ptr. */
6259 PerlIO_printf(Perl_debug_log, "(0)");
6261 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6262 (void)PerlIO_putc(Perl_debug_log, '\n');
6264 if (PL_regkind[(U8)op] == BRANCHJ) {
6265 register regnode *nnode = (OP(next) == LONGJMP
6268 if (last && nnode > last)
6270 node = dumpuntil(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6272 else if (PL_regkind[(U8)op] == BRANCH) {
6273 node = dumpuntil(r, start, NEXTOPER(node), next, sv, l + 1);
6275 else if ( PL_regkind[(U8)op] == TRIE ) {
6276 const I32 n = ARG(node);
6277 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6278 const I32 arry_len = av_len(trie->words)+1;
6280 PerlIO_printf(Perl_debug_log,
6281 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6285 (int)trie->charcount,
6286 trie->uniquecharcount,
6287 (IV)trie->laststate-1,
6288 node->flags ? " EVAL mode" : "");
6290 for (word_idx=0; word_idx < arry_len; word_idx++) {
6291 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6293 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6296 SvPV_nolen_const(*elem_ptr),
6301 PerlIO_printf(Perl_debug_log, "(0)\n");
6303 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6309 node = NEXTOPER(node);
6310 node += regarglen[(U8)op];
6313 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6314 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6315 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6317 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6318 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6321 else if ( op == PLUS || op == STAR) {
6322 node = dumpuntil(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6324 else if (op == ANYOF) {
6325 /* arglen 1 + class block */
6326 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6327 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6328 node = NEXTOPER(node);
6330 else if (PL_regkind[(U8)op] == EXACT) {
6331 /* Literal string, where present. */
6332 node += NODE_SZ_STR(node) - 1;
6333 node = NEXTOPER(node);
6336 node = NEXTOPER(node);
6337 node += regarglen[(U8)op];
6339 if (op == CURLYX || op == OPEN)
6341 else if (op == WHILEM)
6347 #endif /* DEBUGGING */
6351 * c-indentation-style: bsd
6353 * indent-tabs-mode: t
6356 * ex: set ts=8 sts=4 sw=4 noet: