5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
289 #define FAIL2(pat,msg) STMT_START { \
290 const char *ellipses = ""; \
291 IV len = RExC_end - RExC_precomp; \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 #define Simple_vFAIL(m) STMT_START { \
309 const IV offset = RExC_parse - RExC_precomp; \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 #define vFAIL(m) STMT_START { \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
324 * Like Simple_vFAIL(), but accepts two arguments.
326 #define Simple_vFAIL2(m,a1) STMT_START { \
327 const IV offset = RExC_parse - RExC_precomp; \
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 #define vFAIL2(m,a1) STMT_START { \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
346 const IV offset = RExC_parse - RExC_precomp; \
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 #define vFAIL3(m,a1,a2) STMT_START { \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
361 * Like Simple_vFAIL(), but accepts four arguments.
363 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
364 const IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 const IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 const IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 const IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 const IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 const IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 const IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (int)(node), (int)(len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 SvSetMagicSV(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 SV * const sv = data->last_found;
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
506 data->flags &= ~SF_BEFORE_EOL;
509 /* Can match anything (initialization) */
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 cl->flags |= ANYOF_LOCALE;
520 /* Can match anything (initialization) */
522 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
536 /* Can match anything (initialization) */
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 Zero(cl, 1, struct regnode_charclass_class);
542 cl_anything(pRExC_state, cl);
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 Zero(cl, 1, struct regnode_charclass_class);
550 cl_anything(pRExC_state, cl);
552 cl->flags |= ANYOF_LOCALE;
555 /* 'And' a given class with another one. Can create false positives */
556 /* We assume that cl is not inverted */
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 const struct regnode_charclass_class *and_with)
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
589 cl->flags &= ~ANYOF_UNICODE;
592 /* 'OR' a given class with another one. Can create false positives */
593 /* We assume that cl is not inverted */
595 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
597 if (or_with->flags & ANYOF_INVERT) {
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
616 cl_anything(pRExC_state, cl);
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && (!(or_with->flags & ANYOF_FOLD)
622 || (cl->flags & ANYOF_FOLD)) ) {
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
635 cl_anything(pRExC_state, cl);
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
663 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
665 A trie is an N'ary tree where the branches are determined by digital
666 decomposition of the key. IE, at the root node you look up the 1st character and
667 follow that branch repeat until you find the end of the branches. Nodes can be
668 marked as "accepting" meaning they represent a complete word. Eg:
672 would convert into the following structure. Numbers represent states, letters
673 following numbers represent valid transitions on the letter from that state, if
674 the number is in square brackets it represents an accepting state, otherwise it
675 will be in parenthesis.
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
681 (1) +-i->(6)-+-s->[7]
683 +-s->(3)-+-h->(4)-+-e->[5]
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
687 This shows that when matching against the string 'hers' we will begin at state 1
688 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691 single traverse. We store a mapping from accepting to state to which word was
692 matched, and then when we have multiple possibilities we try to complete the
693 rest of the regex in the order in which they occured in the alternation.
695 The only prior NFA like behaviour that would be changed by the TRIE support is
696 the silent ignoring of duplicate alternations which are of the form:
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
700 Thus EVAL blocks follwing a trie may be called a different number of times with
701 and without the optimisation. With the optimisations dupes will be silently
702 ignored. This inconsistant behaviour of EVAL type nodes is well established as
703 the following demonstrates:
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
707 which prints out 'word' three times, but
709 'words'=~/(word|word|word)(?{ print $1 })S/
711 which doesnt print it out at all. This is due to other optimisations kicking in.
713 Example of what happens on a structural level:
715 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
717 1: CURLYM[1] {1,32767}(18)
728 This would be optimizable with startbranch=5, first=5, last=16, tail=16
729 and should turn into:
731 1: CURLYM[1] {1,32767}(18)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
741 Cases where tail != last would be like /(?foo|bar)baz/:
751 which would be optimizable with startbranch=1, first=1, last=7, tail=8
752 and would end up looking like:
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
764 #define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
768 tmp = newSVpvn( "", 0 ); \
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
773 av_push( trie->revcharmap, tmp ); \
776 #define TRIE_READ_CHAR STMT_START { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
800 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
805 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
816 #define TRIE_LIST_NEW(state) STMT_START { \
817 Newxz( trie->states[ state ].trans.list, \
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
824 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
827 /* first pass, loop through and scan words */
830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
835 /* we just use folder as a flag in utf8 */
836 const U8 * const folder = ( flags == EXACTF
844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
847 GET_RE_DEBUG_FLAGS_DECL;
849 Newxz( trie, 1, reg_trie_data );
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
852 Newxz( trie->charmap, 256, U16 );
854 trie->words = newAV();
855 trie->revcharmap = newAV();
859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
860 if (!SvIOK(re_trie_maxbuff)) {
861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
864 /* -- First loop and Setup --
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
889 regnode * const noper = NEXTOPER( cur );
890 const U8 *uc = (U8*)STRING( noper );
891 const U8 * const e = uc + STR_LEN( noper );
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
894 const U8 *scan = (U8*)NULL;
896 for ( ; uc < e ; uc += len ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
927 (int)trie->charcount, trie->uniquecharcount )
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
936 conservative but potentially much slower representation using an array
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
955 Second Pass -- Array Of Lists Representation
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
969 STRLEN transcount = 1;
971 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
977 regnode * const noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
979 const U8 * const e = uc + STR_LEN( noper );
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
986 for ( ; uc < e ; uc += len ) {
991 charid = trie->charmap[ uvc ];
993 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
997 charid=(U16)SvIV( *svpp );
1006 if ( !trie->states[ state ].trans.list ) {
1007 TRIE_LIST_NEW( state );
1009 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1010 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1011 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1016 newstate = next_alloc++;
1017 TRIE_LIST_PUSH( state, charid, newstate );
1022 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1024 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1027 if ( !trie->states[ state ].wordnum ) {
1028 /* we havent inserted this word into the structure yet. */
1029 trie->states[ state ].wordnum = ++curword;
1032 /* store the word for dumping */
1033 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1034 if ( UTF ) SvUTF8_on( tmp );
1035 av_push( trie->words, tmp );
1039 /* Its a dupe. So ignore it. */
1042 } /* end second pass */
1044 trie->laststate = next_alloc;
1045 Renew( trie->states, next_alloc, reg_trie_state );
1047 DEBUG_TRIE_COMPILE_MORE_r({
1050 /* print out the table precompression. */
1052 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1053 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1055 for( state=1 ; state < next_alloc ; state ++ ) {
1058 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1059 if ( ! trie->states[ state ].wordnum ) {
1060 PerlIO_printf( Perl_debug_log, "%5s| ","");
1062 PerlIO_printf( Perl_debug_log, "W%04x| ",
1063 trie->states[ state ].wordnum
1066 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1067 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1068 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1069 SvPV_nolen_const( *tmp ),
1070 TRIE_LIST_ITEM(state,charid).forid,
1071 (UV)TRIE_LIST_ITEM(state,charid).newstate
1076 PerlIO_printf( Perl_debug_log, "\n\n" );
1079 Newxz( trie->trans, transcount ,reg_trie_trans );
1086 for( state=1 ; state < next_alloc ; state ++ ) {
1090 DEBUG_TRIE_COMPILE_MORE_r(
1091 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1095 if (trie->states[state].trans.list) {
1096 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1100 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1101 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1102 if ( forid < minid ) {
1104 } else if ( forid > maxid ) {
1108 if ( transcount < tp + maxid - minid + 1) {
1110 Renew( trie->trans, transcount, reg_trie_trans );
1111 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1113 base = trie->uniquecharcount + tp - minid;
1114 if ( maxid == minid ) {
1116 for ( ; zp < tp ; zp++ ) {
1117 if ( ! trie->trans[ zp ].next ) {
1118 base = trie->uniquecharcount + zp - minid;
1119 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1120 trie->trans[ zp ].check = state;
1126 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1127 trie->trans[ tp ].check = state;
1132 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1133 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1134 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1135 trie->trans[ tid ].check = state;
1137 tp += ( maxid - minid + 1 );
1139 Safefree(trie->states[ state ].trans.list);
1142 DEBUG_TRIE_COMPILE_MORE_r(
1143 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1146 trie->states[ state ].trans.base=base;
1148 trie->lasttrans = tp + 1;
1152 Second Pass -- Flat Table Representation.
1154 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1155 We know that we will need Charcount+1 trans at most to store the data
1156 (one row per char at worst case) So we preallocate both structures
1157 assuming worst case.
1159 We then construct the trie using only the .next slots of the entry
1162 We use the .check field of the first entry of the node temporarily to
1163 make compression both faster and easier by keeping track of how many non
1164 zero fields are in the node.
1166 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1169 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1170 number representing the first entry of the node, and state as a
1171 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1172 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1173 are 2 entrys per node. eg:
1181 The table is internally in the right hand, idx form. However as we also
1182 have to deal with the states array which is indexed by nodenum we have to
1183 use TRIE_NODENUM() to convert.
1187 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1189 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1190 next_alloc = trie->uniquecharcount + 1;
1192 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1194 regnode * const noper = NEXTOPER( cur );
1195 const U8 *uc = (U8*)STRING( noper );
1196 const U8 * const e = uc + STR_LEN( noper );
1198 U32 state = 1; /* required init */
1200 U16 charid = 0; /* sanity init */
1201 U32 accept_state = 0; /* sanity init */
1202 U8 *scan = (U8*)NULL; /* sanity init */
1204 STRLEN foldlen = 0; /* required init */
1205 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1208 for ( ; uc < e ; uc += len ) {
1213 charid = trie->charmap[ uvc ];
1215 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1216 charid = svpp ? (U16)SvIV(*svpp) : 0;
1220 if ( !trie->trans[ state + charid ].next ) {
1221 trie->trans[ state + charid ].next = next_alloc;
1222 trie->trans[ state ].check++;
1223 next_alloc += trie->uniquecharcount;
1225 state = trie->trans[ state + charid ].next;
1227 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1229 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1232 accept_state = TRIE_NODENUM( state );
1233 if ( !trie->states[ accept_state ].wordnum ) {
1234 /* we havent inserted this word into the structure yet. */
1235 trie->states[ accept_state ].wordnum = ++curword;
1238 /* store the word for dumping */
1239 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1240 if ( UTF ) SvUTF8_on( tmp );
1241 av_push( trie->words, tmp );
1245 /* Its a dupe. So ignore it. */
1248 } /* end second pass */
1250 DEBUG_TRIE_COMPILE_MORE_r({
1252 print out the table precompression so that we can do a visual check
1253 that they are identical.
1257 PerlIO_printf( Perl_debug_log, "\nChar : " );
1259 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1260 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1262 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1266 PerlIO_printf( Perl_debug_log, "\nState+-" );
1268 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1269 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1272 PerlIO_printf( Perl_debug_log, "\n" );
1274 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1276 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1278 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1279 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1280 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1282 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1283 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1285 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1286 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1289 PerlIO_printf( Perl_debug_log, "\n\n" );
1293 * Inplace compress the table.*
1295 For sparse data sets the table constructed by the trie algorithm will
1296 be mostly 0/FAIL transitions or to put it another way mostly empty.
1297 (Note that leaf nodes will not contain any transitions.)
1299 This algorithm compresses the tables by eliminating most such
1300 transitions, at the cost of a modest bit of extra work during lookup:
1302 - Each states[] entry contains a .base field which indicates the
1303 index in the state[] array wheres its transition data is stored.
1305 - If .base is 0 there are no valid transitions from that node.
1307 - If .base is nonzero then charid is added to it to find an entry in
1310 -If trans[states[state].base+charid].check!=state then the
1311 transition is taken to be a 0/Fail transition. Thus if there are fail
1312 transitions at the front of the node then the .base offset will point
1313 somewhere inside the previous nodes data (or maybe even into a node
1314 even earlier), but the .check field determines if the transition is
1317 The following process inplace converts the table to the compressed
1318 table: We first do not compress the root node 1,and mark its all its
1319 .check pointers as 1 and set its .base pointer as 1 as well. This
1320 allows to do a DFA construction from the compressed table later, and
1321 ensures that any .base pointers we calculate later are greater than
1324 - We set 'pos' to indicate the first entry of the second node.
1326 - We then iterate over the columns of the node, finding the first and
1327 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1328 and set the .check pointers accordingly, and advance pos
1329 appropriately and repreat for the next node. Note that when we copy
1330 the next pointers we have to convert them from the original
1331 NODEIDX form to NODENUM form as the former is not valid post
1334 - If a node has no transitions used we mark its base as 0 and do not
1335 advance the pos pointer.
1337 - If a node only has one transition we use a second pointer into the
1338 structure to fill in allocated fail transitions from other states.
1339 This pointer is independent of the main pointer and scans forward
1340 looking for null transitions that are allocated to a state. When it
1341 finds one it writes the single transition into the "hole". If the
1342 pointer doesnt find one the single transition is appeneded as normal.
1344 - Once compressed we can Renew/realloc the structures to release the
1347 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1348 specifically Fig 3.47 and the associated pseudocode.
1352 const U32 laststate = TRIE_NODENUM( next_alloc );
1355 trie->laststate = laststate;
1357 for ( state = 1 ; state < laststate ; state++ ) {
1359 const U32 stateidx = TRIE_NODEIDX( state );
1360 const U32 o_used = trie->trans[ stateidx ].check;
1361 U32 used = trie->trans[ stateidx ].check;
1362 trie->trans[ stateidx ].check = 0;
1364 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1365 if ( flag || trie->trans[ stateidx + charid ].next ) {
1366 if ( trie->trans[ stateidx + charid ].next ) {
1368 for ( ; zp < pos ; zp++ ) {
1369 if ( ! trie->trans[ zp ].next ) {
1373 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1374 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1375 trie->trans[ zp ].check = state;
1376 if ( ++zp > pos ) pos = zp;
1383 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1385 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1386 trie->trans[ pos ].check = state;
1391 trie->lasttrans = pos + 1;
1392 Renew( trie->states, laststate + 1, reg_trie_state);
1393 DEBUG_TRIE_COMPILE_MORE_r(
1394 PerlIO_printf( Perl_debug_log,
1395 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1396 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1399 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1402 } /* end table compress */
1404 /* resize the trans array to remove unused space */
1405 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1407 DEBUG_TRIE_COMPILE_r({
1410 Now we print it out again, in a slightly different form as there is additional
1411 info we want to be able to see when its compressed. They are close enough for
1412 visual comparison though.
1414 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1416 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1417 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1419 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1422 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1424 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1425 PerlIO_printf( Perl_debug_log, "-----");
1426 PerlIO_printf( Perl_debug_log, "\n");
1428 for( state = 1 ; state < trie->laststate ; state++ ) {
1429 const U32 base = trie->states[ state ].trans.base;
1431 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1433 if ( trie->states[ state ].wordnum ) {
1434 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1436 PerlIO_printf( Perl_debug_log, "%6s", "" );
1439 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1444 while( ( base + ofs < trie->uniquecharcount ) ||
1445 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1446 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1449 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1451 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1452 if ( ( base + ofs >= trie->uniquecharcount ) &&
1453 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1454 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1456 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1457 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1459 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1463 PerlIO_printf( Perl_debug_log, "]");
1466 PerlIO_printf( Perl_debug_log, "\n" );
1471 /* now finally we "stitch in" the new TRIE node
1472 This means we convert either the first branch or the first Exact,
1473 depending on whether the thing following (in 'last') is a branch
1474 or not and whther first is the startbranch (ie is it a sub part of
1475 the alternation or is it the whole thing.)
1476 Assuming its a sub part we conver the EXACT otherwise we convert
1477 the whole branch sequence, including the first.
1484 if ( first == startbranch && OP( last ) != BRANCH ) {
1487 convert = NEXTOPER( first );
1488 NEXT_OFF( first ) = (U16)(last - first);
1491 OP( convert ) = TRIE + (U8)( flags - EXACT );
1492 NEXT_OFF( convert ) = (U16)(tail - convert);
1493 ARG_SET( convert, data_slot );
1495 /* tells us if we need to handle accept buffers specially */
1496 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1499 /* needed for dumping*/
1501 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1502 /* We now need to mark all of the space originally used by the
1503 branches as optimized away. This keeps the dumpuntil from
1504 throwing a wobbly as it doesnt use regnext() to traverse the
1507 while( optimize < last ) {
1508 OP( optimize ) = OPTIMIZED;
1512 } /* end node insert */
1519 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1520 * These need to be revisited when a newer toolchain becomes available.
1522 #if defined(__sparc64__) && defined(__GNUC__)
1523 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1524 # undef SPARC64_GCC_WORKAROUND
1525 # define SPARC64_GCC_WORKAROUND 1
1529 /* REx optimizer. Converts nodes into quickier variants "in place".
1530 Finds fixed substrings. */
1532 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1533 to the position after last scanned or to NULL. */
1537 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1538 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1539 /* scanp: Start here (read-write). */
1540 /* deltap: Write maxlen-minlen here. */
1541 /* last: Stop before this one. */
1543 I32 min = 0, pars = 0, code;
1544 regnode *scan = *scanp, *next;
1546 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1547 int is_inf_internal = 0; /* The studied chunk is infinite */
1548 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1549 scan_data_t data_fake;
1550 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1551 SV *re_trie_maxbuff = NULL;
1553 GET_RE_DEBUG_FLAGS_DECL;
1555 while (scan && OP(scan) != END && scan < last) {
1556 /* Peephole optimizer: */
1558 SV * const mysv=sv_newmortal();
1559 regprop( mysv, scan);
1560 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1561 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1564 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1565 /* Merge several consecutive EXACTish nodes into one. */
1566 regnode *n = regnext(scan);
1569 regnode *stop = scan;
1572 next = scan + NODE_SZ_STR(scan);
1573 /* Skip NOTHING, merge EXACT*. */
1575 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1576 (stringok && (OP(n) == OP(scan))))
1578 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1579 if (OP(n) == TAIL || n > next)
1581 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1582 NEXT_OFF(scan) += NEXT_OFF(n);
1583 next = n + NODE_STEP_REGNODE;
1590 else if (stringok) {
1591 const int oldl = STR_LEN(scan);
1592 regnode * const nnext = regnext(n);
1594 if (oldl + STR_LEN(n) > U8_MAX)
1596 NEXT_OFF(scan) += NEXT_OFF(n);
1597 STR_LEN(scan) += STR_LEN(n);
1598 next = n + NODE_SZ_STR(n);
1599 /* Now we can overwrite *n : */
1600 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1608 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1610 Two problematic code points in Unicode casefolding of EXACT nodes:
1612 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1613 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1619 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1620 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1622 This means that in case-insensitive matching (or "loose matching",
1623 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1624 length of the above casefolded versions) can match a target string
1625 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1626 This would rather mess up the minimum length computation.
1628 What we'll do is to look for the tail four bytes, and then peek
1629 at the preceding two bytes to see whether we need to decrease
1630 the minimum length by four (six minus two).
1632 Thanks to the design of UTF-8, there cannot be false matches:
1633 A sequence of valid UTF-8 bytes cannot be a subsequence of
1634 another valid sequence of UTF-8 bytes.
1637 char * const s0 = STRING(scan), *s, *t;
1638 char * const s1 = s0 + STR_LEN(scan) - 1;
1639 char * const s2 = s1 - 4;
1640 const char * const t0 = "\xcc\x88\xcc\x81";
1641 const char * const t1 = t0 + 3;
1644 s < s2 && (t = ninstr(s, s1, t0, t1));
1646 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1647 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1654 n = scan + NODE_SZ_STR(scan);
1656 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1667 /* Follow the next-chain of the current node and optimize
1668 away all the NOTHINGs from it. */
1669 if (OP(scan) != CURLYX) {
1670 const int max = (reg_off_by_arg[OP(scan)]
1672 /* I32 may be smaller than U16 on CRAYs! */
1673 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1674 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1678 /* Skip NOTHING and LONGJMP. */
1679 while ((n = regnext(n))
1680 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1681 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1682 && off + noff < max)
1684 if (reg_off_by_arg[OP(scan)])
1687 NEXT_OFF(scan) = off;
1690 /* The principal pseudo-switch. Cannot be a switch, since we
1691 look into several different things. */
1692 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1693 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1694 next = regnext(scan);
1696 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1698 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1699 I32 max1 = 0, min1 = I32_MAX, num = 0;
1700 struct regnode_charclass_class accum;
1701 regnode *startbranch=scan;
1703 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1704 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1705 if (flags & SCF_DO_STCLASS)
1706 cl_init_zero(pRExC_state, &accum);
1708 while (OP(scan) == code) {
1709 I32 deltanext, minnext, f = 0, fake;
1710 struct regnode_charclass_class this_class;
1713 data_fake.flags = 0;
1715 data_fake.whilem_c = data->whilem_c;
1716 data_fake.last_closep = data->last_closep;
1719 data_fake.last_closep = &fake;
1720 next = regnext(scan);
1721 scan = NEXTOPER(scan);
1723 scan = NEXTOPER(scan);
1724 if (flags & SCF_DO_STCLASS) {
1725 cl_init(pRExC_state, &this_class);
1726 data_fake.start_class = &this_class;
1727 f = SCF_DO_STCLASS_AND;
1729 if (flags & SCF_WHILEM_VISITED_POS)
1730 f |= SCF_WHILEM_VISITED_POS;
1732 /* we suppose the run is continuous, last=next...*/
1733 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1734 next, &data_fake, f,depth+1);
1737 if (max1 < minnext + deltanext)
1738 max1 = minnext + deltanext;
1739 if (deltanext == I32_MAX)
1740 is_inf = is_inf_internal = 1;
1742 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1744 if (data && (data_fake.flags & SF_HAS_EVAL))
1745 data->flags |= SF_HAS_EVAL;
1747 data->whilem_c = data_fake.whilem_c;
1748 if (flags & SCF_DO_STCLASS)
1749 cl_or(pRExC_state, &accum, &this_class);
1750 if (code == SUSPEND)
1753 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1755 if (flags & SCF_DO_SUBSTR) {
1756 data->pos_min += min1;
1757 data->pos_delta += max1 - min1;
1758 if (max1 != min1 || is_inf)
1759 data->longest = &(data->longest_float);
1762 delta += max1 - min1;
1763 if (flags & SCF_DO_STCLASS_OR) {
1764 cl_or(pRExC_state, data->start_class, &accum);
1766 cl_and(data->start_class, &and_with);
1767 flags &= ~SCF_DO_STCLASS;
1770 else if (flags & SCF_DO_STCLASS_AND) {
1772 cl_and(data->start_class, &accum);
1773 flags &= ~SCF_DO_STCLASS;
1776 /* Switch to OR mode: cache the old value of
1777 * data->start_class */
1778 StructCopy(data->start_class, &and_with,
1779 struct regnode_charclass_class);
1780 flags &= ~SCF_DO_STCLASS_AND;
1781 StructCopy(&accum, data->start_class,
1782 struct regnode_charclass_class);
1783 flags |= SCF_DO_STCLASS_OR;
1784 data->start_class->flags |= ANYOF_EOS;
1790 Assuming this was/is a branch we are dealing with: 'scan' now
1791 points at the item that follows the branch sequence, whatever
1792 it is. We now start at the beginning of the sequence and look
1798 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1800 If we can find such a subseqence we need to turn the first
1801 element into a trie and then add the subsequent branch exact
1802 strings to the trie.
1806 1. patterns where the whole set of branch can be converted to a trie,
1808 2. patterns where only a subset of the alternations can be
1809 converted to a trie.
1811 In case 1 we can replace the whole set with a single regop
1812 for the trie. In case 2 we need to keep the start and end
1815 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1816 becomes BRANCH TRIE; BRANCH X;
1818 Hypthetically when we know the regex isnt anchored we can
1819 turn a case 1 into a DFA and let it rip... Every time it finds a match
1820 it would just call its tail, no WHILEM/CURLY needed.
1824 if (!re_trie_maxbuff) {
1825 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1826 if (!SvIOK(re_trie_maxbuff))
1827 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1829 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1831 regnode *first = (regnode *)NULL;
1832 regnode *last = (regnode *)NULL;
1833 regnode *tail = scan;
1838 SV * const mysv = sv_newmortal(); /* for dumping */
1840 /* var tail is used because there may be a TAIL
1841 regop in the way. Ie, the exacts will point to the
1842 thing following the TAIL, but the last branch will
1843 point at the TAIL. So we advance tail. If we
1844 have nested (?:) we may have to move through several
1848 while ( OP( tail ) == TAIL ) {
1849 /* this is the TAIL generated by (?:) */
1850 tail = regnext( tail );
1854 regprop( mysv, tail );
1855 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1856 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1857 (RExC_seen_evals) ? "[EVAL]" : ""
1862 step through the branches, cur represents each
1863 branch, noper is the first thing to be matched
1864 as part of that branch and noper_next is the
1865 regnext() of that node. if noper is an EXACT
1866 and noper_next is the same as scan (our current
1867 position in the regex) then the EXACT branch is
1868 a possible optimization target. Once we have
1869 two or more consequetive such branches we can
1870 create a trie of the EXACT's contents and stich
1871 it in place. If the sequence represents all of
1872 the branches we eliminate the whole thing and
1873 replace it with a single TRIE. If it is a
1874 subsequence then we need to stitch it in. This
1875 means the first branch has to remain, and needs
1876 to be repointed at the item on the branch chain
1877 following the last branch optimized. This could
1878 be either a BRANCH, in which case the
1879 subsequence is internal, or it could be the
1880 item following the branch sequence in which
1881 case the subsequence is at the end.
1885 /* dont use tail as the end marker for this traverse */
1886 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1887 regnode * const noper = NEXTOPER( cur );
1888 regnode * const noper_next = regnext( noper );
1891 regprop( mysv, cur);
1892 PerlIO_printf( Perl_debug_log, "%*s%s",
1893 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1895 regprop( mysv, noper);
1896 PerlIO_printf( Perl_debug_log, " -> %s",
1897 SvPV_nolen_const(mysv));
1900 regprop( mysv, noper_next );
1901 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1902 SvPV_nolen_const(mysv));
1904 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1907 if ( ( first ? OP( noper ) == optype
1908 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1909 && noper_next == tail && count<U16_MAX)
1914 optype = OP( noper );
1918 regprop( mysv, first);
1919 PerlIO_printf( Perl_debug_log, "%*s%s",
1920 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1921 regprop( mysv, NEXTOPER(first) );
1922 PerlIO_printf( Perl_debug_log, " -> %s\n",
1923 SvPV_nolen_const( mysv ) );
1928 regprop( mysv, cur);
1929 PerlIO_printf( Perl_debug_log, "%*s%s",
1930 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1931 regprop( mysv, noper );
1932 PerlIO_printf( Perl_debug_log, " -> %s\n",
1933 SvPV_nolen_const( mysv ) );
1939 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1940 (int)depth * 2 + 2, "E:", "**END**" );
1942 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1944 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1945 && noper_next == tail )
1949 optype = OP( noper );
1959 regprop( mysv, cur);
1960 PerlIO_printf( Perl_debug_log,
1961 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1962 " ", SvPV_nolen_const( mysv ), first, last, cur);
1967 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1968 (int)depth * 2 + 2, "E:", "==END==" );
1970 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1975 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1976 scan = NEXTOPER(NEXTOPER(scan));
1977 } else /* single branch is optimized. */
1978 scan = NEXTOPER(scan);
1981 else if (OP(scan) == EXACT) {
1982 I32 l = STR_LEN(scan);
1985 const U8 * const s = (U8*)STRING(scan);
1986 l = utf8_length(s, s + l);
1987 uc = utf8_to_uvchr(s, NULL);
1989 uc = *((U8*)STRING(scan));
1992 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1993 /* The code below prefers earlier match for fixed
1994 offset, later match for variable offset. */
1995 if (data->last_end == -1) { /* Update the start info. */
1996 data->last_start_min = data->pos_min;
1997 data->last_start_max = is_inf
1998 ? I32_MAX : data->pos_min + data->pos_delta;
2000 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2002 SV * const sv = data->last_found;
2003 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2004 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2005 if (mg && mg->mg_len >= 0)
2006 mg->mg_len += utf8_length((U8*)STRING(scan),
2007 (U8*)STRING(scan)+STR_LEN(scan));
2010 SvUTF8_on(data->last_found);
2011 data->last_end = data->pos_min + l;
2012 data->pos_min += l; /* As in the first entry. */
2013 data->flags &= ~SF_BEFORE_EOL;
2015 if (flags & SCF_DO_STCLASS_AND) {
2016 /* Check whether it is compatible with what we know already! */
2020 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2021 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2022 && (!(data->start_class->flags & ANYOF_FOLD)
2023 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2026 ANYOF_CLASS_ZERO(data->start_class);
2027 ANYOF_BITMAP_ZERO(data->start_class);
2029 ANYOF_BITMAP_SET(data->start_class, uc);
2030 data->start_class->flags &= ~ANYOF_EOS;
2032 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2034 else if (flags & SCF_DO_STCLASS_OR) {
2035 /* false positive possible if the class is case-folded */
2037 ANYOF_BITMAP_SET(data->start_class, uc);
2039 data->start_class->flags |= ANYOF_UNICODE_ALL;
2040 data->start_class->flags &= ~ANYOF_EOS;
2041 cl_and(data->start_class, &and_with);
2043 flags &= ~SCF_DO_STCLASS;
2045 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2046 I32 l = STR_LEN(scan);
2047 UV uc = *((U8*)STRING(scan));
2049 /* Search for fixed substrings supports EXACT only. */
2050 if (flags & SCF_DO_SUBSTR)
2051 scan_commit(pRExC_state, data);
2053 U8 *s = (U8 *)STRING(scan);
2054 l = utf8_length(s, s + l);
2055 uc = utf8_to_uvchr(s, NULL);
2058 if (data && (flags & SCF_DO_SUBSTR))
2060 if (flags & SCF_DO_STCLASS_AND) {
2061 /* Check whether it is compatible with what we know already! */
2065 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2066 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2067 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2069 ANYOF_CLASS_ZERO(data->start_class);
2070 ANYOF_BITMAP_ZERO(data->start_class);
2072 ANYOF_BITMAP_SET(data->start_class, uc);
2073 data->start_class->flags &= ~ANYOF_EOS;
2074 data->start_class->flags |= ANYOF_FOLD;
2075 if (OP(scan) == EXACTFL)
2076 data->start_class->flags |= ANYOF_LOCALE;
2079 else if (flags & SCF_DO_STCLASS_OR) {
2080 if (data->start_class->flags & ANYOF_FOLD) {
2081 /* false positive possible if the class is case-folded.
2082 Assume that the locale settings are the same... */
2084 ANYOF_BITMAP_SET(data->start_class, uc);
2085 data->start_class->flags &= ~ANYOF_EOS;
2087 cl_and(data->start_class, &and_with);
2089 flags &= ~SCF_DO_STCLASS;
2091 else if (strchr((const char*)PL_varies,OP(scan))) {
2092 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2093 I32 f = flags, pos_before = 0;
2094 regnode *oscan = scan;
2095 struct regnode_charclass_class this_class;
2096 struct regnode_charclass_class *oclass = NULL;
2097 I32 next_is_eval = 0;
2099 switch (PL_regkind[(U8)OP(scan)]) {
2100 case WHILEM: /* End of (?:...)* . */
2101 scan = NEXTOPER(scan);
2104 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2105 next = NEXTOPER(scan);
2106 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2108 maxcount = REG_INFTY;
2109 next = regnext(scan);
2110 scan = NEXTOPER(scan);
2114 if (flags & SCF_DO_SUBSTR)
2119 if (flags & SCF_DO_STCLASS) {
2121 maxcount = REG_INFTY;
2122 next = regnext(scan);
2123 scan = NEXTOPER(scan);
2126 is_inf = is_inf_internal = 1;
2127 scan = regnext(scan);
2128 if (flags & SCF_DO_SUBSTR) {
2129 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2130 data->longest = &(data->longest_float);
2132 goto optimize_curly_tail;
2134 mincount = ARG1(scan);
2135 maxcount = ARG2(scan);
2136 next = regnext(scan);
2137 if (OP(scan) == CURLYX) {
2138 I32 lp = (data ? *(data->last_closep) : 0);
2139 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2141 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2142 next_is_eval = (OP(scan) == EVAL);
2144 if (flags & SCF_DO_SUBSTR) {
2145 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2146 pos_before = data->pos_min;
2150 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2152 data->flags |= SF_IS_INF;
2154 if (flags & SCF_DO_STCLASS) {
2155 cl_init(pRExC_state, &this_class);
2156 oclass = data->start_class;
2157 data->start_class = &this_class;
2158 f |= SCF_DO_STCLASS_AND;
2159 f &= ~SCF_DO_STCLASS_OR;
2161 /* These are the cases when once a subexpression
2162 fails at a particular position, it cannot succeed
2163 even after backtracking at the enclosing scope.
2165 XXXX what if minimal match and we are at the
2166 initial run of {n,m}? */
2167 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2168 f &= ~SCF_WHILEM_VISITED_POS;
2170 /* This will finish on WHILEM, setting scan, or on NULL: */
2171 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2173 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2175 if (flags & SCF_DO_STCLASS)
2176 data->start_class = oclass;
2177 if (mincount == 0 || minnext == 0) {
2178 if (flags & SCF_DO_STCLASS_OR) {
2179 cl_or(pRExC_state, data->start_class, &this_class);
2181 else if (flags & SCF_DO_STCLASS_AND) {
2182 /* Switch to OR mode: cache the old value of
2183 * data->start_class */
2184 StructCopy(data->start_class, &and_with,
2185 struct regnode_charclass_class);
2186 flags &= ~SCF_DO_STCLASS_AND;
2187 StructCopy(&this_class, data->start_class,
2188 struct regnode_charclass_class);
2189 flags |= SCF_DO_STCLASS_OR;
2190 data->start_class->flags |= ANYOF_EOS;
2192 } else { /* Non-zero len */
2193 if (flags & SCF_DO_STCLASS_OR) {
2194 cl_or(pRExC_state, data->start_class, &this_class);
2195 cl_and(data->start_class, &and_with);
2197 else if (flags & SCF_DO_STCLASS_AND)
2198 cl_and(data->start_class, &this_class);
2199 flags &= ~SCF_DO_STCLASS;
2201 if (!scan) /* It was not CURLYX, but CURLY. */
2203 if ( /* ? quantifier ok, except for (?{ ... }) */
2204 (next_is_eval || !(mincount == 0 && maxcount == 1))
2205 && (minnext == 0) && (deltanext == 0)
2206 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2207 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2208 && ckWARN(WARN_REGEXP))
2211 "Quantifier unexpected on zero-length expression");
2214 min += minnext * mincount;
2215 is_inf_internal |= ((maxcount == REG_INFTY
2216 && (minnext + deltanext) > 0)
2217 || deltanext == I32_MAX);
2218 is_inf |= is_inf_internal;
2219 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2221 /* Try powerful optimization CURLYX => CURLYN. */
2222 if ( OP(oscan) == CURLYX && data
2223 && data->flags & SF_IN_PAR
2224 && !(data->flags & SF_HAS_EVAL)
2225 && !deltanext && minnext == 1 ) {
2226 /* Try to optimize to CURLYN. */
2227 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2228 regnode *nxt1 = nxt;
2235 if (!strchr((const char*)PL_simple,OP(nxt))
2236 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2237 && STR_LEN(nxt) == 1))
2243 if (OP(nxt) != CLOSE)
2245 /* Now we know that nxt2 is the only contents: */
2246 oscan->flags = (U8)ARG(nxt);
2248 OP(nxt1) = NOTHING; /* was OPEN. */
2250 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2251 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2252 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2253 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2254 OP(nxt + 1) = OPTIMIZED; /* was count. */
2255 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2260 /* Try optimization CURLYX => CURLYM. */
2261 if ( OP(oscan) == CURLYX && data
2262 && !(data->flags & SF_HAS_PAR)
2263 && !(data->flags & SF_HAS_EVAL)
2264 && !deltanext /* atom is fixed width */
2265 && minnext != 0 /* CURLYM can't handle zero width */
2267 /* XXXX How to optimize if data == 0? */
2268 /* Optimize to a simpler form. */
2269 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2273 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2274 && (OP(nxt2) != WHILEM))
2276 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2277 /* Need to optimize away parenths. */
2278 if (data->flags & SF_IN_PAR) {
2279 /* Set the parenth number. */
2280 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2282 if (OP(nxt) != CLOSE)
2283 FAIL("Panic opt close");
2284 oscan->flags = (U8)ARG(nxt);
2285 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2286 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2288 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2289 OP(nxt + 1) = OPTIMIZED; /* was count. */
2290 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2291 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2294 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2295 regnode *nnxt = regnext(nxt1);
2298 if (reg_off_by_arg[OP(nxt1)])
2299 ARG_SET(nxt1, nxt2 - nxt1);
2300 else if (nxt2 - nxt1 < U16_MAX)
2301 NEXT_OFF(nxt1) = nxt2 - nxt1;
2303 OP(nxt) = NOTHING; /* Cannot beautify */
2308 /* Optimize again: */
2309 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2315 else if ((OP(oscan) == CURLYX)
2316 && (flags & SCF_WHILEM_VISITED_POS)
2317 /* See the comment on a similar expression above.
2318 However, this time it not a subexpression
2319 we care about, but the expression itself. */
2320 && (maxcount == REG_INFTY)
2321 && data && ++data->whilem_c < 16) {
2322 /* This stays as CURLYX, we can put the count/of pair. */
2323 /* Find WHILEM (as in regexec.c) */
2324 regnode *nxt = oscan + NEXT_OFF(oscan);
2326 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2328 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2329 | (RExC_whilem_seen << 4)); /* On WHILEM */
2331 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2333 if (flags & SCF_DO_SUBSTR) {
2334 SV *last_str = NULL;
2335 int counted = mincount != 0;
2337 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2338 #if defined(SPARC64_GCC_WORKAROUND)
2341 const char *s = NULL;
2344 if (pos_before >= data->last_start_min)
2347 b = data->last_start_min;
2350 s = SvPV_const(data->last_found, l);
2351 old = b - data->last_start_min;
2354 I32 b = pos_before >= data->last_start_min
2355 ? pos_before : data->last_start_min;
2357 const char *s = SvPV_const(data->last_found, l);
2358 I32 old = b - data->last_start_min;
2362 old = utf8_hop((U8*)s, old) - (U8*)s;
2365 /* Get the added string: */
2366 last_str = newSVpvn(s + old, l);
2368 SvUTF8_on(last_str);
2369 if (deltanext == 0 && pos_before == b) {
2370 /* What was added is a constant string */
2372 SvGROW(last_str, (mincount * l) + 1);
2373 repeatcpy(SvPVX(last_str) + l,
2374 SvPVX_const(last_str), l, mincount - 1);
2375 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2376 /* Add additional parts. */
2377 SvCUR_set(data->last_found,
2378 SvCUR(data->last_found) - l);
2379 sv_catsv(data->last_found, last_str);
2381 SV * sv = data->last_found;
2383 SvUTF8(sv) && SvMAGICAL(sv) ?
2384 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2385 if (mg && mg->mg_len >= 0)
2386 mg->mg_len += CHR_SVLEN(last_str);
2388 data->last_end += l * (mincount - 1);
2391 /* start offset must point into the last copy */
2392 data->last_start_min += minnext * (mincount - 1);
2393 data->last_start_max += is_inf ? I32_MAX
2394 : (maxcount - 1) * (minnext + data->pos_delta);
2397 /* It is counted once already... */
2398 data->pos_min += minnext * (mincount - counted);
2399 data->pos_delta += - counted * deltanext +
2400 (minnext + deltanext) * maxcount - minnext * mincount;
2401 if (mincount != maxcount) {
2402 /* Cannot extend fixed substrings found inside
2404 scan_commit(pRExC_state,data);
2405 if (mincount && last_str) {
2406 sv_setsv(data->last_found, last_str);
2407 data->last_end = data->pos_min;
2408 data->last_start_min =
2409 data->pos_min - CHR_SVLEN(last_str);
2410 data->last_start_max = is_inf
2412 : data->pos_min + data->pos_delta
2413 - CHR_SVLEN(last_str);
2415 data->longest = &(data->longest_float);
2417 SvREFCNT_dec(last_str);
2419 if (data && (fl & SF_HAS_EVAL))
2420 data->flags |= SF_HAS_EVAL;
2421 optimize_curly_tail:
2422 if (OP(oscan) != CURLYX) {
2423 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2425 NEXT_OFF(oscan) += NEXT_OFF(next);
2428 default: /* REF and CLUMP only? */
2429 if (flags & SCF_DO_SUBSTR) {
2430 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2431 data->longest = &(data->longest_float);
2433 is_inf = is_inf_internal = 1;
2434 if (flags & SCF_DO_STCLASS_OR)
2435 cl_anything(pRExC_state, data->start_class);
2436 flags &= ~SCF_DO_STCLASS;
2440 else if (strchr((const char*)PL_simple,OP(scan))) {
2443 if (flags & SCF_DO_SUBSTR) {
2444 scan_commit(pRExC_state,data);
2448 if (flags & SCF_DO_STCLASS) {
2449 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2451 /* Some of the logic below assumes that switching
2452 locale on will only add false positives. */
2453 switch (PL_regkind[(U8)OP(scan)]) {
2457 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2458 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2459 cl_anything(pRExC_state, data->start_class);
2462 if (OP(scan) == SANY)
2464 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2465 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2466 || (data->start_class->flags & ANYOF_CLASS));
2467 cl_anything(pRExC_state, data->start_class);
2469 if (flags & SCF_DO_STCLASS_AND || !value)
2470 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2473 if (flags & SCF_DO_STCLASS_AND)
2474 cl_and(data->start_class,
2475 (struct regnode_charclass_class*)scan);
2477 cl_or(pRExC_state, data->start_class,
2478 (struct regnode_charclass_class*)scan);
2481 if (flags & SCF_DO_STCLASS_AND) {
2482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2484 for (value = 0; value < 256; value++)
2485 if (!isALNUM(value))
2486 ANYOF_BITMAP_CLEAR(data->start_class, value);
2490 if (data->start_class->flags & ANYOF_LOCALE)
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2493 for (value = 0; value < 256; value++)
2495 ANYOF_BITMAP_SET(data->start_class, value);
2500 if (flags & SCF_DO_STCLASS_AND) {
2501 if (data->start_class->flags & ANYOF_LOCALE)
2502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2505 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2506 data->start_class->flags |= ANYOF_LOCALE;
2510 if (flags & SCF_DO_STCLASS_AND) {
2511 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2513 for (value = 0; value < 256; value++)
2515 ANYOF_BITMAP_CLEAR(data->start_class, value);
2519 if (data->start_class->flags & ANYOF_LOCALE)
2520 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2522 for (value = 0; value < 256; value++)
2523 if (!isALNUM(value))
2524 ANYOF_BITMAP_SET(data->start_class, value);
2529 if (flags & SCF_DO_STCLASS_AND) {
2530 if (data->start_class->flags & ANYOF_LOCALE)
2531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2534 data->start_class->flags |= ANYOF_LOCALE;
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2539 if (flags & SCF_DO_STCLASS_AND) {
2540 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2542 for (value = 0; value < 256; value++)
2543 if (!isSPACE(value))
2544 ANYOF_BITMAP_CLEAR(data->start_class, value);
2548 if (data->start_class->flags & ANYOF_LOCALE)
2549 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2551 for (value = 0; value < 256; value++)
2553 ANYOF_BITMAP_SET(data->start_class, value);
2558 if (flags & SCF_DO_STCLASS_AND) {
2559 if (data->start_class->flags & ANYOF_LOCALE)
2560 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2563 data->start_class->flags |= ANYOF_LOCALE;
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2568 if (flags & SCF_DO_STCLASS_AND) {
2569 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2571 for (value = 0; value < 256; value++)
2573 ANYOF_BITMAP_CLEAR(data->start_class, value);
2577 if (data->start_class->flags & ANYOF_LOCALE)
2578 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2580 for (value = 0; value < 256; value++)
2581 if (!isSPACE(value))
2582 ANYOF_BITMAP_SET(data->start_class, value);
2587 if (flags & SCF_DO_STCLASS_AND) {
2588 if (data->start_class->flags & ANYOF_LOCALE) {
2589 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2590 for (value = 0; value < 256; value++)
2591 if (!isSPACE(value))
2592 ANYOF_BITMAP_CLEAR(data->start_class, value);
2596 data->start_class->flags |= ANYOF_LOCALE;
2597 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2601 if (flags & SCF_DO_STCLASS_AND) {
2602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2603 for (value = 0; value < 256; value++)
2604 if (!isDIGIT(value))
2605 ANYOF_BITMAP_CLEAR(data->start_class, value);
2608 if (data->start_class->flags & ANYOF_LOCALE)
2609 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2611 for (value = 0; value < 256; value++)
2613 ANYOF_BITMAP_SET(data->start_class, value);
2618 if (flags & SCF_DO_STCLASS_AND) {
2619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2620 for (value = 0; value < 256; value++)
2622 ANYOF_BITMAP_CLEAR(data->start_class, value);
2625 if (data->start_class->flags & ANYOF_LOCALE)
2626 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2628 for (value = 0; value < 256; value++)
2629 if (!isDIGIT(value))
2630 ANYOF_BITMAP_SET(data->start_class, value);
2635 if (flags & SCF_DO_STCLASS_OR)
2636 cl_and(data->start_class, &and_with);
2637 flags &= ~SCF_DO_STCLASS;
2640 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2641 data->flags |= (OP(scan) == MEOL
2645 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2646 /* Lookbehind, or need to calculate parens/evals/stclass: */
2647 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2648 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2649 /* Lookahead/lookbehind */
2650 I32 deltanext, minnext, fake = 0;
2652 struct regnode_charclass_class intrnl;
2655 data_fake.flags = 0;
2657 data_fake.whilem_c = data->whilem_c;
2658 data_fake.last_closep = data->last_closep;
2661 data_fake.last_closep = &fake;
2662 if ( flags & SCF_DO_STCLASS && !scan->flags
2663 && OP(scan) == IFMATCH ) { /* Lookahead */
2664 cl_init(pRExC_state, &intrnl);
2665 data_fake.start_class = &intrnl;
2666 f |= SCF_DO_STCLASS_AND;
2668 if (flags & SCF_WHILEM_VISITED_POS)
2669 f |= SCF_WHILEM_VISITED_POS;
2670 next = regnext(scan);
2671 nscan = NEXTOPER(NEXTOPER(scan));
2672 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2675 vFAIL("Variable length lookbehind not implemented");
2677 else if (minnext > U8_MAX) {
2678 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2680 scan->flags = (U8)minnext;
2682 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2684 if (data && (data_fake.flags & SF_HAS_EVAL))
2685 data->flags |= SF_HAS_EVAL;
2687 data->whilem_c = data_fake.whilem_c;
2688 if (f & SCF_DO_STCLASS_AND) {
2689 const int was = (data->start_class->flags & ANYOF_EOS);
2691 cl_and(data->start_class, &intrnl);
2693 data->start_class->flags |= ANYOF_EOS;
2696 else if (OP(scan) == OPEN) {
2699 else if (OP(scan) == CLOSE) {
2700 if ((I32)ARG(scan) == is_par) {
2701 next = regnext(scan);
2703 if ( next && (OP(next) != WHILEM) && next < last)
2704 is_par = 0; /* Disable optimization */
2707 *(data->last_closep) = ARG(scan);
2709 else if (OP(scan) == EVAL) {
2711 data->flags |= SF_HAS_EVAL;
2713 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2714 if (flags & SCF_DO_SUBSTR) {
2715 scan_commit(pRExC_state,data);
2716 data->longest = &(data->longest_float);
2718 is_inf = is_inf_internal = 1;
2719 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2720 cl_anything(pRExC_state, data->start_class);
2721 flags &= ~SCF_DO_STCLASS;
2723 /* Else: zero-length, ignore. */
2724 scan = regnext(scan);
2729 *deltap = is_inf_internal ? I32_MAX : delta;
2730 if (flags & SCF_DO_SUBSTR && is_inf)
2731 data->pos_delta = I32_MAX - data->pos_min;
2732 if (is_par > U8_MAX)
2734 if (is_par && pars==1 && data) {
2735 data->flags |= SF_IN_PAR;
2736 data->flags &= ~SF_HAS_PAR;
2738 else if (pars && data) {
2739 data->flags |= SF_HAS_PAR;
2740 data->flags &= ~SF_IN_PAR;
2742 if (flags & SCF_DO_STCLASS_OR)
2743 cl_and(data->start_class, &and_with);
2748 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2750 if (RExC_rx->data) {
2751 Renewc(RExC_rx->data,
2752 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2753 char, struct reg_data);
2754 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2755 RExC_rx->data->count += n;
2758 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2759 char, struct reg_data);
2760 Newx(RExC_rx->data->what, n, U8);
2761 RExC_rx->data->count = n;
2763 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2764 return RExC_rx->data->count - n;
2768 Perl_reginitcolors(pTHX)
2770 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2772 char *t = savepv(s);
2776 t = strchr(t, '\t');
2782 PL_colors[i] = t = (char *)"";
2787 PL_colors[i++] = (char *)"";
2794 - pregcomp - compile a regular expression into internal code
2796 * We can't allocate space until we know how big the compiled form will be,
2797 * but we can't compile it (and thus know how big it is) until we've got a
2798 * place to put the code. So we cheat: we compile it twice, once with code
2799 * generation turned off and size counting turned on, and once "for real".
2800 * This also means that we don't allocate space until we are sure that the
2801 * thing really will compile successfully, and we never have to move the
2802 * code and thus invalidate pointers into it. (Note that it has to be in
2803 * one piece because free() must be able to free it all.) [NB: not true in perl]
2805 * Beware that the optimization-preparation code in here knows about some
2806 * of the structure of the compiled regexp. [I'll say.]
2809 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2819 RExC_state_t RExC_state;
2820 RExC_state_t *pRExC_state = &RExC_state;
2822 GET_RE_DEBUG_FLAGS_DECL;
2825 FAIL("NULL regexp argument");
2827 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2830 DEBUG_r(if (!PL_colorset) reginitcolors());
2832 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2833 PL_colors[4],PL_colors[5],PL_colors[0],
2834 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2836 RExC_flags = pm->op_pmflags;
2840 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2841 RExC_seen_evals = 0;
2844 /* First pass: determine size, legality. */
2851 RExC_emit = &PL_regdummy;
2852 RExC_whilem_seen = 0;
2853 #if 0 /* REGC() is (currently) a NOP at the first pass.
2854 * Clever compilers notice this and complain. --jhi */
2855 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2857 if (reg(pRExC_state, 0, &flags) == NULL) {
2858 RExC_precomp = NULL;
2861 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2863 /* Small enough for pointer-storage convention?
2864 If extralen==0, this means that we will not need long jumps. */
2865 if (RExC_size >= 0x10000L && RExC_extralen)
2866 RExC_size += RExC_extralen;
2869 if (RExC_whilem_seen > 15)
2870 RExC_whilem_seen = 15;
2872 /* Allocate space and initialize. */
2873 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2876 FAIL("Regexp out of space");
2879 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2880 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2883 r->prelen = xend - exp;
2884 r->precomp = savepvn(RExC_precomp, r->prelen);
2886 #ifdef PERL_OLD_COPY_ON_WRITE
2887 r->saved_copy = NULL;
2889 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2890 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2892 r->substrs = 0; /* Useful during FAIL. */
2893 r->startp = 0; /* Useful during FAIL. */
2894 r->endp = 0; /* Useful during FAIL. */
2896 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2898 r->offsets[0] = RExC_size;
2900 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2901 "%s %"UVuf" bytes for offset annotations.\n",
2902 r->offsets ? "Got" : "Couldn't get",
2903 (UV)((2*RExC_size+1) * sizeof(U32))));
2907 /* Second pass: emit code. */
2908 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2913 RExC_emit_start = r->program;
2914 RExC_emit = r->program;
2915 /* Store the count of eval-groups for security checks: */
2916 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2917 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2919 if (reg(pRExC_state, 0, &flags) == NULL)
2923 /* Dig out information for optimizations. */
2924 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2925 pm->op_pmflags = RExC_flags;
2927 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2928 r->regstclass = NULL;
2929 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2930 r->reganch |= ROPT_NAUGHTY;
2931 scan = r->program + 1; /* First BRANCH. */
2933 /* XXXX To minimize changes to RE engine we always allocate
2934 3-units-long substrs field. */
2935 Newxz(r->substrs, 1, struct reg_substr_data);
2937 StructCopy(&zero_scan_data, &data, scan_data_t);
2938 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2939 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2941 STRLEN longest_float_length, longest_fixed_length;
2942 struct regnode_charclass_class ch_class;
2947 /* Skip introductions and multiplicators >= 1. */
2948 while ((OP(first) == OPEN && (sawopen = 1)) ||
2949 /* An OR of *one* alternative - should not happen now. */
2950 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2951 (OP(first) == PLUS) ||
2952 (OP(first) == MINMOD) ||
2953 /* An {n,m} with n>0 */
2954 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2955 if (OP(first) == PLUS)
2958 first += regarglen[(U8)OP(first)];
2959 first = NEXTOPER(first);
2962 /* Starting-point info. */
2964 if (PL_regkind[(U8)OP(first)] == EXACT) {
2965 if (OP(first) == EXACT)
2966 ; /* Empty, get anchored substr later. */
2967 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2968 r->regstclass = first;
2970 else if (strchr((const char*)PL_simple,OP(first)))
2971 r->regstclass = first;
2972 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2973 PL_regkind[(U8)OP(first)] == NBOUND)
2974 r->regstclass = first;
2975 else if (PL_regkind[(U8)OP(first)] == BOL) {
2976 r->reganch |= (OP(first) == MBOL
2978 : (OP(first) == SBOL
2981 first = NEXTOPER(first);
2984 else if (OP(first) == GPOS) {
2985 r->reganch |= ROPT_ANCH_GPOS;
2986 first = NEXTOPER(first);
2989 else if (!sawopen && (OP(first) == STAR &&
2990 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2991 !(r->reganch & ROPT_ANCH) )
2993 /* turn .* into ^.* with an implied $*=1 */
2995 (OP(NEXTOPER(first)) == REG_ANY)
2998 r->reganch |= type | ROPT_IMPLICIT;
2999 first = NEXTOPER(first);
3002 if (sawplus && (!sawopen || !RExC_sawback)
3003 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3004 /* x+ must match at the 1st pos of run of x's */
3005 r->reganch |= ROPT_SKIP;
3007 /* Scan is after the zeroth branch, first is atomic matcher. */
3008 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3009 (IV)(first - scan + 1)));
3011 * If there's something expensive in the r.e., find the
3012 * longest literal string that must appear and make it the
3013 * regmust. Resolve ties in favor of later strings, since
3014 * the regstart check works with the beginning of the r.e.
3015 * and avoiding duplication strengthens checking. Not a
3016 * strong reason, but sufficient in the absence of others.
3017 * [Now we resolve ties in favor of the earlier string if
3018 * it happens that c_offset_min has been invalidated, since the
3019 * earlier string may buy us something the later one won't.]
3023 data.longest_fixed = newSVpvn("",0);
3024 data.longest_float = newSVpvn("",0);
3025 data.last_found = newSVpvn("",0);
3026 data.longest = &(data.longest_fixed);
3028 if (!r->regstclass) {
3029 cl_init(pRExC_state, &ch_class);
3030 data.start_class = &ch_class;
3031 stclass_flag = SCF_DO_STCLASS_AND;
3032 } else /* XXXX Check for BOUND? */
3034 data.last_closep = &last_close;
3036 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3037 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3038 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3039 && data.last_start_min == 0 && data.last_end > 0
3040 && !RExC_seen_zerolen
3041 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3042 r->reganch |= ROPT_CHECK_ALL;
3043 scan_commit(pRExC_state, &data);
3044 SvREFCNT_dec(data.last_found);
3046 longest_float_length = CHR_SVLEN(data.longest_float);
3047 if (longest_float_length
3048 || (data.flags & SF_FL_BEFORE_EOL
3049 && (!(data.flags & SF_FL_BEFORE_MEOL)
3050 || (RExC_flags & PMf_MULTILINE)))) {
3053 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3054 && data.offset_fixed == data.offset_float_min
3055 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3056 goto remove_float; /* As in (a)+. */
3058 if (SvUTF8(data.longest_float)) {
3059 r->float_utf8 = data.longest_float;
3060 r->float_substr = NULL;
3062 r->float_substr = data.longest_float;
3063 r->float_utf8 = NULL;
3065 r->float_min_offset = data.offset_float_min;
3066 r->float_max_offset = data.offset_float_max;
3067 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3068 && (!(data.flags & SF_FL_BEFORE_MEOL)
3069 || (RExC_flags & PMf_MULTILINE)));
3070 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3074 r->float_substr = r->float_utf8 = NULL;
3075 SvREFCNT_dec(data.longest_float);
3076 longest_float_length = 0;
3079 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3080 if (longest_fixed_length
3081 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3082 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3083 || (RExC_flags & PMf_MULTILINE)))) {
3086 if (SvUTF8(data.longest_fixed)) {
3087 r->anchored_utf8 = data.longest_fixed;
3088 r->anchored_substr = NULL;
3090 r->anchored_substr = data.longest_fixed;
3091 r->anchored_utf8 = NULL;
3093 r->anchored_offset = data.offset_fixed;
3094 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3095 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3096 || (RExC_flags & PMf_MULTILINE)));
3097 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3100 r->anchored_substr = r->anchored_utf8 = NULL;
3101 SvREFCNT_dec(data.longest_fixed);
3102 longest_fixed_length = 0;
3105 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3106 r->regstclass = NULL;
3107 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3109 && !(data.start_class->flags & ANYOF_EOS)
3110 && !cl_is_anything(data.start_class))
3112 const I32 n = add_data(pRExC_state, 1, "f");
3114 Newx(RExC_rx->data->data[n], 1,
3115 struct regnode_charclass_class);
3116 StructCopy(data.start_class,
3117 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3118 struct regnode_charclass_class);
3119 r->regstclass = (regnode*)RExC_rx->data->data[n];
3120 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3121 PL_regdata = r->data; /* for regprop() */
3122 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3123 regprop(sv, (regnode*)data.start_class);
3124 PerlIO_printf(Perl_debug_log,
3125 "synthetic stclass \"%s\".\n",
3126 SvPVX_const(sv));});
3129 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3130 if (longest_fixed_length > longest_float_length) {
3131 r->check_substr = r->anchored_substr;
3132 r->check_utf8 = r->anchored_utf8;
3133 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3134 if (r->reganch & ROPT_ANCH_SINGLE)
3135 r->reganch |= ROPT_NOSCAN;
3138 r->check_substr = r->float_substr;
3139 r->check_utf8 = r->float_utf8;
3140 r->check_offset_min = data.offset_float_min;
3141 r->check_offset_max = data.offset_float_max;
3143 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3144 This should be changed ASAP! */
3145 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3146 r->reganch |= RE_USE_INTUIT;
3147 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3148 r->reganch |= RE_INTUIT_TAIL;
3152 /* Several toplevels. Best we can is to set minlen. */
3154 struct regnode_charclass_class ch_class;
3157 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3158 scan = r->program + 1;
3159 cl_init(pRExC_state, &ch_class);
3160 data.start_class = &ch_class;
3161 data.last_closep = &last_close;
3162 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3163 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3164 = r->float_substr = r->float_utf8 = NULL;
3165 if (!(data.start_class->flags & ANYOF_EOS)
3166 && !cl_is_anything(data.start_class))
3168 const I32 n = add_data(pRExC_state, 1, "f");
3170 Newx(RExC_rx->data->data[n], 1,
3171 struct regnode_charclass_class);
3172 StructCopy(data.start_class,
3173 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3174 struct regnode_charclass_class);
3175 r->regstclass = (regnode*)RExC_rx->data->data[n];
3176 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3177 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3178 regprop(sv, (regnode*)data.start_class);
3179 PerlIO_printf(Perl_debug_log,
3180 "synthetic stclass \"%s\".\n",
3181 SvPVX_const(sv));});
3186 if (RExC_seen & REG_SEEN_GPOS)
3187 r->reganch |= ROPT_GPOS_SEEN;
3188 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3189 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3190 if (RExC_seen & REG_SEEN_EVAL)
3191 r->reganch |= ROPT_EVAL_SEEN;
3192 if (RExC_seen & REG_SEEN_CANY)
3193 r->reganch |= ROPT_CANY_SEEN;
3194 Newxz(r->startp, RExC_npar, I32);
3195 Newxz(r->endp, RExC_npar, I32);
3196 PL_regdata = r->data; /* for regprop() */
3197 DEBUG_COMPILE_r(regdump(r));
3202 - reg - regular expression, i.e. main body or parenthesized thing
3204 * Caller must absorb opening parenthesis.
3206 * Combining parenthesis handling with the base level of regular expression
3207 * is a trifle forced, but the need to tie the tails of the branches to what
3208 * follows makes it hard to avoid.
3211 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3212 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3215 register regnode *ret; /* Will be the head of the group. */
3216 register regnode *br;
3217 register regnode *lastbr;
3218 register regnode *ender = 0;
3219 register I32 parno = 0;
3220 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3222 /* for (?g), (?gc), and (?o) warnings; warning
3223 about (?c) will warn about (?g) -- japhy */
3225 I32 wastedflags = 0x00,
3228 wasted_gc = 0x02 | 0x04,
3231 char * parse_start = RExC_parse; /* MJD */
3232 char * const oregcomp_parse = RExC_parse;
3235 *flagp = 0; /* Tentatively. */
3238 /* Make an OPEN node, if parenthesized. */
3240 if (*RExC_parse == '?') { /* (?...) */
3241 U32 posflags = 0, negflags = 0;
3242 U32 *flagsp = &posflags;
3244 const char * const seqstart = RExC_parse;
3247 paren = *RExC_parse++;
3248 ret = NULL; /* For look-ahead/behind. */
3250 case '<': /* (?<...) */
3251 RExC_seen |= REG_SEEN_LOOKBEHIND;
3252 if (*RExC_parse == '!')
3254 if (*RExC_parse != '=' && *RExC_parse != '!')
3257 case '=': /* (?=...) */
3258 case '!': /* (?!...) */
3259 RExC_seen_zerolen++;
3260 case ':': /* (?:...) */
3261 case '>': /* (?>...) */
3263 case '$': /* (?$...) */
3264 case '@': /* (?@...) */
3265 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3267 case '#': /* (?#...) */
3268 while (*RExC_parse && *RExC_parse != ')')
3270 if (*RExC_parse != ')')
3271 FAIL("Sequence (?#... not terminated");
3272 nextchar(pRExC_state);
3275 case 'p': /* (?p...) */
3276 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3277 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3279 case '?': /* (??...) */
3281 if (*RExC_parse != '{')
3283 paren = *RExC_parse++;
3285 case '{': /* (?{...}) */
3287 I32 count = 1, n = 0;
3289 char *s = RExC_parse;
3291 OP_4tree *sop, *rop;
3293 RExC_seen_zerolen++;
3294 RExC_seen |= REG_SEEN_EVAL;
3295 while (count && (c = *RExC_parse)) {
3296 if (c == '\\' && RExC_parse[1])
3304 if (*RExC_parse != ')')
3307 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3312 if (RExC_parse - 1 - s)
3313 sv = newSVpvn(s, RExC_parse - 1 - s);
3315 sv = newSVpvn("", 0);
3318 Perl_save_re_context(aTHX);
3319 rop = sv_compile_2op(sv, &sop, "re", &pad);
3320 sop->op_private |= OPpREFCOUNTED;
3321 /* re_dup will OpREFCNT_inc */
3322 OpREFCNT_set(sop, 1);
3325 n = add_data(pRExC_state, 3, "nop");
3326 RExC_rx->data->data[n] = (void*)rop;
3327 RExC_rx->data->data[n+1] = (void*)sop;
3328 RExC_rx->data->data[n+2] = (void*)pad;
3331 else { /* First pass */
3332 if (PL_reginterp_cnt < ++RExC_seen_evals
3334 /* No compiled RE interpolated, has runtime
3335 components ===> unsafe. */
3336 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3337 if (PL_tainting && PL_tainted)
3338 FAIL("Eval-group in insecure regular expression");
3339 if (IN_PERL_COMPILETIME)
3343 nextchar(pRExC_state);
3345 ret = reg_node(pRExC_state, LOGICAL);
3348 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3349 /* deal with the length of this later - MJD */
3352 ret = reganode(pRExC_state, EVAL, n);
3353 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3354 Set_Node_Offset(ret, parse_start);
3357 case '(': /* (?(?{...})...) and (?(?=...)...) */
3359 if (RExC_parse[0] == '?') { /* (?(?...)) */
3360 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3361 || RExC_parse[1] == '<'
3362 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3365 ret = reg_node(pRExC_state, LOGICAL);
3368 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3372 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3374 parno = atoi(RExC_parse++);
3376 while (isDIGIT(*RExC_parse))
3378 ret = reganode(pRExC_state, GROUPP, parno);
3380 if ((c = *nextchar(pRExC_state)) != ')')
3381 vFAIL("Switch condition not recognized");
3383 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3384 br = regbranch(pRExC_state, &flags, 1);
3386 br = reganode(pRExC_state, LONGJMP, 0);
3388 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3389 c = *nextchar(pRExC_state);
3393 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3394 regbranch(pRExC_state, &flags, 1);
3395 regtail(pRExC_state, ret, lastbr);
3398 c = *nextchar(pRExC_state);
3403 vFAIL("Switch (?(condition)... contains too many branches");
3404 ender = reg_node(pRExC_state, TAIL);
3405 regtail(pRExC_state, br, ender);
3407 regtail(pRExC_state, lastbr, ender);
3408 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3411 regtail(pRExC_state, ret, ender);
3415 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3419 RExC_parse--; /* for vFAIL to print correctly */
3420 vFAIL("Sequence (? incomplete");
3424 parse_flags: /* (?i) */
3425 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3426 /* (?g), (?gc) and (?o) are useless here
3427 and must be globally applied -- japhy */
3429 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3430 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3431 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3432 if (! (wastedflags & wflagbit) ) {
3433 wastedflags |= wflagbit;
3436 "Useless (%s%c) - %suse /%c modifier",
3437 flagsp == &negflags ? "?-" : "?",
3439 flagsp == &negflags ? "don't " : "",
3445 else if (*RExC_parse == 'c') {
3446 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3447 if (! (wastedflags & wasted_c) ) {
3448 wastedflags |= wasted_gc;
3451 "Useless (%sc) - %suse /gc modifier",
3452 flagsp == &negflags ? "?-" : "?",
3453 flagsp == &negflags ? "don't " : ""
3458 else { pmflag(flagsp, *RExC_parse); }
3462 if (*RExC_parse == '-') {
3464 wastedflags = 0; /* reset so (?g-c) warns twice */
3468 RExC_flags |= posflags;
3469 RExC_flags &= ~negflags;
3470 if (*RExC_parse == ':') {
3476 if (*RExC_parse != ')') {
3478 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3480 nextchar(pRExC_state);
3488 ret = reganode(pRExC_state, OPEN, parno);
3489 Set_Node_Length(ret, 1); /* MJD */
3490 Set_Node_Offset(ret, RExC_parse); /* MJD */
3497 /* Pick up the branches, linking them together. */
3498 parse_start = RExC_parse; /* MJD */
3499 br = regbranch(pRExC_state, &flags, 1);
3500 /* branch_len = (paren != 0); */
3504 if (*RExC_parse == '|') {
3505 if (!SIZE_ONLY && RExC_extralen) {
3506 reginsert(pRExC_state, BRANCHJ, br);
3509 reginsert(pRExC_state, BRANCH, br);
3510 Set_Node_Length(br, paren != 0);
3511 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3515 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3517 else if (paren == ':') {
3518 *flagp |= flags&SIMPLE;
3520 if (open) { /* Starts with OPEN. */
3521 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3523 else if (paren != '?') /* Not Conditional */
3525 *flagp |= flags & (SPSTART | HASWIDTH);
3527 while (*RExC_parse == '|') {
3528 if (!SIZE_ONLY && RExC_extralen) {
3529 ender = reganode(pRExC_state, LONGJMP,0);
3530 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3533 RExC_extralen += 2; /* Account for LONGJMP. */
3534 nextchar(pRExC_state);
3535 br = regbranch(pRExC_state, &flags, 0);
3539 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3543 *flagp |= flags&SPSTART;
3546 if (have_branch || paren != ':') {
3547 /* Make a closing node, and hook it on the end. */
3550 ender = reg_node(pRExC_state, TAIL);
3553 ender = reganode(pRExC_state, CLOSE, parno);
3554 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3555 Set_Node_Length(ender,1); /* MJD */
3561 *flagp &= ~HASWIDTH;
3564 ender = reg_node(pRExC_state, SUCCEED);
3567 ender = reg_node(pRExC_state, END);
3570 regtail(pRExC_state, lastbr, ender);
3573 /* Hook the tails of the branches to the closing node. */
3574 for (br = ret; br != NULL; br = regnext(br)) {
3575 regoptail(pRExC_state, br, ender);
3582 static const char parens[] = "=!<,>";
3584 if (paren && (p = strchr(parens, paren))) {
3585 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3586 int flag = (p - parens) > 1;
3589 node = SUSPEND, flag = 0;
3590 reginsert(pRExC_state, node,ret);
3591 Set_Node_Cur_Length(ret);
3592 Set_Node_Offset(ret, parse_start + 1);
3594 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3598 /* Check for proper termination. */
3600 RExC_flags = oregflags;
3601 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3602 RExC_parse = oregcomp_parse;
3603 vFAIL("Unmatched (");
3606 else if (!paren && RExC_parse < RExC_end) {
3607 if (*RExC_parse == ')') {
3609 vFAIL("Unmatched )");
3612 FAIL("Junk on end of regexp"); /* "Can't happen". */
3620 - regbranch - one alternative of an | operator
3622 * Implements the concatenation operator.
3625 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3627 register regnode *ret;
3628 register regnode *chain = NULL;
3629 register regnode *latest;
3630 I32 flags = 0, c = 0;
3635 if (!SIZE_ONLY && RExC_extralen)
3636 ret = reganode(pRExC_state, BRANCHJ,0);
3638 ret = reg_node(pRExC_state, BRANCH);
3639 Set_Node_Length(ret, 1);
3643 if (!first && SIZE_ONLY)
3644 RExC_extralen += 1; /* BRANCHJ */
3646 *flagp = WORST; /* Tentatively. */
3649 nextchar(pRExC_state);
3650 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3652 latest = regpiece(pRExC_state, &flags);
3653 if (latest == NULL) {
3654 if (flags & TRYAGAIN)
3658 else if (ret == NULL)
3660 *flagp |= flags&HASWIDTH;
3661 if (chain == NULL) /* First piece. */
3662 *flagp |= flags&SPSTART;
3665 regtail(pRExC_state, chain, latest);
3670 if (chain == NULL) { /* Loop ran zero times. */
3671 chain = reg_node(pRExC_state, NOTHING);
3676 *flagp |= flags&SIMPLE;
3683 - regpiece - something followed by possible [*+?]
3685 * Note that the branching code sequences used for ? and the general cases
3686 * of * and + are somewhat optimized: they use the same NOTHING node as
3687 * both the endmarker for their branch list and the body of the last branch.
3688 * It might seem that this node could be dispensed with entirely, but the
3689 * endmarker role is not redundant.
3692 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3694 register regnode *ret;
3696 register char *next;
3698 const char * const origparse = RExC_parse;
3701 I32 max = REG_INFTY;
3704 ret = regatom(pRExC_state, &flags);
3706 if (flags & TRYAGAIN)
3713 if (op == '{' && regcurly(RExC_parse)) {
3714 parse_start = RExC_parse; /* MJD */
3715 next = RExC_parse + 1;
3717 while (isDIGIT(*next) || *next == ',') {
3726 if (*next == '}') { /* got one */
3730 min = atoi(RExC_parse);
3734 maxpos = RExC_parse;
3736 if (!max && *maxpos != '0')
3737 max = REG_INFTY; /* meaning "infinity" */
3738 else if (max >= REG_INFTY)
3739 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3741 nextchar(pRExC_state);
3744 if ((flags&SIMPLE)) {
3745 RExC_naughty += 2 + RExC_naughty / 2;
3746 reginsert(pRExC_state, CURLY, ret);
3747 Set_Node_Offset(ret, parse_start+1); /* MJD */
3748 Set_Node_Cur_Length(ret);
3751 regnode *w = reg_node(pRExC_state, WHILEM);
3754 regtail(pRExC_state, ret, w);
3755 if (!SIZE_ONLY && RExC_extralen) {
3756 reginsert(pRExC_state, LONGJMP,ret);
3757 reginsert(pRExC_state, NOTHING,ret);
3758 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3760 reginsert(pRExC_state, CURLYX,ret);
3762 Set_Node_Offset(ret, parse_start+1);
3763 Set_Node_Length(ret,
3764 op == '{' ? (RExC_parse - parse_start) : 1);
3766 if (!SIZE_ONLY && RExC_extralen)
3767 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3768 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3770 RExC_whilem_seen++, RExC_extralen += 3;
3771 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3779 if (max && max < min)
3780 vFAIL("Can't do {n,m} with n > m");
3782 ARG1_SET(ret, (U16)min);
3783 ARG2_SET(ret, (U16)max);
3795 #if 0 /* Now runtime fix should be reliable. */
3797 /* if this is reinstated, don't forget to put this back into perldiag:
3799 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3801 (F) The part of the regexp subject to either the * or + quantifier
3802 could match an empty string. The {#} shows in the regular
3803 expression about where the problem was discovered.
3807 if (!(flags&HASWIDTH) && op != '?')
3808 vFAIL("Regexp *+ operand could be empty");
3811 parse_start = RExC_parse;
3812 nextchar(pRExC_state);
3814 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3816 if (op == '*' && (flags&SIMPLE)) {
3817 reginsert(pRExC_state, STAR, ret);
3821 else if (op == '*') {
3825 else if (op == '+' && (flags&SIMPLE)) {
3826 reginsert(pRExC_state, PLUS, ret);
3830 else if (op == '+') {
3834 else if (op == '?') {
3839 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3841 "%.*s matches null string many times",
3842 RExC_parse - origparse,
3846 if (*RExC_parse == '?') {
3847 nextchar(pRExC_state);
3848 reginsert(pRExC_state, MINMOD, ret);
3849 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3851 if (ISMULT2(RExC_parse)) {
3853 vFAIL("Nested quantifiers");
3860 - regatom - the lowest level
3862 * Optimization: gobbles an entire sequence of ordinary characters so that
3863 * it can turn them into a single node, which is smaller to store and
3864 * faster to run. Backslashed characters are exceptions, each becoming a
3865 * separate node; the code is simpler that way and it's not worth fixing.
3867 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3869 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3871 register regnode *ret = 0;
3873 char *parse_start = RExC_parse;
3875 *flagp = WORST; /* Tentatively. */
3878 switch (*RExC_parse) {
3880 RExC_seen_zerolen++;
3881 nextchar(pRExC_state);
3882 if (RExC_flags & PMf_MULTILINE)
3883 ret = reg_node(pRExC_state, MBOL);
3884 else if (RExC_flags & PMf_SINGLELINE)
3885 ret = reg_node(pRExC_state, SBOL);
3887 ret = reg_node(pRExC_state, BOL);
3888 Set_Node_Length(ret, 1); /* MJD */
3891 nextchar(pRExC_state);
3893 RExC_seen_zerolen++;
3894 if (RExC_flags & PMf_MULTILINE)
3895 ret = reg_node(pRExC_state, MEOL);
3896 else if (RExC_flags & PMf_SINGLELINE)
3897 ret = reg_node(pRExC_state, SEOL);
3899 ret = reg_node(pRExC_state, EOL);
3900 Set_Node_Length(ret, 1); /* MJD */
3903 nextchar(pRExC_state);
3904 if (RExC_flags & PMf_SINGLELINE)
3905 ret = reg_node(pRExC_state, SANY);
3907 ret = reg_node(pRExC_state, REG_ANY);
3908 *flagp |= HASWIDTH|SIMPLE;
3910 Set_Node_Length(ret, 1); /* MJD */
3914 char *oregcomp_parse = ++RExC_parse;
3915 ret = regclass(pRExC_state);
3916 if (*RExC_parse != ']') {
3917 RExC_parse = oregcomp_parse;
3918 vFAIL("Unmatched [");
3920 nextchar(pRExC_state);
3921 *flagp |= HASWIDTH|SIMPLE;
3922 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3926 nextchar(pRExC_state);
3927 ret = reg(pRExC_state, 1, &flags);
3929 if (flags & TRYAGAIN) {
3930 if (RExC_parse == RExC_end) {
3931 /* Make parent create an empty node if needed. */
3939 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3943 if (flags & TRYAGAIN) {
3947 vFAIL("Internal urp");
3948 /* Supposed to be caught earlier. */
3951 if (!regcurly(RExC_parse)) {
3960 vFAIL("Quantifier follows nothing");
3963 switch (*++RExC_parse) {
3965 RExC_seen_zerolen++;
3966 ret = reg_node(pRExC_state, SBOL);
3968 nextchar(pRExC_state);
3969 Set_Node_Length(ret, 2); /* MJD */
3972 ret = reg_node(pRExC_state, GPOS);
3973 RExC_seen |= REG_SEEN_GPOS;
3975 nextchar(pRExC_state);
3976 Set_Node_Length(ret, 2); /* MJD */
3979 ret = reg_node(pRExC_state, SEOL);
3981 RExC_seen_zerolen++; /* Do not optimize RE away */
3982 nextchar(pRExC_state);
3985 ret = reg_node(pRExC_state, EOS);
3987 RExC_seen_zerolen++; /* Do not optimize RE away */
3988 nextchar(pRExC_state);
3989 Set_Node_Length(ret, 2); /* MJD */
3992 ret = reg_node(pRExC_state, CANY);
3993 RExC_seen |= REG_SEEN_CANY;
3994 *flagp |= HASWIDTH|SIMPLE;
3995 nextchar(pRExC_state);
3996 Set_Node_Length(ret, 2); /* MJD */
3999 ret = reg_node(pRExC_state, CLUMP);
4001 nextchar(pRExC_state);
4002 Set_Node_Length(ret, 2); /* MJD */
4005 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4006 *flagp |= HASWIDTH|SIMPLE;
4007 nextchar(pRExC_state);
4008 Set_Node_Length(ret, 2); /* MJD */
4011 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4012 *flagp |= HASWIDTH|SIMPLE;
4013 nextchar(pRExC_state);
4014 Set_Node_Length(ret, 2); /* MJD */
4017 RExC_seen_zerolen++;
4018 RExC_seen |= REG_SEEN_LOOKBEHIND;
4019 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4021 nextchar(pRExC_state);
4022 Set_Node_Length(ret, 2); /* MJD */
4025 RExC_seen_zerolen++;
4026 RExC_seen |= REG_SEEN_LOOKBEHIND;
4027 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4029 nextchar(pRExC_state);
4030 Set_Node_Length(ret, 2); /* MJD */
4033 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4034 *flagp |= HASWIDTH|SIMPLE;
4035 nextchar(pRExC_state);
4036 Set_Node_Length(ret, 2); /* MJD */
4039 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4040 *flagp |= HASWIDTH|SIMPLE;
4041 nextchar(pRExC_state);
4042 Set_Node_Length(ret, 2); /* MJD */
4045 ret = reg_node(pRExC_state, DIGIT);
4046 *flagp |= HASWIDTH|SIMPLE;
4047 nextchar(pRExC_state);
4048 Set_Node_Length(ret, 2); /* MJD */
4051 ret = reg_node(pRExC_state, NDIGIT);
4052 *flagp |= HASWIDTH|SIMPLE;
4053 nextchar(pRExC_state);
4054 Set_Node_Length(ret, 2); /* MJD */
4059 char* oldregxend = RExC_end;
4060 char* parse_start = RExC_parse - 2;
4062 if (RExC_parse[1] == '{') {
4063 /* a lovely hack--pretend we saw [\pX] instead */
4064 RExC_end = strchr(RExC_parse, '}');
4066 U8 c = (U8)*RExC_parse;
4068 RExC_end = oldregxend;
4069 vFAIL2("Missing right brace on \\%c{}", c);
4074 RExC_end = RExC_parse + 2;
4075 if (RExC_end > oldregxend)
4076 RExC_end = oldregxend;
4080 ret = regclass(pRExC_state);
4082 RExC_end = oldregxend;
4085 Set_Node_Offset(ret, parse_start + 2);
4086 Set_Node_Cur_Length(ret);
4087 nextchar(pRExC_state);
4088 *flagp |= HASWIDTH|SIMPLE;
4101 case '1': case '2': case '3': case '4':
4102 case '5': case '6': case '7': case '8': case '9':
4104 const I32 num = atoi(RExC_parse);
4106 if (num > 9 && num >= RExC_npar)
4109 char * parse_start = RExC_parse - 1; /* MJD */
4110 while (isDIGIT(*RExC_parse))
4113 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4114 vFAIL("Reference to nonexistent group");
4116 ret = reganode(pRExC_state,
4117 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4121 /* override incorrect value set in reganode MJD */
4122 Set_Node_Offset(ret, parse_start+1);
4123 Set_Node_Cur_Length(ret); /* MJD */
4125 nextchar(pRExC_state);
4130 if (RExC_parse >= RExC_end)
4131 FAIL("Trailing \\");
4134 /* Do not generate "unrecognized" warnings here, we fall
4135 back into the quick-grab loop below */
4142 if (RExC_flags & PMf_EXTENDED) {
4143 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4144 if (RExC_parse < RExC_end)
4150 register STRLEN len;
4155 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4157 parse_start = RExC_parse - 1;
4163 ret = reg_node(pRExC_state,
4164 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4166 for (len = 0, p = RExC_parse - 1;
4167 len < 127 && p < RExC_end;
4172 if (RExC_flags & PMf_EXTENDED)
4173 p = regwhite(p, RExC_end);
4220 ender = ASCII_TO_NATIVE('\033');
4224 ender = ASCII_TO_NATIVE('\007');
4229 char* const e = strchr(p, '}');
4233 vFAIL("Missing right brace on \\x{}");
4236 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4237 | PERL_SCAN_DISALLOW_PREFIX;
4238 STRLEN numlen = e - p - 1;
4239 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4246 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4248 ender = grok_hex(p, &numlen, &flags, NULL);
4254 ender = UCHARAT(p++);
4255 ender = toCTRL(ender);
4257 case '0': case '1': case '2': case '3':case '4':
4258 case '5': case '6': case '7': case '8':case '9':
4260 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4263 ender = grok_oct(p, &numlen, &flags, NULL);
4273 FAIL("Trailing \\");
4276 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4277 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4278 goto normal_default;
4283 if (UTF8_IS_START(*p) && UTF) {
4285 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4293 if (RExC_flags & PMf_EXTENDED)
4294 p = regwhite(p, RExC_end);
4296 /* Prime the casefolded buffer. */
4297 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4299 if (ISMULT2(p)) { /* Back off on ?+*. */
4306 /* Emit all the Unicode characters. */
4308 for (foldbuf = tmpbuf;
4310 foldlen -= numlen) {
4311 ender = utf8_to_uvchr(foldbuf, &numlen);
4313 reguni(pRExC_state, ender, s, &unilen);
4316 /* In EBCDIC the numlen
4317 * and unilen can differ. */
4319 if (numlen >= foldlen)
4323 break; /* "Can't happen." */
4327 reguni(pRExC_state, ender, s, &unilen);
4336 REGC((char)ender, s++);
4344 /* Emit all the Unicode characters. */
4346 for (foldbuf = tmpbuf;
4348 foldlen -= numlen) {
4349 ender = utf8_to_uvchr(foldbuf, &numlen);
4351 reguni(pRExC_state, ender, s, &unilen);
4354 /* In EBCDIC the numlen
4355 * and unilen can differ. */
4357 if (numlen >= foldlen)
4365 reguni(pRExC_state, ender, s, &unilen);
4374 REGC((char)ender, s++);
4378 Set_Node_Cur_Length(ret); /* MJD */
4379 nextchar(pRExC_state);
4381 /* len is STRLEN which is unsigned, need to copy to signed */
4384 vFAIL("Internal disaster");
4388 if (len == 1 && UNI_IS_INVARIANT(ender))
4393 RExC_size += STR_SZ(len);
4395 RExC_emit += STR_SZ(len);
4400 /* If the encoding pragma is in effect recode the text of
4401 * any EXACT-kind nodes. */
4402 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4403 STRLEN oldlen = STR_LEN(ret);
4404 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4408 if (sv_utf8_downgrade(sv, TRUE)) {
4409 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4410 const STRLEN newlen = SvCUR(sv);
4415 GET_RE_DEBUG_FLAGS_DECL;
4416 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4417 (int)oldlen, STRING(ret),
4419 Copy(s, STRING(ret), newlen, char);
4420 STR_LEN(ret) += newlen - oldlen;
4421 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4423 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4431 S_regwhite(pTHX_ char *p, const char *e)
4436 else if (*p == '#') {
4439 } while (p < e && *p != '\n');
4447 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4448 Character classes ([:foo:]) can also be negated ([:^foo:]).
4449 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4450 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4451 but trigger failures because they are currently unimplemented. */
4453 #define POSIXCC_DONE(c) ((c) == ':')
4454 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4455 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4458 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4460 I32 namedclass = OOB_NAMEDCLASS;
4462 if (value == '[' && RExC_parse + 1 < RExC_end &&
4463 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4464 POSIXCC(UCHARAT(RExC_parse))) {
4465 const char c = UCHARAT(RExC_parse);
4466 char* s = RExC_parse++;
4468 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4470 if (RExC_parse == RExC_end)
4471 /* Grandfather lone [:, [=, [. */
4474 const char* t = RExC_parse++; /* skip over the c */
4475 const char *posixcc;
4479 if (UCHARAT(RExC_parse) == ']') {
4480 RExC_parse++; /* skip over the ending ] */
4483 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4484 const I32 skip = t - posixcc;
4486 /* Initially switch on the length of the name. */
4489 if (memEQ(posixcc, "word", 4)) {
4490 /* this is not POSIX, this is the Perl \w */;
4492 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4496 /* Names all of length 5. */
4497 /* alnum alpha ascii blank cntrl digit graph lower
4498 print punct space upper */
4499 /* Offset 4 gives the best switch position. */
4500 switch (posixcc[4]) {
4502 if (memEQ(posixcc, "alph", 4)) {
4505 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4509 if (memEQ(posixcc, "spac", 4)) {
4512 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4516 if (memEQ(posixcc, "grap", 4)) {
4519 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4523 if (memEQ(posixcc, "asci", 4)) {
4526 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4530 if (memEQ(posixcc, "blan", 4)) {
4533 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4537 if (memEQ(posixcc, "cntr", 4)) {
4540 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4544 if (memEQ(posixcc, "alnu", 4)) {
4547 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4551 if (memEQ(posixcc, "lowe", 4)) {
4554 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4556 if (memEQ(posixcc, "uppe", 4)) {
4559 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4563 if (memEQ(posixcc, "digi", 4)) {
4566 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4568 if (memEQ(posixcc, "prin", 4)) {
4571 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4573 if (memEQ(posixcc, "punc", 4)) {
4576 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4582 if (memEQ(posixcc, "xdigit", 6)) {
4584 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4589 if (namedclass == OOB_NAMEDCLASS)
4591 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4594 assert (posixcc[skip] == ':');
4595 assert (posixcc[skip+1] == ']');
4596 } else if (!SIZE_ONLY) {
4597 /* [[=foo=]] and [[.foo.]] are still future. */
4599 /* adjust RExC_parse so the warning shows after
4601 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4603 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4606 /* Maternal grandfather:
4607 * "[:" ending in ":" but not in ":]" */
4617 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4619 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4620 const char *s = RExC_parse;
4621 const char c = *s++;
4623 while(*s && isALNUM(*s))
4625 if (*s && c == *s && s[1] == ']') {
4626 if (ckWARN(WARN_REGEXP))
4628 "POSIX syntax [%c %c] belongs inside character classes",
4631 /* [[=foo=]] and [[.foo.]] are still future. */
4632 if (POSIXCC_NOTYET(c)) {
4633 /* adjust RExC_parse so the error shows after
4635 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4637 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4644 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4647 register UV nextvalue;
4648 register IV prevvalue = OOB_UNICODE;
4649 register IV range = 0;
4650 register regnode *ret;
4653 char *rangebegin = 0;
4654 bool need_class = 0;
4658 bool optimize_invert = TRUE;
4659 AV* unicode_alternate = 0;
4661 UV literal_endpoint = 0;
4664 ret = reganode(pRExC_state, ANYOF, 0);
4667 ANYOF_FLAGS(ret) = 0;
4669 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4673 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4677 RExC_size += ANYOF_SKIP;
4679 RExC_emit += ANYOF_SKIP;
4681 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4683 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4684 ANYOF_BITMAP_ZERO(ret);
4685 listsv = newSVpvn("# comment\n", 10);
4688 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4690 if (!SIZE_ONLY && POSIXCC(nextvalue))
4691 checkposixcc(pRExC_state);
4693 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4694 if (UCHARAT(RExC_parse) == ']')
4697 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4701 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4704 rangebegin = RExC_parse;
4706 value = utf8n_to_uvchr((U8*)RExC_parse,
4707 RExC_end - RExC_parse,
4709 RExC_parse += numlen;
4712 value = UCHARAT(RExC_parse++);
4713 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4714 if (value == '[' && POSIXCC(nextvalue))
4715 namedclass = regpposixcc(pRExC_state, value);
4716 else if (value == '\\') {
4718 value = utf8n_to_uvchr((U8*)RExC_parse,
4719 RExC_end - RExC_parse,
4721 RExC_parse += numlen;
4724 value = UCHARAT(RExC_parse++);
4725 /* Some compilers cannot handle switching on 64-bit integer
4726 * values, therefore value cannot be an UV. Yes, this will
4727 * be a problem later if we want switch on Unicode.
4728 * A similar issue a little bit later when switching on
4729 * namedclass. --jhi */
4730 switch ((I32)value) {
4731 case 'w': namedclass = ANYOF_ALNUM; break;
4732 case 'W': namedclass = ANYOF_NALNUM; break;
4733 case 's': namedclass = ANYOF_SPACE; break;
4734 case 'S': namedclass = ANYOF_NSPACE; break;
4735 case 'd': namedclass = ANYOF_DIGIT; break;
4736 case 'D': namedclass = ANYOF_NDIGIT; break;
4739 if (RExC_parse >= RExC_end)
4740 vFAIL2("Empty \\%c{}", (U8)value);
4741 if (*RExC_parse == '{') {
4742 const U8 c = (U8)value;
4743 e = strchr(RExC_parse++, '}');
4745 vFAIL2("Missing right brace on \\%c{}", c);
4746 while (isSPACE(UCHARAT(RExC_parse)))
4748 if (e == RExC_parse)
4749 vFAIL2("Empty \\%c{}", c);
4751 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4759 if (UCHARAT(RExC_parse) == '^') {
4762 value = value == 'p' ? 'P' : 'p'; /* toggle */
4763 while (isSPACE(UCHARAT(RExC_parse))) {
4769 Perl_sv_catpvf(aTHX_ listsv,
4770 "+utf8::%.*s\n", (int)n, RExC_parse);
4772 Perl_sv_catpvf(aTHX_ listsv,
4773 "!utf8::%.*s\n", (int)n, RExC_parse);
4776 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4777 namedclass = ANYOF_MAX; /* no official name, but it's named */
4779 case 'n': value = '\n'; break;
4780 case 'r': value = '\r'; break;
4781 case 't': value = '\t'; break;
4782 case 'f': value = '\f'; break;
4783 case 'b': value = '\b'; break;
4784 case 'e': value = ASCII_TO_NATIVE('\033');break;
4785 case 'a': value = ASCII_TO_NATIVE('\007');break;
4787 if (*RExC_parse == '{') {
4788 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4789 | PERL_SCAN_DISALLOW_PREFIX;
4790 e = strchr(RExC_parse++, '}');
4792 vFAIL("Missing right brace on \\x{}");
4794 numlen = e - RExC_parse;
4795 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4799 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4801 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4802 RExC_parse += numlen;
4806 value = UCHARAT(RExC_parse++);
4807 value = toCTRL(value);
4809 case '0': case '1': case '2': case '3': case '4':
4810 case '5': case '6': case '7': case '8': case '9':
4814 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4815 RExC_parse += numlen;
4819 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4821 "Unrecognized escape \\%c in character class passed through",
4825 } /* end of \blah */
4831 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4833 if (!SIZE_ONLY && !need_class)
4834 ANYOF_CLASS_ZERO(ret);
4838 /* a bad range like a-\d, a-[:digit:] ? */
4841 if (ckWARN(WARN_REGEXP))
4843 "False [] range \"%*.*s\"",
4844 RExC_parse - rangebegin,
4845 RExC_parse - rangebegin,
4847 if (prevvalue < 256) {
4848 ANYOF_BITMAP_SET(ret, prevvalue);
4849 ANYOF_BITMAP_SET(ret, '-');
4852 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4853 Perl_sv_catpvf(aTHX_ listsv,
4854 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4858 range = 0; /* this was not a true range */
4862 const char *what = NULL;
4865 if (namedclass > OOB_NAMEDCLASS)
4866 optimize_invert = FALSE;
4867 /* Possible truncation here but in some 64-bit environments
4868 * the compiler gets heartburn about switch on 64-bit values.
4869 * A similar issue a little earlier when switching on value.
4871 switch ((I32)namedclass) {
4874 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4876 for (value = 0; value < 256; value++)
4878 ANYOF_BITMAP_SET(ret, value);
4885 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4887 for (value = 0; value < 256; value++)
4888 if (!isALNUM(value))
4889 ANYOF_BITMAP_SET(ret, value);
4896 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4898 for (value = 0; value < 256; value++)
4899 if (isALNUMC(value))
4900 ANYOF_BITMAP_SET(ret, value);
4907 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4909 for (value = 0; value < 256; value++)
4910 if (!isALNUMC(value))
4911 ANYOF_BITMAP_SET(ret, value);
4918 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4920 for (value = 0; value < 256; value++)
4922 ANYOF_BITMAP_SET(ret, value);
4929 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4931 for (value = 0; value < 256; value++)
4932 if (!isALPHA(value))
4933 ANYOF_BITMAP_SET(ret, value);
4940 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4943 for (value = 0; value < 128; value++)
4944 ANYOF_BITMAP_SET(ret, value);
4946 for (value = 0; value < 256; value++) {
4948 ANYOF_BITMAP_SET(ret, value);
4957 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4960 for (value = 128; value < 256; value++)
4961 ANYOF_BITMAP_SET(ret, value);
4963 for (value = 0; value < 256; value++) {
4964 if (!isASCII(value))
4965 ANYOF_BITMAP_SET(ret, value);
4974 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4976 for (value = 0; value < 256; value++)
4978 ANYOF_BITMAP_SET(ret, value);
4985 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4987 for (value = 0; value < 256; value++)
4988 if (!isBLANK(value))
4989 ANYOF_BITMAP_SET(ret, value);
4996 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
4998 for (value = 0; value < 256; value++)
5000 ANYOF_BITMAP_SET(ret, value);
5007 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5009 for (value = 0; value < 256; value++)
5010 if (!isCNTRL(value))
5011 ANYOF_BITMAP_SET(ret, value);
5018 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5020 /* consecutive digits assumed */
5021 for (value = '0'; value <= '9'; value++)
5022 ANYOF_BITMAP_SET(ret, value);
5029 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5031 /* consecutive digits assumed */
5032 for (value = 0; value < '0'; value++)
5033 ANYOF_BITMAP_SET(ret, value);
5034 for (value = '9' + 1; value < 256; value++)
5035 ANYOF_BITMAP_SET(ret, value);
5042 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5044 for (value = 0; value < 256; value++)
5046 ANYOF_BITMAP_SET(ret, value);
5053 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5055 for (value = 0; value < 256; value++)
5056 if (!isGRAPH(value))
5057 ANYOF_BITMAP_SET(ret, value);
5064 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5066 for (value = 0; value < 256; value++)
5068 ANYOF_BITMAP_SET(ret, value);
5075 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5077 for (value = 0; value < 256; value++)
5078 if (!isLOWER(value))
5079 ANYOF_BITMAP_SET(ret, value);
5086 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5088 for (value = 0; value < 256; value++)
5090 ANYOF_BITMAP_SET(ret, value);
5097 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5099 for (value = 0; value < 256; value++)
5100 if (!isPRINT(value))
5101 ANYOF_BITMAP_SET(ret, value);
5108 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5110 for (value = 0; value < 256; value++)
5111 if (isPSXSPC(value))
5112 ANYOF_BITMAP_SET(ret, value);
5119 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5121 for (value = 0; value < 256; value++)
5122 if (!isPSXSPC(value))
5123 ANYOF_BITMAP_SET(ret, value);
5130 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5132 for (value = 0; value < 256; value++)
5134 ANYOF_BITMAP_SET(ret, value);
5141 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5143 for (value = 0; value < 256; value++)
5144 if (!isPUNCT(value))
5145 ANYOF_BITMAP_SET(ret, value);
5152 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5154 for (value = 0; value < 256; value++)
5156 ANYOF_BITMAP_SET(ret, value);
5163 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5165 for (value = 0; value < 256; value++)
5166 if (!isSPACE(value))
5167 ANYOF_BITMAP_SET(ret, value);
5174 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5176 for (value = 0; value < 256; value++)
5178 ANYOF_BITMAP_SET(ret, value);
5185 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5187 for (value = 0; value < 256; value++)
5188 if (!isUPPER(value))
5189 ANYOF_BITMAP_SET(ret, value);
5196 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5198 for (value = 0; value < 256; value++)
5199 if (isXDIGIT(value))
5200 ANYOF_BITMAP_SET(ret, value);
5207 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5209 for (value = 0; value < 256; value++)
5210 if (!isXDIGIT(value))
5211 ANYOF_BITMAP_SET(ret, value);
5217 /* this is to handle \p and \P */
5220 vFAIL("Invalid [::] class");
5224 /* Strings such as "+utf8::isWord\n" */
5225 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5228 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5231 } /* end of namedclass \blah */
5234 if (prevvalue > (IV)value) /* b-a */ {
5235 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5236 RExC_parse - rangebegin,
5237 RExC_parse - rangebegin,
5239 range = 0; /* not a valid range */
5243 prevvalue = value; /* save the beginning of the range */
5244 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5245 RExC_parse[1] != ']') {
5248 /* a bad range like \w-, [:word:]- ? */
5249 if (namedclass > OOB_NAMEDCLASS) {
5250 if (ckWARN(WARN_REGEXP))
5252 "False [] range \"%*.*s\"",
5253 RExC_parse - rangebegin,
5254 RExC_parse - rangebegin,
5257 ANYOF_BITMAP_SET(ret, '-');
5259 range = 1; /* yeah, it's a range! */
5260 continue; /* but do it the next time */
5264 /* now is the next time */
5268 if (prevvalue < 256) {
5269 const IV ceilvalue = value < 256 ? value : 255;
5272 /* In EBCDIC [\x89-\x91] should include
5273 * the \x8e but [i-j] should not. */
5274 if (literal_endpoint == 2 &&
5275 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5276 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5278 if (isLOWER(prevvalue)) {
5279 for (i = prevvalue; i <= ceilvalue; i++)
5281 ANYOF_BITMAP_SET(ret, i);
5283 for (i = prevvalue; i <= ceilvalue; i++)
5285 ANYOF_BITMAP_SET(ret, i);
5290 for (i = prevvalue; i <= ceilvalue; i++)
5291 ANYOF_BITMAP_SET(ret, i);
5293 if (value > 255 || UTF) {
5294 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5295 const UV natvalue = NATIVE_TO_UNI(value);
5297 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5298 if (prevnatvalue < natvalue) { /* what about > ? */
5299 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5300 prevnatvalue, natvalue);
5302 else if (prevnatvalue == natvalue) {
5303 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5305 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5307 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5309 /* If folding and foldable and a single
5310 * character, insert also the folded version
5311 * to the charclass. */
5313 if (foldlen == (STRLEN)UNISKIP(f))
5314 Perl_sv_catpvf(aTHX_ listsv,
5317 /* Any multicharacter foldings
5318 * require the following transform:
5319 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5320 * where E folds into "pq" and F folds
5321 * into "rst", all other characters
5322 * fold to single characters. We save
5323 * away these multicharacter foldings,
5324 * to be later saved as part of the
5325 * additional "s" data. */
5328 if (!unicode_alternate)
5329 unicode_alternate = newAV();
5330 sv = newSVpvn((char*)foldbuf, foldlen);
5332 av_push(unicode_alternate, sv);
5336 /* If folding and the value is one of the Greek
5337 * sigmas insert a few more sigmas to make the
5338 * folding rules of the sigmas to work right.
5339 * Note that not all the possible combinations
5340 * are handled here: some of them are handled
5341 * by the standard folding rules, and some of
5342 * them (literal or EXACTF cases) are handled
5343 * during runtime in regexec.c:S_find_byclass(). */
5344 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5345 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5346 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5347 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5348 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5350 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5351 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5352 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5357 literal_endpoint = 0;
5361 range = 0; /* this range (if it was one) is done now */
5365 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5367 RExC_size += ANYOF_CLASS_ADD_SKIP;
5369 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5372 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5374 /* If the only flag is folding (plus possibly inversion). */
5375 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5377 for (value = 0; value < 256; ++value) {
5378 if (ANYOF_BITMAP_TEST(ret, value)) {
5379 UV fold = PL_fold[value];
5382 ANYOF_BITMAP_SET(ret, fold);
5385 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5388 /* optimize inverted simple patterns (e.g. [^a-z]) */
5389 if (!SIZE_ONLY && optimize_invert &&
5390 /* If the only flag is inversion. */
5391 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5392 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5393 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5394 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5401 /* The 0th element stores the character class description
5402 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5403 * to initialize the appropriate swash (which gets stored in
5404 * the 1st element), and also useful for dumping the regnode.
5405 * The 2nd element stores the multicharacter foldings,
5406 * used later (regexec.c:S_reginclass()). */
5407 av_store(av, 0, listsv);
5408 av_store(av, 1, NULL);
5409 av_store(av, 2, (SV*)unicode_alternate);
5410 rv = newRV_noinc((SV*)av);
5411 n = add_data(pRExC_state, 1, "s");
5412 RExC_rx->data->data[n] = (void*)rv;
5420 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5422 char* retval = RExC_parse++;
5425 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5426 RExC_parse[2] == '#') {
5427 while (*RExC_parse != ')') {
5428 if (RExC_parse == RExC_end)
5429 FAIL("Sequence (?#... not terminated");
5435 if (RExC_flags & PMf_EXTENDED) {
5436 if (isSPACE(*RExC_parse)) {
5440 else if (*RExC_parse == '#') {
5441 while (RExC_parse < RExC_end)
5442 if (*RExC_parse++ == '\n') break;
5451 - reg_node - emit a node
5453 STATIC regnode * /* Location. */
5454 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5456 register regnode *ptr;
5457 regnode * const ret = RExC_emit;
5460 SIZE_ALIGN(RExC_size);
5465 NODE_ALIGN_FILL(ret);
5467 FILL_ADVANCE_NODE(ptr, op);
5468 if (RExC_offsets) { /* MJD */
5469 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5470 "reg_node", __LINE__,
5472 RExC_emit - RExC_emit_start > RExC_offsets[0]
5473 ? "Overwriting end of array!\n" : "OK",
5474 RExC_emit - RExC_emit_start,
5475 RExC_parse - RExC_start,
5477 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5486 - reganode - emit a node with an argument
5488 STATIC regnode * /* Location. */
5489 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5491 register regnode *ptr;
5492 regnode * const ret = RExC_emit;
5495 SIZE_ALIGN(RExC_size);
5500 NODE_ALIGN_FILL(ret);
5502 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5503 if (RExC_offsets) { /* MJD */
5504 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5508 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5509 "Overwriting end of array!\n" : "OK",
5510 RExC_emit - RExC_emit_start,
5511 RExC_parse - RExC_start,
5513 Set_Cur_Node_Offset;
5522 - reguni - emit (if appropriate) a Unicode character
5525 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5527 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5531 - reginsert - insert an operator in front of already-emitted operand
5533 * Means relocating the operand.
5536 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5538 register regnode *src;
5539 register regnode *dst;
5540 register regnode *place;
5541 const int offset = regarglen[(U8)op];
5543 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5546 RExC_size += NODE_STEP_REGNODE + offset;
5551 RExC_emit += NODE_STEP_REGNODE + offset;
5553 while (src > opnd) {
5554 StructCopy(--src, --dst, regnode);
5555 if (RExC_offsets) { /* MJD 20010112 */
5556 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5560 dst - RExC_emit_start > RExC_offsets[0]
5561 ? "Overwriting end of array!\n" : "OK",
5562 src - RExC_emit_start,
5563 dst - RExC_emit_start,
5565 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5566 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5571 place = opnd; /* Op node, where operand used to be. */
5572 if (RExC_offsets) { /* MJD */
5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5577 place - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 place - RExC_emit_start,
5580 RExC_parse - RExC_start,
5582 Set_Node_Offset(place, RExC_parse);
5583 Set_Node_Length(place, 1);
5585 src = NEXTOPER(place);
5586 FILL_ADVANCE_NODE(place, op);
5587 Zero(src, offset, regnode);
5591 - regtail - set the next-pointer at the end of a node chain of p to val.
5594 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5596 register regnode *scan;
5601 /* Find last node. */
5604 regnode * const temp = regnext(scan);
5610 if (reg_off_by_arg[OP(scan)]) {
5611 ARG_SET(scan, val - scan);
5614 NEXT_OFF(scan) = val - scan;
5619 - regoptail - regtail on operand of first argument; nop if operandless
5622 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5624 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5625 if (p == NULL || SIZE_ONLY)
5627 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5628 regtail(pRExC_state, NEXTOPER(p), val);
5630 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5631 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5638 - regcurly - a little FSA that accepts {\d+,?\d*}
5641 S_regcurly(pTHX_ register const char *s)
5660 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5663 Perl_regdump(pTHX_ regexp *r)
5666 SV * const sv = sv_newmortal();
5668 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5670 /* Header fields of interest. */
5671 if (r->anchored_substr)
5672 PerlIO_printf(Perl_debug_log,
5673 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5675 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5676 SvPVX_const(r->anchored_substr),
5678 SvTAIL(r->anchored_substr) ? "$" : "",
5679 (IV)r->anchored_offset);
5680 else if (r->anchored_utf8)
5681 PerlIO_printf(Perl_debug_log,
5682 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5684 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5685 SvPVX_const(r->anchored_utf8),
5687 SvTAIL(r->anchored_utf8) ? "$" : "",
5688 (IV)r->anchored_offset);
5689 if (r->float_substr)
5690 PerlIO_printf(Perl_debug_log,
5691 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5693 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5694 SvPVX_const(r->float_substr),
5696 SvTAIL(r->float_substr) ? "$" : "",
5697 (IV)r->float_min_offset, (UV)r->float_max_offset);
5698 else if (r->float_utf8)
5699 PerlIO_printf(Perl_debug_log,
5700 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5702 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5703 SvPVX_const(r->float_utf8),
5705 SvTAIL(r->float_utf8) ? "$" : "",
5706 (IV)r->float_min_offset, (UV)r->float_max_offset);
5707 if (r->check_substr || r->check_utf8)
5708 PerlIO_printf(Perl_debug_log,
5709 r->check_substr == r->float_substr
5710 && r->check_utf8 == r->float_utf8
5711 ? "(checking floating" : "(checking anchored");
5712 if (r->reganch & ROPT_NOSCAN)
5713 PerlIO_printf(Perl_debug_log, " noscan");
5714 if (r->reganch & ROPT_CHECK_ALL)
5715 PerlIO_printf(Perl_debug_log, " isall");
5716 if (r->check_substr || r->check_utf8)
5717 PerlIO_printf(Perl_debug_log, ") ");
5719 if (r->regstclass) {
5720 regprop(sv, r->regstclass);
5721 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5723 if (r->reganch & ROPT_ANCH) {
5724 PerlIO_printf(Perl_debug_log, "anchored");
5725 if (r->reganch & ROPT_ANCH_BOL)
5726 PerlIO_printf(Perl_debug_log, "(BOL)");
5727 if (r->reganch & ROPT_ANCH_MBOL)
5728 PerlIO_printf(Perl_debug_log, "(MBOL)");
5729 if (r->reganch & ROPT_ANCH_SBOL)
5730 PerlIO_printf(Perl_debug_log, "(SBOL)");
5731 if (r->reganch & ROPT_ANCH_GPOS)
5732 PerlIO_printf(Perl_debug_log, "(GPOS)");
5733 PerlIO_putc(Perl_debug_log, ' ');
5735 if (r->reganch & ROPT_GPOS_SEEN)
5736 PerlIO_printf(Perl_debug_log, "GPOS ");
5737 if (r->reganch & ROPT_SKIP)
5738 PerlIO_printf(Perl_debug_log, "plus ");
5739 if (r->reganch & ROPT_IMPLICIT)
5740 PerlIO_printf(Perl_debug_log, "implicit ");
5741 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5742 if (r->reganch & ROPT_EVAL_SEEN)
5743 PerlIO_printf(Perl_debug_log, "with eval ");
5744 PerlIO_printf(Perl_debug_log, "\n");
5746 const U32 len = r->offsets[0];
5747 GET_RE_DEBUG_FLAGS_DECL;
5750 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5751 for (i = 1; i <= len; i++)
5752 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5753 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5754 PerlIO_printf(Perl_debug_log, "\n");
5759 #endif /* DEBUGGING */
5763 - regprop - printable representation of opcode
5766 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5771 sv_setpvn(sv, "", 0);
5772 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5773 /* It would be nice to FAIL() here, but this may be called from
5774 regexec.c, and it would be hard to supply pRExC_state. */
5775 Perl_croak(aTHX_ "Corrupted regexp opcode");
5776 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5778 k = PL_regkind[(U8)OP(o)];
5781 SV * const dsv = sv_2mortal(newSVpvn("", 0));
5782 /* Using is_utf8_string() is a crude hack but it may
5783 * be the best for now since we have no flag "this EXACTish
5784 * node was UTF-8" --jhi */
5785 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5786 const char * const s = do_utf8 ?
5787 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5788 UNI_DISPLAY_REGEX) :
5790 const int len = do_utf8 ?
5793 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5797 } else if (k == TRIE) {/*
5798 this isn't always safe, as Pl_regdata may not be for this regex yet
5799 (depending on where its called from) so its being moved to dumpuntil
5801 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5802 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5805 trie->uniquecharcount,
5808 } else if (k == CURLY) {
5809 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5810 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5811 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5813 else if (k == WHILEM && o->flags) /* Ordinal/of */
5814 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5815 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5816 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5817 else if (k == LOGICAL)
5818 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5819 else if (k == ANYOF) {
5820 int i, rangestart = -1;
5821 const U8 flags = ANYOF_FLAGS(o);
5823 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5824 static const char * const anyofs[] = {
5857 if (flags & ANYOF_LOCALE)
5858 sv_catpv(sv, "{loc}");
5859 if (flags & ANYOF_FOLD)
5860 sv_catpv(sv, "{i}");
5861 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5862 if (flags & ANYOF_INVERT)
5864 for (i = 0; i <= 256; i++) {
5865 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5866 if (rangestart == -1)
5868 } else if (rangestart != -1) {
5869 if (i <= rangestart + 3)
5870 for (; rangestart < i; rangestart++)
5871 put_byte(sv, rangestart);
5873 put_byte(sv, rangestart);
5875 put_byte(sv, i - 1);
5881 if (o->flags & ANYOF_CLASS)
5882 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5883 if (ANYOF_CLASS_TEST(o,i))
5884 sv_catpv(sv, anyofs[i]);
5886 if (flags & ANYOF_UNICODE)
5887 sv_catpv(sv, "{unicode}");
5888 else if (flags & ANYOF_UNICODE_ALL)
5889 sv_catpv(sv, "{unicode_all}");
5893 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
5897 U8 s[UTF8_MAXBYTES_CASE+1];
5899 for (i = 0; i <= 256; i++) { /* just the first 256 */
5900 uvchr_to_utf8(s, i);
5902 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5903 if (rangestart == -1)
5905 } else if (rangestart != -1) {
5906 if (i <= rangestart + 3)
5907 for (; rangestart < i; rangestart++) {
5908 const U8 * const e = uvchr_to_utf8(s,rangestart);
5910 for(p = s; p < e; p++)
5914 const U8 *e = uvchr_to_utf8(s,rangestart);
5916 for (p = s; p < e; p++)
5918 sv_catpvn(sv, "-", 1);
5919 e = uvchr_to_utf8(s, i-1);
5920 for (p = s; p < e; p++)
5927 sv_catpv(sv, "..."); /* et cetera */
5931 char *s = savesvpv(lv);
5932 char * const origs = s;
5934 while(*s && *s != '\n') s++;
5937 const char * const t = ++s;
5955 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5957 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5958 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5960 PERL_UNUSED_ARG(sv);
5962 #endif /* DEBUGGING */
5966 Perl_re_intuit_string(pTHX_ regexp *prog)
5967 { /* Assume that RE_INTUIT is set */
5968 GET_RE_DEBUG_FLAGS_DECL;
5971 const char * const s = SvPV_nolen_const(prog->check_substr
5972 ? prog->check_substr : prog->check_utf8);
5974 if (!PL_colorset) reginitcolors();
5975 PerlIO_printf(Perl_debug_log,
5976 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5978 prog->check_substr ? "" : "utf8 ",
5979 PL_colors[5],PL_colors[0],
5982 (strlen(s) > 60 ? "..." : ""));
5985 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5989 Perl_pregfree(pTHX_ struct regexp *r)
5993 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5994 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
5998 if (!r || (--r->refcnt > 0))
6000 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6001 const char * const s = (r->reganch & ROPT_UTF8)
6002 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6003 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6004 const int len = SvCUR(dsv);
6007 PerlIO_printf(Perl_debug_log,
6008 "%sFreeing REx:%s %s%*.*s%s%s\n",
6009 PL_colors[4],PL_colors[5],PL_colors[0],
6012 len > 60 ? "..." : "");
6015 /* gcov results gave these as non-null 100% of the time, so there's no
6016 optimisation in checking them before calling Safefree */
6017 Safefree(r->precomp);
6018 Safefree(r->offsets); /* 20010421 MJD */
6019 RX_MATCH_COPY_FREE(r);
6020 #ifdef PERL_OLD_COPY_ON_WRITE
6022 SvREFCNT_dec(r->saved_copy);
6025 if (r->anchored_substr)
6026 SvREFCNT_dec(r->anchored_substr);
6027 if (r->anchored_utf8)
6028 SvREFCNT_dec(r->anchored_utf8);
6029 if (r->float_substr)
6030 SvREFCNT_dec(r->float_substr);
6032 SvREFCNT_dec(r->float_utf8);
6033 Safefree(r->substrs);
6036 int n = r->data->count;
6037 PAD* new_comppad = NULL;
6042 /* If you add a ->what type here, update the comment in regcomp.h */
6043 switch (r->data->what[n]) {
6045 SvREFCNT_dec((SV*)r->data->data[n]);
6048 Safefree(r->data->data[n]);
6051 new_comppad = (AV*)r->data->data[n];
6054 if (new_comppad == NULL)
6055 Perl_croak(aTHX_ "panic: pregfree comppad");
6056 PAD_SAVE_LOCAL(old_comppad,
6057 /* Watch out for global destruction's random ordering. */
6058 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6061 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6064 op_free((OP_4tree*)r->data->data[n]);
6066 PAD_RESTORE_LOCAL(old_comppad);
6067 SvREFCNT_dec((SV*)new_comppad);
6074 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6077 refcount = --trie->refcount;
6080 Safefree(trie->charmap);
6081 if (trie->widecharmap)
6082 SvREFCNT_dec((SV*)trie->widecharmap);
6083 Safefree(trie->states);
6084 Safefree(trie->trans);
6087 SvREFCNT_dec((SV*)trie->words);
6088 if (trie->revcharmap)
6089 SvREFCNT_dec((SV*)trie->revcharmap);
6091 Safefree(r->data->data[n]); /* do this last!!!! */
6096 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6099 Safefree(r->data->what);
6102 Safefree(r->startp);
6108 - regnext - dig the "next" pointer out of a node
6111 Perl_regnext(pTHX_ register regnode *p)
6113 register I32 offset;
6115 if (p == &PL_regdummy)
6118 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6126 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6129 STRLEN l1 = strlen(pat1);
6130 STRLEN l2 = strlen(pat2);
6133 const char *message;
6139 Copy(pat1, buf, l1 , char);
6140 Copy(pat2, buf + l1, l2 , char);
6141 buf[l1 + l2] = '\n';
6142 buf[l1 + l2 + 1] = '\0';
6144 /* ANSI variant takes additional second argument */
6145 va_start(args, pat2);
6149 msv = vmess(buf, &args);
6151 message = SvPV_const(msv,l1);
6154 Copy(message, buf, l1 , char);
6155 buf[l1-1] = '\0'; /* Overwrite \n */
6156 Perl_croak(aTHX_ "%s", buf);
6159 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6162 Perl_save_re_context(pTHX)
6164 SAVEI32(PL_reg_flags); /* from regexec.c */
6166 SAVEPPTR(PL_reginput); /* String-input pointer. */
6167 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6168 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6169 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6170 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6171 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6172 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6173 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6174 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6175 PL_reg_start_tmp = 0;
6176 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6177 PL_reg_start_tmpl = 0;
6178 SAVEVPTR(PL_regdata);
6179 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6180 SAVEI32(PL_regnarrate); /* from regexec.c */
6181 SAVEVPTR(PL_regprogram); /* from regexec.c */
6182 SAVEINT(PL_regindent); /* from regexec.c */
6183 SAVEVPTR(PL_regcc); /* from regexec.c */
6184 SAVEVPTR(PL_curcop);
6185 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6186 SAVEVPTR(PL_reg_re); /* from regexec.c */
6187 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6188 SAVESPTR(PL_reg_sv); /* from regexec.c */
6189 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6190 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6191 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6192 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6193 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6194 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6195 PL_reg_oldsaved = NULL;
6196 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6197 PL_reg_oldsavedlen = 0;
6198 #ifdef PERL_OLD_COPY_ON_WRITE
6202 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6204 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6205 PL_reg_leftiter = 0;
6206 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6207 PL_reg_poscache = NULL;
6208 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6209 PL_reg_poscache_size = 0;
6210 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6211 SAVEI32(PL_regnpar); /* () count. */
6212 SAVEI32(PL_regsize); /* from regexec.c */
6214 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6216 const REGEXP * const rx = PM_GETRE(PL_curpm);
6219 for (i = 1; i <= rx->nparens; i++) {
6220 char digits[TYPE_CHARS(long)];
6221 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6222 GV * const mgv = gv_fetchpvn_flags(digits, len, 0, SVt_PV);
6230 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6235 clear_re(pTHX_ void *r)
6237 ReREFCNT_dec((regexp *)r);
6243 S_put_byte(pTHX_ SV *sv, int c)
6245 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6246 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6247 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6248 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6250 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6255 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6257 register U8 op = EXACT; /* Arbitrary non-END op. */
6258 register regnode *next;
6260 while (op != END && (!last || node < last)) {
6261 /* While that wasn't END last time... */
6267 next = regnext(node);
6269 if (OP(node) == OPTIMIZED)
6272 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6273 (int)(2*l + 1), "", SvPVX_const(sv));
6274 if (next == NULL) /* Next ptr. */
6275 PerlIO_printf(Perl_debug_log, "(0)");
6277 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6278 (void)PerlIO_putc(Perl_debug_log, '\n');
6280 if (PL_regkind[(U8)op] == BRANCHJ) {
6281 register regnode *nnode = (OP(next) == LONGJMP
6284 if (last && nnode > last)
6286 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6288 else if (PL_regkind[(U8)op] == BRANCH) {
6289 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6291 else if ( PL_regkind[(U8)op] == TRIE ) {
6292 const I32 n = ARG(node);
6293 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6294 const I32 arry_len = av_len(trie->words)+1;
6296 PerlIO_printf(Perl_debug_log,
6297 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6301 (int)trie->charcount,
6302 trie->uniquecharcount,
6303 (IV)trie->laststate-1,
6304 node->flags ? " EVAL mode" : "");
6306 for (word_idx=0; word_idx < arry_len; word_idx++) {
6307 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6309 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6312 SvPV_nolen_const(*elem_ptr),
6317 PerlIO_printf(Perl_debug_log, "(0)\n");
6319 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6325 node = NEXTOPER(node);
6326 node += regarglen[(U8)op];
6329 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6330 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6331 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6333 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6334 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6337 else if ( op == PLUS || op == STAR) {
6338 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6340 else if (op == ANYOF) {
6341 /* arglen 1 + class block */
6342 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6343 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6344 node = NEXTOPER(node);
6346 else if (PL_regkind[(U8)op] == EXACT) {
6347 /* Literal string, where present. */
6348 node += NODE_SZ_STR(node) - 1;
6349 node = NEXTOPER(node);
6352 node = NEXTOPER(node);
6353 node += regarglen[(U8)op];
6355 if (op == CURLYX || op == OPEN)
6357 else if (op == WHILEM)
6363 #endif /* DEBUGGING */
6367 * c-indentation-style: bsd
6369 * indent-tabs-mode: t
6372 * ex: set ts=8 sts=4 sw=4 noet: