5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
289 #define FAIL2(pat,msg) STMT_START { \
290 const char *ellipses = ""; \
291 IV len = RExC_end - RExC_precomp; \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 #define Simple_vFAIL(m) STMT_START { \
309 const IV offset = RExC_parse - RExC_precomp; \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 #define vFAIL(m) STMT_START { \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
324 * Like Simple_vFAIL(), but accepts two arguments.
326 #define Simple_vFAIL2(m,a1) STMT_START { \
327 const IV offset = RExC_parse - RExC_precomp; \
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 #define vFAIL2(m,a1) STMT_START { \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
346 const IV offset = RExC_parse - RExC_precomp; \
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 #define vFAIL3(m,a1,a2) STMT_START { \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
361 * Like Simple_vFAIL(), but accepts four arguments.
363 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
364 const IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 const IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 const IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 const IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 const IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 const IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 const IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (int)(node), (int)(len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 SvSetMagicSV(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 SV * const sv = data->last_found;
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
506 data->flags &= ~SF_BEFORE_EOL;
509 /* Can match anything (initialization) */
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 cl->flags |= ANYOF_LOCALE;
520 /* Can match anything (initialization) */
522 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
536 /* Can match anything (initialization) */
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 Zero(cl, 1, struct regnode_charclass_class);
542 cl_anything(pRExC_state, cl);
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 Zero(cl, 1, struct regnode_charclass_class);
550 cl_anything(pRExC_state, cl);
552 cl->flags |= ANYOF_LOCALE;
555 /* 'And' a given class with another one. Can create false positives */
556 /* We assume that cl is not inverted */
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 const struct regnode_charclass_class *and_with)
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
589 cl->flags &= ~ANYOF_UNICODE;
592 /* 'OR' a given class with another one. Can create false positives */
593 /* We assume that cl is not inverted */
595 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
597 if (or_with->flags & ANYOF_INVERT) {
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
616 cl_anything(pRExC_state, cl);
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && (!(or_with->flags & ANYOF_FOLD)
622 || (cl->flags & ANYOF_FOLD)) ) {
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
635 cl_anything(pRExC_state, cl);
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
663 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
665 A trie is an N'ary tree where the branches are determined by digital
666 decomposition of the key. IE, at the root node you look up the 1st character and
667 follow that branch repeat until you find the end of the branches. Nodes can be
668 marked as "accepting" meaning they represent a complete word. Eg:
672 would convert into the following structure. Numbers represent states, letters
673 following numbers represent valid transitions on the letter from that state, if
674 the number is in square brackets it represents an accepting state, otherwise it
675 will be in parenthesis.
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
681 (1) +-i->(6)-+-s->[7]
683 +-s->(3)-+-h->(4)-+-e->[5]
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
687 This shows that when matching against the string 'hers' we will begin at state 1
688 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691 single traverse. We store a mapping from accepting to state to which word was
692 matched, and then when we have multiple possibilities we try to complete the
693 rest of the regex in the order in which they occured in the alternation.
695 The only prior NFA like behaviour that would be changed by the TRIE support is
696 the silent ignoring of duplicate alternations which are of the form:
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
700 Thus EVAL blocks follwing a trie may be called a different number of times with
701 and without the optimisation. With the optimisations dupes will be silently
702 ignored. This inconsistant behaviour of EVAL type nodes is well established as
703 the following demonstrates:
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
707 which prints out 'word' three times, but
709 'words'=~/(word|word|word)(?{ print $1 })S/
711 which doesnt print it out at all. This is due to other optimisations kicking in.
713 Example of what happens on a structural level:
715 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
717 1: CURLYM[1] {1,32767}(18)
728 This would be optimizable with startbranch=5, first=5, last=16, tail=16
729 and should turn into:
731 1: CURLYM[1] {1,32767}(18)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
741 Cases where tail != last would be like /(?foo|bar)baz/:
751 which would be optimizable with startbranch=1, first=1, last=7, tail=8
752 and would end up looking like:
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
764 #define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
768 tmp = newSVpvn( "", 0 ); \
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
773 av_push( trie->revcharmap, tmp ); \
776 #define TRIE_READ_CHAR STMT_START { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
800 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
805 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
816 #define TRIE_LIST_NEW(state) STMT_START { \
817 Newxz( trie->states[ state ].trans.list, \
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
824 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
827 /* first pass, loop through and scan words */
830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
835 /* we just use folder as a flag in utf8 */
836 const U8 * const folder = ( flags == EXACTF
844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
847 GET_RE_DEBUG_FLAGS_DECL;
849 Newxz( trie, 1, reg_trie_data );
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
852 Newxz( trie->charmap, 256, U16 );
854 trie->words = newAV();
855 trie->revcharmap = newAV();
859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
860 if (!SvIOK(re_trie_maxbuff)) {
861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
864 /* -- First loop and Setup --
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
889 regnode * const noper = NEXTOPER( cur );
890 const U8 *uc = (U8*)STRING( noper );
891 const U8 * const e = uc + STR_LEN( noper );
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
894 const U8 *scan = (U8*)NULL;
896 for ( ; uc < e ; uc += len ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
927 (int)trie->charcount, trie->uniquecharcount )
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
936 conservative but potentially much slower representation using an array
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
955 Second Pass -- Array Of Lists Representation
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
969 STRLEN transcount = 1;
971 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
977 regnode * const noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
979 const U8 * const e = uc + STR_LEN( noper );
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
986 for ( ; uc < e ; uc += len ) {
991 charid = trie->charmap[ uvc ];
993 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
997 charid=(U16)SvIV( *svpp );
1006 if ( !trie->states[ state ].trans.list ) {
1007 TRIE_LIST_NEW( state );
1009 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1010 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1011 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1016 newstate = next_alloc++;
1017 TRIE_LIST_PUSH( state, charid, newstate );
1022 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1024 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1027 if ( !trie->states[ state ].wordnum ) {
1028 /* we havent inserted this word into the structure yet. */
1029 trie->states[ state ].wordnum = ++curword;
1032 /* store the word for dumping */
1033 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1034 if ( UTF ) SvUTF8_on( tmp );
1035 av_push( trie->words, tmp );
1039 /* Its a dupe. So ignore it. */
1042 } /* end second pass */
1044 trie->laststate = next_alloc;
1045 Renew( trie->states, next_alloc, reg_trie_state );
1047 DEBUG_TRIE_COMPILE_MORE_r({
1050 /* print out the table precompression. */
1052 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1053 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1055 for( state=1 ; state < next_alloc ; state ++ ) {
1058 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1059 if ( ! trie->states[ state ].wordnum ) {
1060 PerlIO_printf( Perl_debug_log, "%5s| ","");
1062 PerlIO_printf( Perl_debug_log, "W%04x| ",
1063 trie->states[ state ].wordnum
1066 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1067 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1068 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1069 SvPV_nolen_const( *tmp ),
1070 TRIE_LIST_ITEM(state,charid).forid,
1071 (UV)TRIE_LIST_ITEM(state,charid).newstate
1076 PerlIO_printf( Perl_debug_log, "\n\n" );
1079 Newxz( trie->trans, transcount ,reg_trie_trans );
1086 for( state=1 ; state < next_alloc ; state ++ ) {
1090 DEBUG_TRIE_COMPILE_MORE_r(
1091 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1095 if (trie->states[state].trans.list) {
1096 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1100 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1101 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1102 if ( forid < minid ) {
1104 } else if ( forid > maxid ) {
1108 if ( transcount < tp + maxid - minid + 1) {
1110 Renew( trie->trans, transcount, reg_trie_trans );
1111 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1113 base = trie->uniquecharcount + tp - minid;
1114 if ( maxid == minid ) {
1116 for ( ; zp < tp ; zp++ ) {
1117 if ( ! trie->trans[ zp ].next ) {
1118 base = trie->uniquecharcount + zp - minid;
1119 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1120 trie->trans[ zp ].check = state;
1126 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1127 trie->trans[ tp ].check = state;
1132 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1133 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1134 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1135 trie->trans[ tid ].check = state;
1137 tp += ( maxid - minid + 1 );
1139 Safefree(trie->states[ state ].trans.list);
1142 DEBUG_TRIE_COMPILE_MORE_r(
1143 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1146 trie->states[ state ].trans.base=base;
1148 trie->lasttrans = tp + 1;
1152 Second Pass -- Flat Table Representation.
1154 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1155 We know that we will need Charcount+1 trans at most to store the data
1156 (one row per char at worst case) So we preallocate both structures
1157 assuming worst case.
1159 We then construct the trie using only the .next slots of the entry
1162 We use the .check field of the first entry of the node temporarily to
1163 make compression both faster and easier by keeping track of how many non
1164 zero fields are in the node.
1166 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1169 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1170 number representing the first entry of the node, and state as a
1171 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1172 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1173 are 2 entrys per node. eg:
1181 The table is internally in the right hand, idx form. However as we also
1182 have to deal with the states array which is indexed by nodenum we have to
1183 use TRIE_NODENUM() to convert.
1187 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1189 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1190 next_alloc = trie->uniquecharcount + 1;
1192 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1194 regnode * const noper = NEXTOPER( cur );
1195 const U8 *uc = (U8*)STRING( noper );
1196 const U8 * const e = uc + STR_LEN( noper );
1198 U32 state = 1; /* required init */
1200 U16 charid = 0; /* sanity init */
1201 U32 accept_state = 0; /* sanity init */
1202 U8 *scan = (U8*)NULL; /* sanity init */
1204 STRLEN foldlen = 0; /* required init */
1205 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1208 for ( ; uc < e ; uc += len ) {
1213 charid = trie->charmap[ uvc ];
1215 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1216 charid = svpp ? (U16)SvIV(*svpp) : 0;
1220 if ( !trie->trans[ state + charid ].next ) {
1221 trie->trans[ state + charid ].next = next_alloc;
1222 trie->trans[ state ].check++;
1223 next_alloc += trie->uniquecharcount;
1225 state = trie->trans[ state + charid ].next;
1227 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1229 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1232 accept_state = TRIE_NODENUM( state );
1233 if ( !trie->states[ accept_state ].wordnum ) {
1234 /* we havent inserted this word into the structure yet. */
1235 trie->states[ accept_state ].wordnum = ++curword;
1238 /* store the word for dumping */
1239 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1240 if ( UTF ) SvUTF8_on( tmp );
1241 av_push( trie->words, tmp );
1245 /* Its a dupe. So ignore it. */
1248 } /* end second pass */
1250 DEBUG_TRIE_COMPILE_MORE_r({
1252 print out the table precompression so that we can do a visual check
1253 that they are identical.
1257 PerlIO_printf( Perl_debug_log, "\nChar : " );
1259 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1260 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1262 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1266 PerlIO_printf( Perl_debug_log, "\nState+-" );
1268 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1269 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1272 PerlIO_printf( Perl_debug_log, "\n" );
1274 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1276 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1278 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1279 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1280 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1282 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1283 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1285 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1286 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1289 PerlIO_printf( Perl_debug_log, "\n\n" );
1293 * Inplace compress the table.*
1295 For sparse data sets the table constructed by the trie algorithm will
1296 be mostly 0/FAIL transitions or to put it another way mostly empty.
1297 (Note that leaf nodes will not contain any transitions.)
1299 This algorithm compresses the tables by eliminating most such
1300 transitions, at the cost of a modest bit of extra work during lookup:
1302 - Each states[] entry contains a .base field which indicates the
1303 index in the state[] array wheres its transition data is stored.
1305 - If .base is 0 there are no valid transitions from that node.
1307 - If .base is nonzero then charid is added to it to find an entry in
1310 -If trans[states[state].base+charid].check!=state then the
1311 transition is taken to be a 0/Fail transition. Thus if there are fail
1312 transitions at the front of the node then the .base offset will point
1313 somewhere inside the previous nodes data (or maybe even into a node
1314 even earlier), but the .check field determines if the transition is
1317 The following process inplace converts the table to the compressed
1318 table: We first do not compress the root node 1,and mark its all its
1319 .check pointers as 1 and set its .base pointer as 1 as well. This
1320 allows to do a DFA construction from the compressed table later, and
1321 ensures that any .base pointers we calculate later are greater than
1324 - We set 'pos' to indicate the first entry of the second node.
1326 - We then iterate over the columns of the node, finding the first and
1327 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1328 and set the .check pointers accordingly, and advance pos
1329 appropriately and repreat for the next node. Note that when we copy
1330 the next pointers we have to convert them from the original
1331 NODEIDX form to NODENUM form as the former is not valid post
1334 - If a node has no transitions used we mark its base as 0 and do not
1335 advance the pos pointer.
1337 - If a node only has one transition we use a second pointer into the
1338 structure to fill in allocated fail transitions from other states.
1339 This pointer is independent of the main pointer and scans forward
1340 looking for null transitions that are allocated to a state. When it
1341 finds one it writes the single transition into the "hole". If the
1342 pointer doesnt find one the single transition is appeneded as normal.
1344 - Once compressed we can Renew/realloc the structures to release the
1347 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1348 specifically Fig 3.47 and the associated pseudocode.
1352 const U32 laststate = TRIE_NODENUM( next_alloc );
1355 trie->laststate = laststate;
1357 for ( state = 1 ; state < laststate ; state++ ) {
1359 const U32 stateidx = TRIE_NODEIDX( state );
1360 const U32 o_used = trie->trans[ stateidx ].check;
1361 U32 used = trie->trans[ stateidx ].check;
1362 trie->trans[ stateidx ].check = 0;
1364 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1365 if ( flag || trie->trans[ stateidx + charid ].next ) {
1366 if ( trie->trans[ stateidx + charid ].next ) {
1368 for ( ; zp < pos ; zp++ ) {
1369 if ( ! trie->trans[ zp ].next ) {
1373 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1374 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1375 trie->trans[ zp ].check = state;
1376 if ( ++zp > pos ) pos = zp;
1383 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1385 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1386 trie->trans[ pos ].check = state;
1391 trie->lasttrans = pos + 1;
1392 Renew( trie->states, laststate + 1, reg_trie_state);
1393 DEBUG_TRIE_COMPILE_MORE_r(
1394 PerlIO_printf( Perl_debug_log,
1395 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1396 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1399 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1402 } /* end table compress */
1404 /* resize the trans array to remove unused space */
1405 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1407 DEBUG_TRIE_COMPILE_r({
1410 Now we print it out again, in a slightly different form as there is additional
1411 info we want to be able to see when its compressed. They are close enough for
1412 visual comparison though.
1414 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1416 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1417 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1419 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1422 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1424 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1425 PerlIO_printf( Perl_debug_log, "-----");
1426 PerlIO_printf( Perl_debug_log, "\n");
1428 for( state = 1 ; state < trie->laststate ; state++ ) {
1429 const U32 base = trie->states[ state ].trans.base;
1431 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1433 if ( trie->states[ state ].wordnum ) {
1434 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1436 PerlIO_printf( Perl_debug_log, "%6s", "" );
1439 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1444 while( ( base + ofs < trie->uniquecharcount ) ||
1445 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1446 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1449 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1451 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1452 if ( ( base + ofs >= trie->uniquecharcount ) &&
1453 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1454 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1456 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1457 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1459 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1463 PerlIO_printf( Perl_debug_log, "]");
1466 PerlIO_printf( Perl_debug_log, "\n" );
1471 /* now finally we "stitch in" the new TRIE node
1472 This means we convert either the first branch or the first Exact,
1473 depending on whether the thing following (in 'last') is a branch
1474 or not and whther first is the startbranch (ie is it a sub part of
1475 the alternation or is it the whole thing.)
1476 Assuming its a sub part we conver the EXACT otherwise we convert
1477 the whole branch sequence, including the first.
1484 if ( first == startbranch && OP( last ) != BRANCH ) {
1487 convert = NEXTOPER( first );
1488 NEXT_OFF( first ) = (U16)(last - first);
1491 OP( convert ) = TRIE + (U8)( flags - EXACT );
1492 NEXT_OFF( convert ) = (U16)(tail - convert);
1493 ARG_SET( convert, data_slot );
1495 /* tells us if we need to handle accept buffers specially */
1496 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1499 /* needed for dumping*/
1501 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1502 /* We now need to mark all of the space originally used by the
1503 branches as optimized away. This keeps the dumpuntil from
1504 throwing a wobbly as it doesnt use regnext() to traverse the
1507 while( optimize < last ) {
1508 OP( optimize ) = OPTIMIZED;
1512 } /* end node insert */
1519 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1520 * These need to be revisited when a newer toolchain becomes available.
1522 #if defined(__sparc64__) && defined(__GNUC__)
1523 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1524 # undef SPARC64_GCC_WORKAROUND
1525 # define SPARC64_GCC_WORKAROUND 1
1529 /* REx optimizer. Converts nodes into quickier variants "in place".
1530 Finds fixed substrings. */
1532 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1533 to the position after last scanned or to NULL. */
1537 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1538 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1539 /* scanp: Start here (read-write). */
1540 /* deltap: Write maxlen-minlen here. */
1541 /* last: Stop before this one. */
1544 I32 min = 0, pars = 0, code;
1545 regnode *scan = *scanp, *next;
1547 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1548 int is_inf_internal = 0; /* The studied chunk is infinite */
1549 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1550 scan_data_t data_fake;
1551 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1552 SV *re_trie_maxbuff = NULL;
1554 GET_RE_DEBUG_FLAGS_DECL;
1556 while (scan && OP(scan) != END && scan < last) {
1557 /* Peephole optimizer: */
1559 SV * const mysv=sv_newmortal();
1560 regprop( mysv, scan);
1561 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1562 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1565 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1566 /* Merge several consecutive EXACTish nodes into one. */
1567 regnode *n = regnext(scan);
1570 regnode *stop = scan;
1573 next = scan + NODE_SZ_STR(scan);
1574 /* Skip NOTHING, merge EXACT*. */
1576 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1577 (stringok && (OP(n) == OP(scan))))
1579 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1580 if (OP(n) == TAIL || n > next)
1582 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1583 NEXT_OFF(scan) += NEXT_OFF(n);
1584 next = n + NODE_STEP_REGNODE;
1591 else if (stringok) {
1592 const int oldl = STR_LEN(scan);
1593 regnode * const nnext = regnext(n);
1595 if (oldl + STR_LEN(n) > U8_MAX)
1597 NEXT_OFF(scan) += NEXT_OFF(n);
1598 STR_LEN(scan) += STR_LEN(n);
1599 next = n + NODE_SZ_STR(n);
1600 /* Now we can overwrite *n : */
1601 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1609 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1611 Two problematic code points in Unicode casefolding of EXACT nodes:
1613 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1614 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1620 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1621 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1623 This means that in case-insensitive matching (or "loose matching",
1624 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1625 length of the above casefolded versions) can match a target string
1626 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1627 This would rather mess up the minimum length computation.
1629 What we'll do is to look for the tail four bytes, and then peek
1630 at the preceding two bytes to see whether we need to decrease
1631 the minimum length by four (six minus two).
1633 Thanks to the design of UTF-8, there cannot be false matches:
1634 A sequence of valid UTF-8 bytes cannot be a subsequence of
1635 another valid sequence of UTF-8 bytes.
1638 char * const s0 = STRING(scan), *s, *t;
1639 char * const s1 = s0 + STR_LEN(scan) - 1;
1640 char * const s2 = s1 - 4;
1641 const char * const t0 = "\xcc\x88\xcc\x81";
1642 const char * const t1 = t0 + 3;
1645 s < s2 && (t = ninstr(s, s1, t0, t1));
1647 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1648 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1655 n = scan + NODE_SZ_STR(scan);
1657 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1668 /* Follow the next-chain of the current node and optimize
1669 away all the NOTHINGs from it. */
1670 if (OP(scan) != CURLYX) {
1671 const int max = (reg_off_by_arg[OP(scan)]
1673 /* I32 may be smaller than U16 on CRAYs! */
1674 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1675 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1679 /* Skip NOTHING and LONGJMP. */
1680 while ((n = regnext(n))
1681 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1682 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1683 && off + noff < max)
1685 if (reg_off_by_arg[OP(scan)])
1688 NEXT_OFF(scan) = off;
1691 /* The principal pseudo-switch. Cannot be a switch, since we
1692 look into several different things. */
1693 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1694 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1695 next = regnext(scan);
1697 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1699 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1700 I32 max1 = 0, min1 = I32_MAX, num = 0;
1701 struct regnode_charclass_class accum;
1702 regnode *startbranch=scan;
1704 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1705 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1706 if (flags & SCF_DO_STCLASS)
1707 cl_init_zero(pRExC_state, &accum);
1709 while (OP(scan) == code) {
1710 I32 deltanext, minnext, f = 0, fake;
1711 struct regnode_charclass_class this_class;
1714 data_fake.flags = 0;
1716 data_fake.whilem_c = data->whilem_c;
1717 data_fake.last_closep = data->last_closep;
1720 data_fake.last_closep = &fake;
1721 next = regnext(scan);
1722 scan = NEXTOPER(scan);
1724 scan = NEXTOPER(scan);
1725 if (flags & SCF_DO_STCLASS) {
1726 cl_init(pRExC_state, &this_class);
1727 data_fake.start_class = &this_class;
1728 f = SCF_DO_STCLASS_AND;
1730 if (flags & SCF_WHILEM_VISITED_POS)
1731 f |= SCF_WHILEM_VISITED_POS;
1733 /* we suppose the run is continuous, last=next...*/
1734 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1735 next, &data_fake, f,depth+1);
1738 if (max1 < minnext + deltanext)
1739 max1 = minnext + deltanext;
1740 if (deltanext == I32_MAX)
1741 is_inf = is_inf_internal = 1;
1743 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1745 if (data && (data_fake.flags & SF_HAS_EVAL))
1746 data->flags |= SF_HAS_EVAL;
1748 data->whilem_c = data_fake.whilem_c;
1749 if (flags & SCF_DO_STCLASS)
1750 cl_or(pRExC_state, &accum, &this_class);
1751 if (code == SUSPEND)
1754 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1756 if (flags & SCF_DO_SUBSTR) {
1757 data->pos_min += min1;
1758 data->pos_delta += max1 - min1;
1759 if (max1 != min1 || is_inf)
1760 data->longest = &(data->longest_float);
1763 delta += max1 - min1;
1764 if (flags & SCF_DO_STCLASS_OR) {
1765 cl_or(pRExC_state, data->start_class, &accum);
1767 cl_and(data->start_class, &and_with);
1768 flags &= ~SCF_DO_STCLASS;
1771 else if (flags & SCF_DO_STCLASS_AND) {
1773 cl_and(data->start_class, &accum);
1774 flags &= ~SCF_DO_STCLASS;
1777 /* Switch to OR mode: cache the old value of
1778 * data->start_class */
1779 StructCopy(data->start_class, &and_with,
1780 struct regnode_charclass_class);
1781 flags &= ~SCF_DO_STCLASS_AND;
1782 StructCopy(&accum, data->start_class,
1783 struct regnode_charclass_class);
1784 flags |= SCF_DO_STCLASS_OR;
1785 data->start_class->flags |= ANYOF_EOS;
1791 Assuming this was/is a branch we are dealing with: 'scan' now
1792 points at the item that follows the branch sequence, whatever
1793 it is. We now start at the beginning of the sequence and look
1799 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1801 If we can find such a subseqence we need to turn the first
1802 element into a trie and then add the subsequent branch exact
1803 strings to the trie.
1807 1. patterns where the whole set of branch can be converted to a trie,
1809 2. patterns where only a subset of the alternations can be
1810 converted to a trie.
1812 In case 1 we can replace the whole set with a single regop
1813 for the trie. In case 2 we need to keep the start and end
1816 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1817 becomes BRANCH TRIE; BRANCH X;
1819 Hypthetically when we know the regex isnt anchored we can
1820 turn a case 1 into a DFA and let it rip... Every time it finds a match
1821 it would just call its tail, no WHILEM/CURLY needed.
1825 if (!re_trie_maxbuff) {
1826 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1827 if (!SvIOK(re_trie_maxbuff))
1828 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1830 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1832 regnode *first = (regnode *)NULL;
1833 regnode *last = (regnode *)NULL;
1834 regnode *tail = scan;
1839 SV * const mysv = sv_newmortal(); /* for dumping */
1841 /* var tail is used because there may be a TAIL
1842 regop in the way. Ie, the exacts will point to the
1843 thing following the TAIL, but the last branch will
1844 point at the TAIL. So we advance tail. If we
1845 have nested (?:) we may have to move through several
1849 while ( OP( tail ) == TAIL ) {
1850 /* this is the TAIL generated by (?:) */
1851 tail = regnext( tail );
1855 regprop( mysv, tail );
1856 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1857 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1858 (RExC_seen_evals) ? "[EVAL]" : ""
1863 step through the branches, cur represents each
1864 branch, noper is the first thing to be matched
1865 as part of that branch and noper_next is the
1866 regnext() of that node. if noper is an EXACT
1867 and noper_next is the same as scan (our current
1868 position in the regex) then the EXACT branch is
1869 a possible optimization target. Once we have
1870 two or more consequetive such branches we can
1871 create a trie of the EXACT's contents and stich
1872 it in place. If the sequence represents all of
1873 the branches we eliminate the whole thing and
1874 replace it with a single TRIE. If it is a
1875 subsequence then we need to stitch it in. This
1876 means the first branch has to remain, and needs
1877 to be repointed at the item on the branch chain
1878 following the last branch optimized. This could
1879 be either a BRANCH, in which case the
1880 subsequence is internal, or it could be the
1881 item following the branch sequence in which
1882 case the subsequence is at the end.
1886 /* dont use tail as the end marker for this traverse */
1887 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1888 regnode * const noper = NEXTOPER( cur );
1889 regnode * const noper_next = regnext( noper );
1892 regprop( mysv, cur);
1893 PerlIO_printf( Perl_debug_log, "%*s%s",
1894 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1896 regprop( mysv, noper);
1897 PerlIO_printf( Perl_debug_log, " -> %s",
1898 SvPV_nolen_const(mysv));
1901 regprop( mysv, noper_next );
1902 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1903 SvPV_nolen_const(mysv));
1905 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1908 if ( ( first ? OP( noper ) == optype
1909 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1910 && noper_next == tail && count<U16_MAX)
1915 optype = OP( noper );
1919 regprop( mysv, first);
1920 PerlIO_printf( Perl_debug_log, "%*s%s",
1921 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1922 regprop( mysv, NEXTOPER(first) );
1923 PerlIO_printf( Perl_debug_log, " -> %s\n",
1924 SvPV_nolen_const( mysv ) );
1929 regprop( mysv, cur);
1930 PerlIO_printf( Perl_debug_log, "%*s%s",
1931 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1932 regprop( mysv, noper );
1933 PerlIO_printf( Perl_debug_log, " -> %s\n",
1934 SvPV_nolen_const( mysv ) );
1940 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1941 (int)depth * 2 + 2, "E:", "**END**" );
1943 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1945 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1946 && noper_next == tail )
1950 optype = OP( noper );
1960 regprop( mysv, cur);
1961 PerlIO_printf( Perl_debug_log,
1962 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1963 " ", SvPV_nolen_const( mysv ), first, last, cur);
1968 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1969 (int)depth * 2 + 2, "E:", "==END==" );
1971 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1976 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1977 scan = NEXTOPER(NEXTOPER(scan));
1978 } else /* single branch is optimized. */
1979 scan = NEXTOPER(scan);
1982 else if (OP(scan) == EXACT) {
1983 I32 l = STR_LEN(scan);
1986 const U8 * const s = (U8*)STRING(scan);
1987 l = utf8_length(s, s + l);
1988 uc = utf8_to_uvchr(s, NULL);
1990 uc = *((U8*)STRING(scan));
1993 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1994 /* The code below prefers earlier match for fixed
1995 offset, later match for variable offset. */
1996 if (data->last_end == -1) { /* Update the start info. */
1997 data->last_start_min = data->pos_min;
1998 data->last_start_max = is_inf
1999 ? I32_MAX : data->pos_min + data->pos_delta;
2001 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2003 SV * const sv = data->last_found;
2004 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2005 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2006 if (mg && mg->mg_len >= 0)
2007 mg->mg_len += utf8_length((U8*)STRING(scan),
2008 (U8*)STRING(scan)+STR_LEN(scan));
2011 SvUTF8_on(data->last_found);
2012 data->last_end = data->pos_min + l;
2013 data->pos_min += l; /* As in the first entry. */
2014 data->flags &= ~SF_BEFORE_EOL;
2016 if (flags & SCF_DO_STCLASS_AND) {
2017 /* Check whether it is compatible with what we know already! */
2021 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2022 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2023 && (!(data->start_class->flags & ANYOF_FOLD)
2024 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2027 ANYOF_CLASS_ZERO(data->start_class);
2028 ANYOF_BITMAP_ZERO(data->start_class);
2030 ANYOF_BITMAP_SET(data->start_class, uc);
2031 data->start_class->flags &= ~ANYOF_EOS;
2033 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2035 else if (flags & SCF_DO_STCLASS_OR) {
2036 /* false positive possible if the class is case-folded */
2038 ANYOF_BITMAP_SET(data->start_class, uc);
2040 data->start_class->flags |= ANYOF_UNICODE_ALL;
2041 data->start_class->flags &= ~ANYOF_EOS;
2042 cl_and(data->start_class, &and_with);
2044 flags &= ~SCF_DO_STCLASS;
2046 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2047 I32 l = STR_LEN(scan);
2048 UV uc = *((U8*)STRING(scan));
2050 /* Search for fixed substrings supports EXACT only. */
2051 if (flags & SCF_DO_SUBSTR)
2052 scan_commit(pRExC_state, data);
2054 U8 *s = (U8 *)STRING(scan);
2055 l = utf8_length(s, s + l);
2056 uc = utf8_to_uvchr(s, NULL);
2059 if (data && (flags & SCF_DO_SUBSTR))
2061 if (flags & SCF_DO_STCLASS_AND) {
2062 /* Check whether it is compatible with what we know already! */
2066 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2067 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2068 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2070 ANYOF_CLASS_ZERO(data->start_class);
2071 ANYOF_BITMAP_ZERO(data->start_class);
2073 ANYOF_BITMAP_SET(data->start_class, uc);
2074 data->start_class->flags &= ~ANYOF_EOS;
2075 data->start_class->flags |= ANYOF_FOLD;
2076 if (OP(scan) == EXACTFL)
2077 data->start_class->flags |= ANYOF_LOCALE;
2080 else if (flags & SCF_DO_STCLASS_OR) {
2081 if (data->start_class->flags & ANYOF_FOLD) {
2082 /* false positive possible if the class is case-folded.
2083 Assume that the locale settings are the same... */
2085 ANYOF_BITMAP_SET(data->start_class, uc);
2086 data->start_class->flags &= ~ANYOF_EOS;
2088 cl_and(data->start_class, &and_with);
2090 flags &= ~SCF_DO_STCLASS;
2092 else if (strchr((const char*)PL_varies,OP(scan))) {
2093 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2094 I32 f = flags, pos_before = 0;
2095 regnode *oscan = scan;
2096 struct regnode_charclass_class this_class;
2097 struct regnode_charclass_class *oclass = NULL;
2098 I32 next_is_eval = 0;
2100 switch (PL_regkind[(U8)OP(scan)]) {
2101 case WHILEM: /* End of (?:...)* . */
2102 scan = NEXTOPER(scan);
2105 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2106 next = NEXTOPER(scan);
2107 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2109 maxcount = REG_INFTY;
2110 next = regnext(scan);
2111 scan = NEXTOPER(scan);
2115 if (flags & SCF_DO_SUBSTR)
2120 if (flags & SCF_DO_STCLASS) {
2122 maxcount = REG_INFTY;
2123 next = regnext(scan);
2124 scan = NEXTOPER(scan);
2127 is_inf = is_inf_internal = 1;
2128 scan = regnext(scan);
2129 if (flags & SCF_DO_SUBSTR) {
2130 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2131 data->longest = &(data->longest_float);
2133 goto optimize_curly_tail;
2135 mincount = ARG1(scan);
2136 maxcount = ARG2(scan);
2137 next = regnext(scan);
2138 if (OP(scan) == CURLYX) {
2139 I32 lp = (data ? *(data->last_closep) : 0);
2140 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2142 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2143 next_is_eval = (OP(scan) == EVAL);
2145 if (flags & SCF_DO_SUBSTR) {
2146 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2147 pos_before = data->pos_min;
2151 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2153 data->flags |= SF_IS_INF;
2155 if (flags & SCF_DO_STCLASS) {
2156 cl_init(pRExC_state, &this_class);
2157 oclass = data->start_class;
2158 data->start_class = &this_class;
2159 f |= SCF_DO_STCLASS_AND;
2160 f &= ~SCF_DO_STCLASS_OR;
2162 /* These are the cases when once a subexpression
2163 fails at a particular position, it cannot succeed
2164 even after backtracking at the enclosing scope.
2166 XXXX what if minimal match and we are at the
2167 initial run of {n,m}? */
2168 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2169 f &= ~SCF_WHILEM_VISITED_POS;
2171 /* This will finish on WHILEM, setting scan, or on NULL: */
2172 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2174 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2176 if (flags & SCF_DO_STCLASS)
2177 data->start_class = oclass;
2178 if (mincount == 0 || minnext == 0) {
2179 if (flags & SCF_DO_STCLASS_OR) {
2180 cl_or(pRExC_state, data->start_class, &this_class);
2182 else if (flags & SCF_DO_STCLASS_AND) {
2183 /* Switch to OR mode: cache the old value of
2184 * data->start_class */
2185 StructCopy(data->start_class, &and_with,
2186 struct regnode_charclass_class);
2187 flags &= ~SCF_DO_STCLASS_AND;
2188 StructCopy(&this_class, data->start_class,
2189 struct regnode_charclass_class);
2190 flags |= SCF_DO_STCLASS_OR;
2191 data->start_class->flags |= ANYOF_EOS;
2193 } else { /* Non-zero len */
2194 if (flags & SCF_DO_STCLASS_OR) {
2195 cl_or(pRExC_state, data->start_class, &this_class);
2196 cl_and(data->start_class, &and_with);
2198 else if (flags & SCF_DO_STCLASS_AND)
2199 cl_and(data->start_class, &this_class);
2200 flags &= ~SCF_DO_STCLASS;
2202 if (!scan) /* It was not CURLYX, but CURLY. */
2204 if ( /* ? quantifier ok, except for (?{ ... }) */
2205 (next_is_eval || !(mincount == 0 && maxcount == 1))
2206 && (minnext == 0) && (deltanext == 0)
2207 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2208 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2209 && ckWARN(WARN_REGEXP))
2212 "Quantifier unexpected on zero-length expression");
2215 min += minnext * mincount;
2216 is_inf_internal |= ((maxcount == REG_INFTY
2217 && (minnext + deltanext) > 0)
2218 || deltanext == I32_MAX);
2219 is_inf |= is_inf_internal;
2220 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2222 /* Try powerful optimization CURLYX => CURLYN. */
2223 if ( OP(oscan) == CURLYX && data
2224 && data->flags & SF_IN_PAR
2225 && !(data->flags & SF_HAS_EVAL)
2226 && !deltanext && minnext == 1 ) {
2227 /* Try to optimize to CURLYN. */
2228 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2229 regnode *nxt1 = nxt;
2236 if (!strchr((const char*)PL_simple,OP(nxt))
2237 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2238 && STR_LEN(nxt) == 1))
2244 if (OP(nxt) != CLOSE)
2246 /* Now we know that nxt2 is the only contents: */
2247 oscan->flags = (U8)ARG(nxt);
2249 OP(nxt1) = NOTHING; /* was OPEN. */
2251 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2252 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2253 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2254 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2255 OP(nxt + 1) = OPTIMIZED; /* was count. */
2256 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2261 /* Try optimization CURLYX => CURLYM. */
2262 if ( OP(oscan) == CURLYX && data
2263 && !(data->flags & SF_HAS_PAR)
2264 && !(data->flags & SF_HAS_EVAL)
2265 && !deltanext /* atom is fixed width */
2266 && minnext != 0 /* CURLYM can't handle zero width */
2268 /* XXXX How to optimize if data == 0? */
2269 /* Optimize to a simpler form. */
2270 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2274 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2275 && (OP(nxt2) != WHILEM))
2277 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2278 /* Need to optimize away parenths. */
2279 if (data->flags & SF_IN_PAR) {
2280 /* Set the parenth number. */
2281 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2283 if (OP(nxt) != CLOSE)
2284 FAIL("Panic opt close");
2285 oscan->flags = (U8)ARG(nxt);
2286 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2287 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2289 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2290 OP(nxt + 1) = OPTIMIZED; /* was count. */
2291 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2292 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2295 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2296 regnode *nnxt = regnext(nxt1);
2299 if (reg_off_by_arg[OP(nxt1)])
2300 ARG_SET(nxt1, nxt2 - nxt1);
2301 else if (nxt2 - nxt1 < U16_MAX)
2302 NEXT_OFF(nxt1) = nxt2 - nxt1;
2304 OP(nxt) = NOTHING; /* Cannot beautify */
2309 /* Optimize again: */
2310 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2316 else if ((OP(oscan) == CURLYX)
2317 && (flags & SCF_WHILEM_VISITED_POS)
2318 /* See the comment on a similar expression above.
2319 However, this time it not a subexpression
2320 we care about, but the expression itself. */
2321 && (maxcount == REG_INFTY)
2322 && data && ++data->whilem_c < 16) {
2323 /* This stays as CURLYX, we can put the count/of pair. */
2324 /* Find WHILEM (as in regexec.c) */
2325 regnode *nxt = oscan + NEXT_OFF(oscan);
2327 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2329 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2330 | (RExC_whilem_seen << 4)); /* On WHILEM */
2332 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2334 if (flags & SCF_DO_SUBSTR) {
2335 SV *last_str = NULL;
2336 int counted = mincount != 0;
2338 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2339 #if defined(SPARC64_GCC_WORKAROUND)
2342 const char *s = NULL;
2345 if (pos_before >= data->last_start_min)
2348 b = data->last_start_min;
2351 s = SvPV_const(data->last_found, l);
2352 old = b - data->last_start_min;
2355 I32 b = pos_before >= data->last_start_min
2356 ? pos_before : data->last_start_min;
2358 const char *s = SvPV_const(data->last_found, l);
2359 I32 old = b - data->last_start_min;
2363 old = utf8_hop((U8*)s, old) - (U8*)s;
2366 /* Get the added string: */
2367 last_str = newSVpvn(s + old, l);
2369 SvUTF8_on(last_str);
2370 if (deltanext == 0 && pos_before == b) {
2371 /* What was added is a constant string */
2373 SvGROW(last_str, (mincount * l) + 1);
2374 repeatcpy(SvPVX(last_str) + l,
2375 SvPVX_const(last_str), l, mincount - 1);
2376 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2377 /* Add additional parts. */
2378 SvCUR_set(data->last_found,
2379 SvCUR(data->last_found) - l);
2380 sv_catsv(data->last_found, last_str);
2382 SV * sv = data->last_found;
2384 SvUTF8(sv) && SvMAGICAL(sv) ?
2385 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2386 if (mg && mg->mg_len >= 0)
2387 mg->mg_len += CHR_SVLEN(last_str);
2389 data->last_end += l * (mincount - 1);
2392 /* start offset must point into the last copy */
2393 data->last_start_min += minnext * (mincount - 1);
2394 data->last_start_max += is_inf ? I32_MAX
2395 : (maxcount - 1) * (minnext + data->pos_delta);
2398 /* It is counted once already... */
2399 data->pos_min += minnext * (mincount - counted);
2400 data->pos_delta += - counted * deltanext +
2401 (minnext + deltanext) * maxcount - minnext * mincount;
2402 if (mincount != maxcount) {
2403 /* Cannot extend fixed substrings found inside
2405 scan_commit(pRExC_state,data);
2406 if (mincount && last_str) {
2407 sv_setsv(data->last_found, last_str);
2408 data->last_end = data->pos_min;
2409 data->last_start_min =
2410 data->pos_min - CHR_SVLEN(last_str);
2411 data->last_start_max = is_inf
2413 : data->pos_min + data->pos_delta
2414 - CHR_SVLEN(last_str);
2416 data->longest = &(data->longest_float);
2418 SvREFCNT_dec(last_str);
2420 if (data && (fl & SF_HAS_EVAL))
2421 data->flags |= SF_HAS_EVAL;
2422 optimize_curly_tail:
2423 if (OP(oscan) != CURLYX) {
2424 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2426 NEXT_OFF(oscan) += NEXT_OFF(next);
2429 default: /* REF and CLUMP only? */
2430 if (flags & SCF_DO_SUBSTR) {
2431 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2432 data->longest = &(data->longest_float);
2434 is_inf = is_inf_internal = 1;
2435 if (flags & SCF_DO_STCLASS_OR)
2436 cl_anything(pRExC_state, data->start_class);
2437 flags &= ~SCF_DO_STCLASS;
2441 else if (strchr((const char*)PL_simple,OP(scan))) {
2444 if (flags & SCF_DO_SUBSTR) {
2445 scan_commit(pRExC_state,data);
2449 if (flags & SCF_DO_STCLASS) {
2450 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2452 /* Some of the logic below assumes that switching
2453 locale on will only add false positives. */
2454 switch (PL_regkind[(U8)OP(scan)]) {
2458 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2459 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2460 cl_anything(pRExC_state, data->start_class);
2463 if (OP(scan) == SANY)
2465 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2466 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2467 || (data->start_class->flags & ANYOF_CLASS));
2468 cl_anything(pRExC_state, data->start_class);
2470 if (flags & SCF_DO_STCLASS_AND || !value)
2471 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2474 if (flags & SCF_DO_STCLASS_AND)
2475 cl_and(data->start_class,
2476 (struct regnode_charclass_class*)scan);
2478 cl_or(pRExC_state, data->start_class,
2479 (struct regnode_charclass_class*)scan);
2482 if (flags & SCF_DO_STCLASS_AND) {
2483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2485 for (value = 0; value < 256; value++)
2486 if (!isALNUM(value))
2487 ANYOF_BITMAP_CLEAR(data->start_class, value);
2491 if (data->start_class->flags & ANYOF_LOCALE)
2492 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2494 for (value = 0; value < 256; value++)
2496 ANYOF_BITMAP_SET(data->start_class, value);
2501 if (flags & SCF_DO_STCLASS_AND) {
2502 if (data->start_class->flags & ANYOF_LOCALE)
2503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2507 data->start_class->flags |= ANYOF_LOCALE;
2511 if (flags & SCF_DO_STCLASS_AND) {
2512 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2513 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2514 for (value = 0; value < 256; value++)
2516 ANYOF_BITMAP_CLEAR(data->start_class, value);
2520 if (data->start_class->flags & ANYOF_LOCALE)
2521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2523 for (value = 0; value < 256; value++)
2524 if (!isALNUM(value))
2525 ANYOF_BITMAP_SET(data->start_class, value);
2530 if (flags & SCF_DO_STCLASS_AND) {
2531 if (data->start_class->flags & ANYOF_LOCALE)
2532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2535 data->start_class->flags |= ANYOF_LOCALE;
2536 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2540 if (flags & SCF_DO_STCLASS_AND) {
2541 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2542 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2543 for (value = 0; value < 256; value++)
2544 if (!isSPACE(value))
2545 ANYOF_BITMAP_CLEAR(data->start_class, value);
2549 if (data->start_class->flags & ANYOF_LOCALE)
2550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2552 for (value = 0; value < 256; value++)
2554 ANYOF_BITMAP_SET(data->start_class, value);
2559 if (flags & SCF_DO_STCLASS_AND) {
2560 if (data->start_class->flags & ANYOF_LOCALE)
2561 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2564 data->start_class->flags |= ANYOF_LOCALE;
2565 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2569 if (flags & SCF_DO_STCLASS_AND) {
2570 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2571 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2572 for (value = 0; value < 256; value++)
2574 ANYOF_BITMAP_CLEAR(data->start_class, value);
2578 if (data->start_class->flags & ANYOF_LOCALE)
2579 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2581 for (value = 0; value < 256; value++)
2582 if (!isSPACE(value))
2583 ANYOF_BITMAP_SET(data->start_class, value);
2588 if (flags & SCF_DO_STCLASS_AND) {
2589 if (data->start_class->flags & ANYOF_LOCALE) {
2590 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2591 for (value = 0; value < 256; value++)
2592 if (!isSPACE(value))
2593 ANYOF_BITMAP_CLEAR(data->start_class, value);
2597 data->start_class->flags |= ANYOF_LOCALE;
2598 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2602 if (flags & SCF_DO_STCLASS_AND) {
2603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2604 for (value = 0; value < 256; value++)
2605 if (!isDIGIT(value))
2606 ANYOF_BITMAP_CLEAR(data->start_class, value);
2609 if (data->start_class->flags & ANYOF_LOCALE)
2610 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2612 for (value = 0; value < 256; value++)
2614 ANYOF_BITMAP_SET(data->start_class, value);
2619 if (flags & SCF_DO_STCLASS_AND) {
2620 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2621 for (value = 0; value < 256; value++)
2623 ANYOF_BITMAP_CLEAR(data->start_class, value);
2626 if (data->start_class->flags & ANYOF_LOCALE)
2627 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2629 for (value = 0; value < 256; value++)
2630 if (!isDIGIT(value))
2631 ANYOF_BITMAP_SET(data->start_class, value);
2636 if (flags & SCF_DO_STCLASS_OR)
2637 cl_and(data->start_class, &and_with);
2638 flags &= ~SCF_DO_STCLASS;
2641 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2642 data->flags |= (OP(scan) == MEOL
2646 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2647 /* Lookbehind, or need to calculate parens/evals/stclass: */
2648 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2649 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2650 /* Lookahead/lookbehind */
2651 I32 deltanext, minnext, fake = 0;
2653 struct regnode_charclass_class intrnl;
2656 data_fake.flags = 0;
2658 data_fake.whilem_c = data->whilem_c;
2659 data_fake.last_closep = data->last_closep;
2662 data_fake.last_closep = &fake;
2663 if ( flags & SCF_DO_STCLASS && !scan->flags
2664 && OP(scan) == IFMATCH ) { /* Lookahead */
2665 cl_init(pRExC_state, &intrnl);
2666 data_fake.start_class = &intrnl;
2667 f |= SCF_DO_STCLASS_AND;
2669 if (flags & SCF_WHILEM_VISITED_POS)
2670 f |= SCF_WHILEM_VISITED_POS;
2671 next = regnext(scan);
2672 nscan = NEXTOPER(NEXTOPER(scan));
2673 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2676 vFAIL("Variable length lookbehind not implemented");
2678 else if (minnext > U8_MAX) {
2679 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2681 scan->flags = (U8)minnext;
2683 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2685 if (data && (data_fake.flags & SF_HAS_EVAL))
2686 data->flags |= SF_HAS_EVAL;
2688 data->whilem_c = data_fake.whilem_c;
2689 if (f & SCF_DO_STCLASS_AND) {
2690 const int was = (data->start_class->flags & ANYOF_EOS);
2692 cl_and(data->start_class, &intrnl);
2694 data->start_class->flags |= ANYOF_EOS;
2697 else if (OP(scan) == OPEN) {
2700 else if (OP(scan) == CLOSE) {
2701 if ((I32)ARG(scan) == is_par) {
2702 next = regnext(scan);
2704 if ( next && (OP(next) != WHILEM) && next < last)
2705 is_par = 0; /* Disable optimization */
2708 *(data->last_closep) = ARG(scan);
2710 else if (OP(scan) == EVAL) {
2712 data->flags |= SF_HAS_EVAL;
2714 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2715 if (flags & SCF_DO_SUBSTR) {
2716 scan_commit(pRExC_state,data);
2717 data->longest = &(data->longest_float);
2719 is_inf = is_inf_internal = 1;
2720 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2721 cl_anything(pRExC_state, data->start_class);
2722 flags &= ~SCF_DO_STCLASS;
2724 /* Else: zero-length, ignore. */
2725 scan = regnext(scan);
2730 *deltap = is_inf_internal ? I32_MAX : delta;
2731 if (flags & SCF_DO_SUBSTR && is_inf)
2732 data->pos_delta = I32_MAX - data->pos_min;
2733 if (is_par > U8_MAX)
2735 if (is_par && pars==1 && data) {
2736 data->flags |= SF_IN_PAR;
2737 data->flags &= ~SF_HAS_PAR;
2739 else if (pars && data) {
2740 data->flags |= SF_HAS_PAR;
2741 data->flags &= ~SF_IN_PAR;
2743 if (flags & SCF_DO_STCLASS_OR)
2744 cl_and(data->start_class, &and_with);
2749 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2751 if (RExC_rx->data) {
2752 Renewc(RExC_rx->data,
2753 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2754 char, struct reg_data);
2755 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2756 RExC_rx->data->count += n;
2759 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2760 char, struct reg_data);
2761 Newx(RExC_rx->data->what, n, U8);
2762 RExC_rx->data->count = n;
2764 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2765 return RExC_rx->data->count - n;
2769 Perl_reginitcolors(pTHX)
2772 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2774 char *t = savepv(s);
2778 t = strchr(t, '\t');
2784 PL_colors[i] = t = (char *)"";
2789 PL_colors[i++] = (char *)"";
2796 - pregcomp - compile a regular expression into internal code
2798 * We can't allocate space until we know how big the compiled form will be,
2799 * but we can't compile it (and thus know how big it is) until we've got a
2800 * place to put the code. So we cheat: we compile it twice, once with code
2801 * generation turned off and size counting turned on, and once "for real".
2802 * This also means that we don't allocate space until we are sure that the
2803 * thing really will compile successfully, and we never have to move the
2804 * code and thus invalidate pointers into it. (Note that it has to be in
2805 * one piece because free() must be able to free it all.) [NB: not true in perl]
2807 * Beware that the optimization-preparation code in here knows about some
2808 * of the structure of the compiled regexp. [I'll say.]
2811 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2822 RExC_state_t RExC_state;
2823 RExC_state_t *pRExC_state = &RExC_state;
2825 GET_RE_DEBUG_FLAGS_DECL;
2828 FAIL("NULL regexp argument");
2830 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2833 DEBUG_r(if (!PL_colorset) reginitcolors());
2835 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2836 PL_colors[4],PL_colors[5],PL_colors[0],
2837 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2839 RExC_flags = pm->op_pmflags;
2843 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2844 RExC_seen_evals = 0;
2847 /* First pass: determine size, legality. */
2854 RExC_emit = &PL_regdummy;
2855 RExC_whilem_seen = 0;
2856 #if 0 /* REGC() is (currently) a NOP at the first pass.
2857 * Clever compilers notice this and complain. --jhi */
2858 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2860 if (reg(pRExC_state, 0, &flags) == NULL) {
2861 RExC_precomp = NULL;
2864 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2866 /* Small enough for pointer-storage convention?
2867 If extralen==0, this means that we will not need long jumps. */
2868 if (RExC_size >= 0x10000L && RExC_extralen)
2869 RExC_size += RExC_extralen;
2872 if (RExC_whilem_seen > 15)
2873 RExC_whilem_seen = 15;
2875 /* Allocate space and initialize. */
2876 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2879 FAIL("Regexp out of space");
2882 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2883 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2886 r->prelen = xend - exp;
2887 r->precomp = savepvn(RExC_precomp, r->prelen);
2889 #ifdef PERL_OLD_COPY_ON_WRITE
2890 r->saved_copy = NULL;
2892 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2893 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2895 r->substrs = 0; /* Useful during FAIL. */
2896 r->startp = 0; /* Useful during FAIL. */
2897 r->endp = 0; /* Useful during FAIL. */
2899 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2901 r->offsets[0] = RExC_size;
2903 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2904 "%s %"UVuf" bytes for offset annotations.\n",
2905 r->offsets ? "Got" : "Couldn't get",
2906 (UV)((2*RExC_size+1) * sizeof(U32))));
2910 /* Second pass: emit code. */
2911 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2916 RExC_emit_start = r->program;
2917 RExC_emit = r->program;
2918 /* Store the count of eval-groups for security checks: */
2919 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2920 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2922 if (reg(pRExC_state, 0, &flags) == NULL)
2926 /* Dig out information for optimizations. */
2927 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2928 pm->op_pmflags = RExC_flags;
2930 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2931 r->regstclass = NULL;
2932 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2933 r->reganch |= ROPT_NAUGHTY;
2934 scan = r->program + 1; /* First BRANCH. */
2936 /* XXXX To minimize changes to RE engine we always allocate
2937 3-units-long substrs field. */
2938 Newxz(r->substrs, 1, struct reg_substr_data);
2940 StructCopy(&zero_scan_data, &data, scan_data_t);
2941 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2942 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2944 STRLEN longest_float_length, longest_fixed_length;
2945 struct regnode_charclass_class ch_class;
2950 /* Skip introductions and multiplicators >= 1. */
2951 while ((OP(first) == OPEN && (sawopen = 1)) ||
2952 /* An OR of *one* alternative - should not happen now. */
2953 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2954 (OP(first) == PLUS) ||
2955 (OP(first) == MINMOD) ||
2956 /* An {n,m} with n>0 */
2957 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2958 if (OP(first) == PLUS)
2961 first += regarglen[(U8)OP(first)];
2962 first = NEXTOPER(first);
2965 /* Starting-point info. */
2967 if (PL_regkind[(U8)OP(first)] == EXACT) {
2968 if (OP(first) == EXACT)
2969 ; /* Empty, get anchored substr later. */
2970 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2971 r->regstclass = first;
2973 else if (strchr((const char*)PL_simple,OP(first)))
2974 r->regstclass = first;
2975 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2976 PL_regkind[(U8)OP(first)] == NBOUND)
2977 r->regstclass = first;
2978 else if (PL_regkind[(U8)OP(first)] == BOL) {
2979 r->reganch |= (OP(first) == MBOL
2981 : (OP(first) == SBOL
2984 first = NEXTOPER(first);
2987 else if (OP(first) == GPOS) {
2988 r->reganch |= ROPT_ANCH_GPOS;
2989 first = NEXTOPER(first);
2992 else if (!sawopen && (OP(first) == STAR &&
2993 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2994 !(r->reganch & ROPT_ANCH) )
2996 /* turn .* into ^.* with an implied $*=1 */
2998 (OP(NEXTOPER(first)) == REG_ANY)
3001 r->reganch |= type | ROPT_IMPLICIT;
3002 first = NEXTOPER(first);
3005 if (sawplus && (!sawopen || !RExC_sawback)
3006 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3007 /* x+ must match at the 1st pos of run of x's */
3008 r->reganch |= ROPT_SKIP;
3010 /* Scan is after the zeroth branch, first is atomic matcher. */
3011 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3012 (IV)(first - scan + 1)));
3014 * If there's something expensive in the r.e., find the
3015 * longest literal string that must appear and make it the
3016 * regmust. Resolve ties in favor of later strings, since
3017 * the regstart check works with the beginning of the r.e.
3018 * and avoiding duplication strengthens checking. Not a
3019 * strong reason, but sufficient in the absence of others.
3020 * [Now we resolve ties in favor of the earlier string if
3021 * it happens that c_offset_min has been invalidated, since the
3022 * earlier string may buy us something the later one won't.]
3026 data.longest_fixed = newSVpvs("");
3027 data.longest_float = newSVpvs("");
3028 data.last_found = newSVpvs("");
3029 data.longest = &(data.longest_fixed);
3031 if (!r->regstclass) {
3032 cl_init(pRExC_state, &ch_class);
3033 data.start_class = &ch_class;
3034 stclass_flag = SCF_DO_STCLASS_AND;
3035 } else /* XXXX Check for BOUND? */
3037 data.last_closep = &last_close;
3039 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3040 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3041 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3042 && data.last_start_min == 0 && data.last_end > 0
3043 && !RExC_seen_zerolen
3044 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3045 r->reganch |= ROPT_CHECK_ALL;
3046 scan_commit(pRExC_state, &data);
3047 SvREFCNT_dec(data.last_found);
3049 longest_float_length = CHR_SVLEN(data.longest_float);
3050 if (longest_float_length
3051 || (data.flags & SF_FL_BEFORE_EOL
3052 && (!(data.flags & SF_FL_BEFORE_MEOL)
3053 || (RExC_flags & PMf_MULTILINE)))) {
3056 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3057 && data.offset_fixed == data.offset_float_min
3058 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3059 goto remove_float; /* As in (a)+. */
3061 if (SvUTF8(data.longest_float)) {
3062 r->float_utf8 = data.longest_float;
3063 r->float_substr = NULL;
3065 r->float_substr = data.longest_float;
3066 r->float_utf8 = NULL;
3068 r->float_min_offset = data.offset_float_min;
3069 r->float_max_offset = data.offset_float_max;
3070 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3071 && (!(data.flags & SF_FL_BEFORE_MEOL)
3072 || (RExC_flags & PMf_MULTILINE)));
3073 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3077 r->float_substr = r->float_utf8 = NULL;
3078 SvREFCNT_dec(data.longest_float);
3079 longest_float_length = 0;
3082 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3083 if (longest_fixed_length
3084 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3086 || (RExC_flags & PMf_MULTILINE)))) {
3089 if (SvUTF8(data.longest_fixed)) {
3090 r->anchored_utf8 = data.longest_fixed;
3091 r->anchored_substr = NULL;
3093 r->anchored_substr = data.longest_fixed;
3094 r->anchored_utf8 = NULL;
3096 r->anchored_offset = data.offset_fixed;
3097 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3099 || (RExC_flags & PMf_MULTILINE)));
3100 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3103 r->anchored_substr = r->anchored_utf8 = NULL;
3104 SvREFCNT_dec(data.longest_fixed);
3105 longest_fixed_length = 0;
3108 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3109 r->regstclass = NULL;
3110 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3112 && !(data.start_class->flags & ANYOF_EOS)
3113 && !cl_is_anything(data.start_class))
3115 const I32 n = add_data(pRExC_state, 1, "f");
3117 Newx(RExC_rx->data->data[n], 1,
3118 struct regnode_charclass_class);
3119 StructCopy(data.start_class,
3120 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3121 struct regnode_charclass_class);
3122 r->regstclass = (regnode*)RExC_rx->data->data[n];
3123 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3124 PL_regdata = r->data; /* for regprop() */
3125 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3126 regprop(sv, (regnode*)data.start_class);
3127 PerlIO_printf(Perl_debug_log,
3128 "synthetic stclass \"%s\".\n",
3129 SvPVX_const(sv));});
3132 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3133 if (longest_fixed_length > longest_float_length) {
3134 r->check_substr = r->anchored_substr;
3135 r->check_utf8 = r->anchored_utf8;
3136 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3137 if (r->reganch & ROPT_ANCH_SINGLE)
3138 r->reganch |= ROPT_NOSCAN;
3141 r->check_substr = r->float_substr;
3142 r->check_utf8 = r->float_utf8;
3143 r->check_offset_min = data.offset_float_min;
3144 r->check_offset_max = data.offset_float_max;
3146 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3147 This should be changed ASAP! */
3148 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3149 r->reganch |= RE_USE_INTUIT;
3150 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3151 r->reganch |= RE_INTUIT_TAIL;
3155 /* Several toplevels. Best we can is to set minlen. */
3157 struct regnode_charclass_class ch_class;
3160 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3161 scan = r->program + 1;
3162 cl_init(pRExC_state, &ch_class);
3163 data.start_class = &ch_class;
3164 data.last_closep = &last_close;
3165 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3166 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3167 = r->float_substr = r->float_utf8 = NULL;
3168 if (!(data.start_class->flags & ANYOF_EOS)
3169 && !cl_is_anything(data.start_class))
3171 const I32 n = add_data(pRExC_state, 1, "f");
3173 Newx(RExC_rx->data->data[n], 1,
3174 struct regnode_charclass_class);
3175 StructCopy(data.start_class,
3176 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3177 struct regnode_charclass_class);
3178 r->regstclass = (regnode*)RExC_rx->data->data[n];
3179 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3180 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3181 regprop(sv, (regnode*)data.start_class);
3182 PerlIO_printf(Perl_debug_log,
3183 "synthetic stclass \"%s\".\n",
3184 SvPVX_const(sv));});
3189 if (RExC_seen & REG_SEEN_GPOS)
3190 r->reganch |= ROPT_GPOS_SEEN;
3191 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3192 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3193 if (RExC_seen & REG_SEEN_EVAL)
3194 r->reganch |= ROPT_EVAL_SEEN;
3195 if (RExC_seen & REG_SEEN_CANY)
3196 r->reganch |= ROPT_CANY_SEEN;
3197 Newxz(r->startp, RExC_npar, I32);
3198 Newxz(r->endp, RExC_npar, I32);
3199 PL_regdata = r->data; /* for regprop() */
3200 DEBUG_COMPILE_r(regdump(r));
3205 - reg - regular expression, i.e. main body or parenthesized thing
3207 * Caller must absorb opening parenthesis.
3209 * Combining parenthesis handling with the base level of regular expression
3210 * is a trifle forced, but the need to tie the tails of the branches to what
3211 * follows makes it hard to avoid.
3214 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3215 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3218 register regnode *ret; /* Will be the head of the group. */
3219 register regnode *br;
3220 register regnode *lastbr;
3221 register regnode *ender = NULL;
3222 register I32 parno = 0;
3224 const I32 oregflags = RExC_flags;
3225 I32 have_branch = 0;
3228 /* for (?g), (?gc), and (?o) warnings; warning
3229 about (?c) will warn about (?g) -- japhy */
3231 I32 wastedflags = 0x00;
3232 const I32 wasted_o = 0x01;
3233 const I32 wasted_g = 0x02;
3234 const I32 wasted_gc = 0x02 | 0x04;
3235 const I32 wasted_c = 0x04;
3237 char * parse_start = RExC_parse; /* MJD */
3238 char * const oregcomp_parse = RExC_parse;
3241 *flagp = 0; /* Tentatively. */
3244 /* Make an OPEN node, if parenthesized. */
3246 if (*RExC_parse == '?') { /* (?...) */
3247 U32 posflags = 0, negflags = 0;
3248 U32 *flagsp = &posflags;
3250 const char * const seqstart = RExC_parse;
3253 paren = *RExC_parse++;
3254 ret = NULL; /* For look-ahead/behind. */
3256 case '<': /* (?<...) */
3257 RExC_seen |= REG_SEEN_LOOKBEHIND;
3258 if (*RExC_parse == '!')
3260 if (*RExC_parse != '=' && *RExC_parse != '!')
3263 case '=': /* (?=...) */
3264 case '!': /* (?!...) */
3265 RExC_seen_zerolen++;
3266 case ':': /* (?:...) */
3267 case '>': /* (?>...) */
3269 case '$': /* (?$...) */
3270 case '@': /* (?@...) */
3271 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3273 case '#': /* (?#...) */
3274 while (*RExC_parse && *RExC_parse != ')')
3276 if (*RExC_parse != ')')
3277 FAIL("Sequence (?#... not terminated");
3278 nextchar(pRExC_state);
3281 case 'p': /* (?p...) */
3282 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3283 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3285 case '?': /* (??...) */
3287 if (*RExC_parse != '{')
3289 paren = *RExC_parse++;
3291 case '{': /* (?{...}) */
3293 I32 count = 1, n = 0;
3295 char *s = RExC_parse;
3297 OP_4tree *sop, *rop;
3299 RExC_seen_zerolen++;
3300 RExC_seen |= REG_SEEN_EVAL;
3301 while (count && (c = *RExC_parse)) {
3302 if (c == '\\' && RExC_parse[1])
3310 if (*RExC_parse != ')')
3313 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3318 if (RExC_parse - 1 - s)
3319 sv = newSVpvn(s, RExC_parse - 1 - s);
3324 Perl_save_re_context(aTHX);
3325 rop = sv_compile_2op(sv, &sop, "re", &pad);
3326 sop->op_private |= OPpREFCOUNTED;
3327 /* re_dup will OpREFCNT_inc */
3328 OpREFCNT_set(sop, 1);
3331 n = add_data(pRExC_state, 3, "nop");
3332 RExC_rx->data->data[n] = (void*)rop;
3333 RExC_rx->data->data[n+1] = (void*)sop;
3334 RExC_rx->data->data[n+2] = (void*)pad;
3337 else { /* First pass */
3338 if (PL_reginterp_cnt < ++RExC_seen_evals
3340 /* No compiled RE interpolated, has runtime
3341 components ===> unsafe. */
3342 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3343 if (PL_tainting && PL_tainted)
3344 FAIL("Eval-group in insecure regular expression");
3345 if (IN_PERL_COMPILETIME)
3349 nextchar(pRExC_state);
3351 ret = reg_node(pRExC_state, LOGICAL);
3354 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3355 /* deal with the length of this later - MJD */
3358 ret = reganode(pRExC_state, EVAL, n);
3359 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3360 Set_Node_Offset(ret, parse_start);
3363 case '(': /* (?(?{...})...) and (?(?=...)...) */
3365 if (RExC_parse[0] == '?') { /* (?(?...)) */
3366 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3367 || RExC_parse[1] == '<'
3368 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3371 ret = reg_node(pRExC_state, LOGICAL);
3374 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3378 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3380 parno = atoi(RExC_parse++);
3382 while (isDIGIT(*RExC_parse))
3384 ret = reganode(pRExC_state, GROUPP, parno);
3386 if ((c = *nextchar(pRExC_state)) != ')')
3387 vFAIL("Switch condition not recognized");
3389 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3390 br = regbranch(pRExC_state, &flags, 1);
3392 br = reganode(pRExC_state, LONGJMP, 0);
3394 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3395 c = *nextchar(pRExC_state);
3399 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3400 regbranch(pRExC_state, &flags, 1);
3401 regtail(pRExC_state, ret, lastbr);
3404 c = *nextchar(pRExC_state);
3409 vFAIL("Switch (?(condition)... contains too many branches");
3410 ender = reg_node(pRExC_state, TAIL);
3411 regtail(pRExC_state, br, ender);
3413 regtail(pRExC_state, lastbr, ender);
3414 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3417 regtail(pRExC_state, ret, ender);
3421 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3425 RExC_parse--; /* for vFAIL to print correctly */
3426 vFAIL("Sequence (? incomplete");
3430 parse_flags: /* (?i) */
3431 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3432 /* (?g), (?gc) and (?o) are useless here
3433 and must be globally applied -- japhy */
3435 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3436 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3437 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3438 if (! (wastedflags & wflagbit) ) {
3439 wastedflags |= wflagbit;
3442 "Useless (%s%c) - %suse /%c modifier",
3443 flagsp == &negflags ? "?-" : "?",
3445 flagsp == &negflags ? "don't " : "",
3451 else if (*RExC_parse == 'c') {
3452 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3453 if (! (wastedflags & wasted_c) ) {
3454 wastedflags |= wasted_gc;
3457 "Useless (%sc) - %suse /gc modifier",
3458 flagsp == &negflags ? "?-" : "?",
3459 flagsp == &negflags ? "don't " : ""
3464 else { pmflag(flagsp, *RExC_parse); }
3468 if (*RExC_parse == '-') {
3470 wastedflags = 0; /* reset so (?g-c) warns twice */
3474 RExC_flags |= posflags;
3475 RExC_flags &= ~negflags;
3476 if (*RExC_parse == ':') {
3482 if (*RExC_parse != ')') {
3484 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3486 nextchar(pRExC_state);
3494 ret = reganode(pRExC_state, OPEN, parno);
3495 Set_Node_Length(ret, 1); /* MJD */
3496 Set_Node_Offset(ret, RExC_parse); /* MJD */
3503 /* Pick up the branches, linking them together. */
3504 parse_start = RExC_parse; /* MJD */
3505 br = regbranch(pRExC_state, &flags, 1);
3506 /* branch_len = (paren != 0); */
3510 if (*RExC_parse == '|') {
3511 if (!SIZE_ONLY && RExC_extralen) {
3512 reginsert(pRExC_state, BRANCHJ, br);
3515 reginsert(pRExC_state, BRANCH, br);
3516 Set_Node_Length(br, paren != 0);
3517 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3521 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3523 else if (paren == ':') {
3524 *flagp |= flags&SIMPLE;
3526 if (open) { /* Starts with OPEN. */
3527 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3529 else if (paren != '?') /* Not Conditional */
3531 *flagp |= flags & (SPSTART | HASWIDTH);
3533 while (*RExC_parse == '|') {
3534 if (!SIZE_ONLY && RExC_extralen) {
3535 ender = reganode(pRExC_state, LONGJMP,0);
3536 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3539 RExC_extralen += 2; /* Account for LONGJMP. */
3540 nextchar(pRExC_state);
3541 br = regbranch(pRExC_state, &flags, 0);
3545 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3549 *flagp |= flags&SPSTART;
3552 if (have_branch || paren != ':') {
3553 /* Make a closing node, and hook it on the end. */
3556 ender = reg_node(pRExC_state, TAIL);
3559 ender = reganode(pRExC_state, CLOSE, parno);
3560 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3561 Set_Node_Length(ender,1); /* MJD */
3567 *flagp &= ~HASWIDTH;
3570 ender = reg_node(pRExC_state, SUCCEED);
3573 ender = reg_node(pRExC_state, END);
3576 regtail(pRExC_state, lastbr, ender);
3579 /* Hook the tails of the branches to the closing node. */
3580 for (br = ret; br != NULL; br = regnext(br)) {
3581 regoptail(pRExC_state, br, ender);
3588 static const char parens[] = "=!<,>";
3590 if (paren && (p = strchr(parens, paren))) {
3591 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3592 int flag = (p - parens) > 1;
3595 node = SUSPEND, flag = 0;
3596 reginsert(pRExC_state, node,ret);
3597 Set_Node_Cur_Length(ret);
3598 Set_Node_Offset(ret, parse_start + 1);
3600 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3604 /* Check for proper termination. */
3606 RExC_flags = oregflags;
3607 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3608 RExC_parse = oregcomp_parse;
3609 vFAIL("Unmatched (");
3612 else if (!paren && RExC_parse < RExC_end) {
3613 if (*RExC_parse == ')') {
3615 vFAIL("Unmatched )");
3618 FAIL("Junk on end of regexp"); /* "Can't happen". */
3626 - regbranch - one alternative of an | operator
3628 * Implements the concatenation operator.
3631 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3634 register regnode *ret;
3635 register regnode *chain = NULL;
3636 register regnode *latest;
3637 I32 flags = 0, c = 0;
3642 if (!SIZE_ONLY && RExC_extralen)
3643 ret = reganode(pRExC_state, BRANCHJ,0);
3645 ret = reg_node(pRExC_state, BRANCH);
3646 Set_Node_Length(ret, 1);
3650 if (!first && SIZE_ONLY)
3651 RExC_extralen += 1; /* BRANCHJ */
3653 *flagp = WORST; /* Tentatively. */
3656 nextchar(pRExC_state);
3657 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3659 latest = regpiece(pRExC_state, &flags);
3660 if (latest == NULL) {
3661 if (flags & TRYAGAIN)
3665 else if (ret == NULL)
3667 *flagp |= flags&HASWIDTH;
3668 if (chain == NULL) /* First piece. */
3669 *flagp |= flags&SPSTART;
3672 regtail(pRExC_state, chain, latest);
3677 if (chain == NULL) { /* Loop ran zero times. */
3678 chain = reg_node(pRExC_state, NOTHING);
3683 *flagp |= flags&SIMPLE;
3690 - regpiece - something followed by possible [*+?]
3692 * Note that the branching code sequences used for ? and the general cases
3693 * of * and + are somewhat optimized: they use the same NOTHING node as
3694 * both the endmarker for their branch list and the body of the last branch.
3695 * It might seem that this node could be dispensed with entirely, but the
3696 * endmarker role is not redundant.
3699 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3702 register regnode *ret;
3704 register char *next;
3706 const char * const origparse = RExC_parse;
3709 I32 max = REG_INFTY;
3712 ret = regatom(pRExC_state, &flags);
3714 if (flags & TRYAGAIN)
3721 if (op == '{' && regcurly(RExC_parse)) {
3722 parse_start = RExC_parse; /* MJD */
3723 next = RExC_parse + 1;
3725 while (isDIGIT(*next) || *next == ',') {
3734 if (*next == '}') { /* got one */
3738 min = atoi(RExC_parse);
3742 maxpos = RExC_parse;
3744 if (!max && *maxpos != '0')
3745 max = REG_INFTY; /* meaning "infinity" */
3746 else if (max >= REG_INFTY)
3747 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3749 nextchar(pRExC_state);
3752 if ((flags&SIMPLE)) {
3753 RExC_naughty += 2 + RExC_naughty / 2;
3754 reginsert(pRExC_state, CURLY, ret);
3755 Set_Node_Offset(ret, parse_start+1); /* MJD */
3756 Set_Node_Cur_Length(ret);
3759 regnode *w = reg_node(pRExC_state, WHILEM);
3762 regtail(pRExC_state, ret, w);
3763 if (!SIZE_ONLY && RExC_extralen) {
3764 reginsert(pRExC_state, LONGJMP,ret);
3765 reginsert(pRExC_state, NOTHING,ret);
3766 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3768 reginsert(pRExC_state, CURLYX,ret);
3770 Set_Node_Offset(ret, parse_start+1);
3771 Set_Node_Length(ret,
3772 op == '{' ? (RExC_parse - parse_start) : 1);
3774 if (!SIZE_ONLY && RExC_extralen)
3775 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3776 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3778 RExC_whilem_seen++, RExC_extralen += 3;
3779 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3787 if (max && max < min)
3788 vFAIL("Can't do {n,m} with n > m");
3790 ARG1_SET(ret, (U16)min);
3791 ARG2_SET(ret, (U16)max);
3803 #if 0 /* Now runtime fix should be reliable. */
3805 /* if this is reinstated, don't forget to put this back into perldiag:
3807 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3809 (F) The part of the regexp subject to either the * or + quantifier
3810 could match an empty string. The {#} shows in the regular
3811 expression about where the problem was discovered.
3815 if (!(flags&HASWIDTH) && op != '?')
3816 vFAIL("Regexp *+ operand could be empty");
3819 parse_start = RExC_parse;
3820 nextchar(pRExC_state);
3822 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3824 if (op == '*' && (flags&SIMPLE)) {
3825 reginsert(pRExC_state, STAR, ret);
3829 else if (op == '*') {
3833 else if (op == '+' && (flags&SIMPLE)) {
3834 reginsert(pRExC_state, PLUS, ret);
3838 else if (op == '+') {
3842 else if (op == '?') {
3847 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3849 "%.*s matches null string many times",
3850 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
3854 if (*RExC_parse == '?') {
3855 nextchar(pRExC_state);
3856 reginsert(pRExC_state, MINMOD, ret);
3857 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3859 if (ISMULT2(RExC_parse)) {
3861 vFAIL("Nested quantifiers");
3868 - regatom - the lowest level
3870 * Optimization: gobbles an entire sequence of ordinary characters so that
3871 * it can turn them into a single node, which is smaller to store and
3872 * faster to run. Backslashed characters are exceptions, each becoming a
3873 * separate node; the code is simpler that way and it's not worth fixing.
3875 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3877 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3880 register regnode *ret = NULL;
3882 char *parse_start = RExC_parse;
3884 *flagp = WORST; /* Tentatively. */
3887 switch (*RExC_parse) {
3889 RExC_seen_zerolen++;
3890 nextchar(pRExC_state);
3891 if (RExC_flags & PMf_MULTILINE)
3892 ret = reg_node(pRExC_state, MBOL);
3893 else if (RExC_flags & PMf_SINGLELINE)
3894 ret = reg_node(pRExC_state, SBOL);
3896 ret = reg_node(pRExC_state, BOL);
3897 Set_Node_Length(ret, 1); /* MJD */
3900 nextchar(pRExC_state);
3902 RExC_seen_zerolen++;
3903 if (RExC_flags & PMf_MULTILINE)
3904 ret = reg_node(pRExC_state, MEOL);
3905 else if (RExC_flags & PMf_SINGLELINE)
3906 ret = reg_node(pRExC_state, SEOL);
3908 ret = reg_node(pRExC_state, EOL);
3909 Set_Node_Length(ret, 1); /* MJD */
3912 nextchar(pRExC_state);
3913 if (RExC_flags & PMf_SINGLELINE)
3914 ret = reg_node(pRExC_state, SANY);
3916 ret = reg_node(pRExC_state, REG_ANY);
3917 *flagp |= HASWIDTH|SIMPLE;
3919 Set_Node_Length(ret, 1); /* MJD */
3923 char *oregcomp_parse = ++RExC_parse;
3924 ret = regclass(pRExC_state);
3925 if (*RExC_parse != ']') {
3926 RExC_parse = oregcomp_parse;
3927 vFAIL("Unmatched [");
3929 nextchar(pRExC_state);
3930 *flagp |= HASWIDTH|SIMPLE;
3931 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3935 nextchar(pRExC_state);
3936 ret = reg(pRExC_state, 1, &flags);
3938 if (flags & TRYAGAIN) {
3939 if (RExC_parse == RExC_end) {
3940 /* Make parent create an empty node if needed. */
3948 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3952 if (flags & TRYAGAIN) {
3956 vFAIL("Internal urp");
3957 /* Supposed to be caught earlier. */
3960 if (!regcurly(RExC_parse)) {
3969 vFAIL("Quantifier follows nothing");
3972 switch (*++RExC_parse) {
3974 RExC_seen_zerolen++;
3975 ret = reg_node(pRExC_state, SBOL);
3977 nextchar(pRExC_state);
3978 Set_Node_Length(ret, 2); /* MJD */
3981 ret = reg_node(pRExC_state, GPOS);
3982 RExC_seen |= REG_SEEN_GPOS;
3984 nextchar(pRExC_state);
3985 Set_Node_Length(ret, 2); /* MJD */
3988 ret = reg_node(pRExC_state, SEOL);
3990 RExC_seen_zerolen++; /* Do not optimize RE away */
3991 nextchar(pRExC_state);
3994 ret = reg_node(pRExC_state, EOS);
3996 RExC_seen_zerolen++; /* Do not optimize RE away */
3997 nextchar(pRExC_state);
3998 Set_Node_Length(ret, 2); /* MJD */
4001 ret = reg_node(pRExC_state, CANY);
4002 RExC_seen |= REG_SEEN_CANY;
4003 *flagp |= HASWIDTH|SIMPLE;
4004 nextchar(pRExC_state);
4005 Set_Node_Length(ret, 2); /* MJD */
4008 ret = reg_node(pRExC_state, CLUMP);
4010 nextchar(pRExC_state);
4011 Set_Node_Length(ret, 2); /* MJD */
4014 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4015 *flagp |= HASWIDTH|SIMPLE;
4016 nextchar(pRExC_state);
4017 Set_Node_Length(ret, 2); /* MJD */
4020 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4021 *flagp |= HASWIDTH|SIMPLE;
4022 nextchar(pRExC_state);
4023 Set_Node_Length(ret, 2); /* MJD */
4026 RExC_seen_zerolen++;
4027 RExC_seen |= REG_SEEN_LOOKBEHIND;
4028 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4030 nextchar(pRExC_state);
4031 Set_Node_Length(ret, 2); /* MJD */
4034 RExC_seen_zerolen++;
4035 RExC_seen |= REG_SEEN_LOOKBEHIND;
4036 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4038 nextchar(pRExC_state);
4039 Set_Node_Length(ret, 2); /* MJD */
4042 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4043 *flagp |= HASWIDTH|SIMPLE;
4044 nextchar(pRExC_state);
4045 Set_Node_Length(ret, 2); /* MJD */
4048 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4049 *flagp |= HASWIDTH|SIMPLE;
4050 nextchar(pRExC_state);
4051 Set_Node_Length(ret, 2); /* MJD */
4054 ret = reg_node(pRExC_state, DIGIT);
4055 *flagp |= HASWIDTH|SIMPLE;
4056 nextchar(pRExC_state);
4057 Set_Node_Length(ret, 2); /* MJD */
4060 ret = reg_node(pRExC_state, NDIGIT);
4061 *flagp |= HASWIDTH|SIMPLE;
4062 nextchar(pRExC_state);
4063 Set_Node_Length(ret, 2); /* MJD */
4068 char* oldregxend = RExC_end;
4069 char* parse_start = RExC_parse - 2;
4071 if (RExC_parse[1] == '{') {
4072 /* a lovely hack--pretend we saw [\pX] instead */
4073 RExC_end = strchr(RExC_parse, '}');
4075 U8 c = (U8)*RExC_parse;
4077 RExC_end = oldregxend;
4078 vFAIL2("Missing right brace on \\%c{}", c);
4083 RExC_end = RExC_parse + 2;
4084 if (RExC_end > oldregxend)
4085 RExC_end = oldregxend;
4089 ret = regclass(pRExC_state);
4091 RExC_end = oldregxend;
4094 Set_Node_Offset(ret, parse_start + 2);
4095 Set_Node_Cur_Length(ret);
4096 nextchar(pRExC_state);
4097 *flagp |= HASWIDTH|SIMPLE;
4110 case '1': case '2': case '3': case '4':
4111 case '5': case '6': case '7': case '8': case '9':
4113 const I32 num = atoi(RExC_parse);
4115 if (num > 9 && num >= RExC_npar)
4118 char * parse_start = RExC_parse - 1; /* MJD */
4119 while (isDIGIT(*RExC_parse))
4122 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4123 vFAIL("Reference to nonexistent group");
4125 ret = reganode(pRExC_state,
4126 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4130 /* override incorrect value set in reganode MJD */
4131 Set_Node_Offset(ret, parse_start+1);
4132 Set_Node_Cur_Length(ret); /* MJD */
4134 nextchar(pRExC_state);
4139 if (RExC_parse >= RExC_end)
4140 FAIL("Trailing \\");
4143 /* Do not generate "unrecognized" warnings here, we fall
4144 back into the quick-grab loop below */
4151 if (RExC_flags & PMf_EXTENDED) {
4152 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4153 if (RExC_parse < RExC_end)
4159 register STRLEN len;
4164 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4166 parse_start = RExC_parse - 1;
4172 ret = reg_node(pRExC_state,
4173 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4175 for (len = 0, p = RExC_parse - 1;
4176 len < 127 && p < RExC_end;
4181 if (RExC_flags & PMf_EXTENDED)
4182 p = regwhite(p, RExC_end);
4229 ender = ASCII_TO_NATIVE('\033');
4233 ender = ASCII_TO_NATIVE('\007');
4238 char* const e = strchr(p, '}');
4242 vFAIL("Missing right brace on \\x{}");
4245 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4246 | PERL_SCAN_DISALLOW_PREFIX;
4247 STRLEN numlen = e - p - 1;
4248 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4255 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4257 ender = grok_hex(p, &numlen, &flags, NULL);
4263 ender = UCHARAT(p++);
4264 ender = toCTRL(ender);
4266 case '0': case '1': case '2': case '3':case '4':
4267 case '5': case '6': case '7': case '8':case '9':
4269 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4272 ender = grok_oct(p, &numlen, &flags, NULL);
4282 FAIL("Trailing \\");
4285 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4286 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4287 goto normal_default;
4292 if (UTF8_IS_START(*p) && UTF) {
4294 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4302 if (RExC_flags & PMf_EXTENDED)
4303 p = regwhite(p, RExC_end);
4305 /* Prime the casefolded buffer. */
4306 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4308 if (ISMULT2(p)) { /* Back off on ?+*. */
4315 /* Emit all the Unicode characters. */
4317 for (foldbuf = tmpbuf;
4319 foldlen -= numlen) {
4320 ender = utf8_to_uvchr(foldbuf, &numlen);
4322 reguni(pRExC_state, ender, s, &unilen);
4325 /* In EBCDIC the numlen
4326 * and unilen can differ. */
4328 if (numlen >= foldlen)
4332 break; /* "Can't happen." */
4336 reguni(pRExC_state, ender, s, &unilen);
4345 REGC((char)ender, s++);
4353 /* Emit all the Unicode characters. */
4355 for (foldbuf = tmpbuf;
4357 foldlen -= numlen) {
4358 ender = utf8_to_uvchr(foldbuf, &numlen);
4360 reguni(pRExC_state, ender, s, &unilen);
4363 /* In EBCDIC the numlen
4364 * and unilen can differ. */
4366 if (numlen >= foldlen)
4374 reguni(pRExC_state, ender, s, &unilen);
4383 REGC((char)ender, s++);
4387 Set_Node_Cur_Length(ret); /* MJD */
4388 nextchar(pRExC_state);
4390 /* len is STRLEN which is unsigned, need to copy to signed */
4393 vFAIL("Internal disaster");
4397 if (len == 1 && UNI_IS_INVARIANT(ender))
4402 RExC_size += STR_SZ(len);
4404 RExC_emit += STR_SZ(len);
4409 /* If the encoding pragma is in effect recode the text of
4410 * any EXACT-kind nodes. */
4411 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4412 STRLEN oldlen = STR_LEN(ret);
4413 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4417 if (sv_utf8_downgrade(sv, TRUE)) {
4418 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4419 const STRLEN newlen = SvCUR(sv);
4424 GET_RE_DEBUG_FLAGS_DECL;
4425 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4426 (int)oldlen, STRING(ret),
4428 Copy(s, STRING(ret), newlen, char);
4429 STR_LEN(ret) += newlen - oldlen;
4430 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4432 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4440 S_regwhite(pTHX_ char *p, const char *e)
4445 else if (*p == '#') {
4448 } while (p < e && *p != '\n');
4456 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4457 Character classes ([:foo:]) can also be negated ([:^foo:]).
4458 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4459 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4460 but trigger failures because they are currently unimplemented. */
4462 #define POSIXCC_DONE(c) ((c) == ':')
4463 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4464 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4467 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4470 I32 namedclass = OOB_NAMEDCLASS;
4472 if (value == '[' && RExC_parse + 1 < RExC_end &&
4473 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4474 POSIXCC(UCHARAT(RExC_parse))) {
4475 const char c = UCHARAT(RExC_parse);
4476 char* s = RExC_parse++;
4478 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4480 if (RExC_parse == RExC_end)
4481 /* Grandfather lone [:, [=, [. */
4484 const char* t = RExC_parse++; /* skip over the c */
4485 const char *posixcc;
4489 if (UCHARAT(RExC_parse) == ']') {
4490 RExC_parse++; /* skip over the ending ] */
4493 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4494 const I32 skip = t - posixcc;
4496 /* Initially switch on the length of the name. */
4499 if (memEQ(posixcc, "word", 4)) {
4500 /* this is not POSIX, this is the Perl \w */;
4502 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4506 /* Names all of length 5. */
4507 /* alnum alpha ascii blank cntrl digit graph lower
4508 print punct space upper */
4509 /* Offset 4 gives the best switch position. */
4510 switch (posixcc[4]) {
4512 if (memEQ(posixcc, "alph", 4)) {
4515 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4519 if (memEQ(posixcc, "spac", 4)) {
4522 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4526 if (memEQ(posixcc, "grap", 4)) {
4529 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4533 if (memEQ(posixcc, "asci", 4)) {
4536 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4540 if (memEQ(posixcc, "blan", 4)) {
4543 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4547 if (memEQ(posixcc, "cntr", 4)) {
4550 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4554 if (memEQ(posixcc, "alnu", 4)) {
4557 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4561 if (memEQ(posixcc, "lowe", 4)) {
4564 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4566 if (memEQ(posixcc, "uppe", 4)) {
4569 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4573 if (memEQ(posixcc, "digi", 4)) {
4576 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4578 if (memEQ(posixcc, "prin", 4)) {
4581 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4583 if (memEQ(posixcc, "punc", 4)) {
4586 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4592 if (memEQ(posixcc, "xdigit", 6)) {
4594 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4599 if (namedclass == OOB_NAMEDCLASS)
4601 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4604 assert (posixcc[skip] == ':');
4605 assert (posixcc[skip+1] == ']');
4606 } else if (!SIZE_ONLY) {
4607 /* [[=foo=]] and [[.foo.]] are still future. */
4609 /* adjust RExC_parse so the warning shows after
4611 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4613 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4616 /* Maternal grandfather:
4617 * "[:" ending in ":" but not in ":]" */
4627 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4630 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4631 const char *s = RExC_parse;
4632 const char c = *s++;
4634 while(*s && isALNUM(*s))
4636 if (*s && c == *s && s[1] == ']') {
4637 if (ckWARN(WARN_REGEXP))
4639 "POSIX syntax [%c %c] belongs inside character classes",
4642 /* [[=foo=]] and [[.foo.]] are still future. */
4643 if (POSIXCC_NOTYET(c)) {
4644 /* adjust RExC_parse so the error shows after
4646 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4648 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4655 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4659 register UV nextvalue;
4660 register IV prevvalue = OOB_UNICODE;
4661 register IV range = 0;
4662 register regnode *ret;
4665 char *rangebegin = NULL;
4666 bool need_class = 0;
4670 bool optimize_invert = TRUE;
4671 AV* unicode_alternate = NULL;
4673 UV literal_endpoint = 0;
4676 ret = reganode(pRExC_state, ANYOF, 0);
4679 ANYOF_FLAGS(ret) = 0;
4681 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4685 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4689 RExC_size += ANYOF_SKIP;
4691 RExC_emit += ANYOF_SKIP;
4693 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4695 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4696 ANYOF_BITMAP_ZERO(ret);
4697 listsv = newSVpvs("# comment\n");
4700 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4702 if (!SIZE_ONLY && POSIXCC(nextvalue))
4703 checkposixcc(pRExC_state);
4705 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4706 if (UCHARAT(RExC_parse) == ']')
4709 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4713 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4716 rangebegin = RExC_parse;
4718 value = utf8n_to_uvchr((U8*)RExC_parse,
4719 RExC_end - RExC_parse,
4721 RExC_parse += numlen;
4724 value = UCHARAT(RExC_parse++);
4725 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4726 if (value == '[' && POSIXCC(nextvalue))
4727 namedclass = regpposixcc(pRExC_state, value);
4728 else if (value == '\\') {
4730 value = utf8n_to_uvchr((U8*)RExC_parse,
4731 RExC_end - RExC_parse,
4733 RExC_parse += numlen;
4736 value = UCHARAT(RExC_parse++);
4737 /* Some compilers cannot handle switching on 64-bit integer
4738 * values, therefore value cannot be an UV. Yes, this will
4739 * be a problem later if we want switch on Unicode.
4740 * A similar issue a little bit later when switching on
4741 * namedclass. --jhi */
4742 switch ((I32)value) {
4743 case 'w': namedclass = ANYOF_ALNUM; break;
4744 case 'W': namedclass = ANYOF_NALNUM; break;
4745 case 's': namedclass = ANYOF_SPACE; break;
4746 case 'S': namedclass = ANYOF_NSPACE; break;
4747 case 'd': namedclass = ANYOF_DIGIT; break;
4748 case 'D': namedclass = ANYOF_NDIGIT; break;
4751 if (RExC_parse >= RExC_end)
4752 vFAIL2("Empty \\%c{}", (U8)value);
4753 if (*RExC_parse == '{') {
4754 const U8 c = (U8)value;
4755 e = strchr(RExC_parse++, '}');
4757 vFAIL2("Missing right brace on \\%c{}", c);
4758 while (isSPACE(UCHARAT(RExC_parse)))
4760 if (e == RExC_parse)
4761 vFAIL2("Empty \\%c{}", c);
4763 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4771 if (UCHARAT(RExC_parse) == '^') {
4774 value = value == 'p' ? 'P' : 'p'; /* toggle */
4775 while (isSPACE(UCHARAT(RExC_parse))) {
4781 Perl_sv_catpvf(aTHX_ listsv,
4782 "+utf8::%.*s\n", (int)n, RExC_parse);
4784 Perl_sv_catpvf(aTHX_ listsv,
4785 "!utf8::%.*s\n", (int)n, RExC_parse);
4788 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4789 namedclass = ANYOF_MAX; /* no official name, but it's named */
4791 case 'n': value = '\n'; break;
4792 case 'r': value = '\r'; break;
4793 case 't': value = '\t'; break;
4794 case 'f': value = '\f'; break;
4795 case 'b': value = '\b'; break;
4796 case 'e': value = ASCII_TO_NATIVE('\033');break;
4797 case 'a': value = ASCII_TO_NATIVE('\007');break;
4799 if (*RExC_parse == '{') {
4800 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4801 | PERL_SCAN_DISALLOW_PREFIX;
4802 e = strchr(RExC_parse++, '}');
4804 vFAIL("Missing right brace on \\x{}");
4806 numlen = e - RExC_parse;
4807 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4811 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4813 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4814 RExC_parse += numlen;
4818 value = UCHARAT(RExC_parse++);
4819 value = toCTRL(value);
4821 case '0': case '1': case '2': case '3': case '4':
4822 case '5': case '6': case '7': case '8': case '9':
4826 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4827 RExC_parse += numlen;
4831 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4833 "Unrecognized escape \\%c in character class passed through",
4837 } /* end of \blah */
4843 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4845 if (!SIZE_ONLY && !need_class)
4846 ANYOF_CLASS_ZERO(ret);
4850 /* a bad range like a-\d, a-[:digit:] ? */
4853 if (ckWARN(WARN_REGEXP)) {
4855 RExC_parse >= rangebegin ?
4856 RExC_parse - rangebegin : 0;
4858 "False [] range \"%*.*s\"",
4863 if (prevvalue < 256) {
4864 ANYOF_BITMAP_SET(ret, prevvalue);
4865 ANYOF_BITMAP_SET(ret, '-');
4868 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4869 Perl_sv_catpvf(aTHX_ listsv,
4870 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4874 range = 0; /* this was not a true range */
4878 const char *what = NULL;
4881 if (namedclass > OOB_NAMEDCLASS)
4882 optimize_invert = FALSE;
4883 /* Possible truncation here but in some 64-bit environments
4884 * the compiler gets heartburn about switch on 64-bit values.
4885 * A similar issue a little earlier when switching on value.
4887 switch ((I32)namedclass) {
4890 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4892 for (value = 0; value < 256; value++)
4894 ANYOF_BITMAP_SET(ret, value);
4901 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4903 for (value = 0; value < 256; value++)
4904 if (!isALNUM(value))
4905 ANYOF_BITMAP_SET(ret, value);
4912 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4914 for (value = 0; value < 256; value++)
4915 if (isALNUMC(value))
4916 ANYOF_BITMAP_SET(ret, value);
4923 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4925 for (value = 0; value < 256; value++)
4926 if (!isALNUMC(value))
4927 ANYOF_BITMAP_SET(ret, value);
4934 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4936 for (value = 0; value < 256; value++)
4938 ANYOF_BITMAP_SET(ret, value);
4945 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4947 for (value = 0; value < 256; value++)
4948 if (!isALPHA(value))
4949 ANYOF_BITMAP_SET(ret, value);
4956 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4959 for (value = 0; value < 128; value++)
4960 ANYOF_BITMAP_SET(ret, value);
4962 for (value = 0; value < 256; value++) {
4964 ANYOF_BITMAP_SET(ret, value);
4973 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4976 for (value = 128; value < 256; value++)
4977 ANYOF_BITMAP_SET(ret, value);
4979 for (value = 0; value < 256; value++) {
4980 if (!isASCII(value))
4981 ANYOF_BITMAP_SET(ret, value);
4990 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4992 for (value = 0; value < 256; value++)
4994 ANYOF_BITMAP_SET(ret, value);
5001 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5003 for (value = 0; value < 256; value++)
5004 if (!isBLANK(value))
5005 ANYOF_BITMAP_SET(ret, value);
5012 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5014 for (value = 0; value < 256; value++)
5016 ANYOF_BITMAP_SET(ret, value);
5023 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5025 for (value = 0; value < 256; value++)
5026 if (!isCNTRL(value))
5027 ANYOF_BITMAP_SET(ret, value);
5034 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5036 /* consecutive digits assumed */
5037 for (value = '0'; value <= '9'; value++)
5038 ANYOF_BITMAP_SET(ret, value);
5045 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5047 /* consecutive digits assumed */
5048 for (value = 0; value < '0'; value++)
5049 ANYOF_BITMAP_SET(ret, value);
5050 for (value = '9' + 1; value < 256; value++)
5051 ANYOF_BITMAP_SET(ret, value);
5058 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5060 for (value = 0; value < 256; value++)
5062 ANYOF_BITMAP_SET(ret, value);
5069 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5071 for (value = 0; value < 256; value++)
5072 if (!isGRAPH(value))
5073 ANYOF_BITMAP_SET(ret, value);
5080 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5082 for (value = 0; value < 256; value++)
5084 ANYOF_BITMAP_SET(ret, value);
5091 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5093 for (value = 0; value < 256; value++)
5094 if (!isLOWER(value))
5095 ANYOF_BITMAP_SET(ret, value);
5102 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5104 for (value = 0; value < 256; value++)
5106 ANYOF_BITMAP_SET(ret, value);
5113 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5115 for (value = 0; value < 256; value++)
5116 if (!isPRINT(value))
5117 ANYOF_BITMAP_SET(ret, value);
5124 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5126 for (value = 0; value < 256; value++)
5127 if (isPSXSPC(value))
5128 ANYOF_BITMAP_SET(ret, value);
5135 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5137 for (value = 0; value < 256; value++)
5138 if (!isPSXSPC(value))
5139 ANYOF_BITMAP_SET(ret, value);
5146 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5148 for (value = 0; value < 256; value++)
5150 ANYOF_BITMAP_SET(ret, value);
5157 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5159 for (value = 0; value < 256; value++)
5160 if (!isPUNCT(value))
5161 ANYOF_BITMAP_SET(ret, value);
5168 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5170 for (value = 0; value < 256; value++)
5172 ANYOF_BITMAP_SET(ret, value);
5179 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5181 for (value = 0; value < 256; value++)
5182 if (!isSPACE(value))
5183 ANYOF_BITMAP_SET(ret, value);
5190 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5192 for (value = 0; value < 256; value++)
5194 ANYOF_BITMAP_SET(ret, value);
5201 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5203 for (value = 0; value < 256; value++)
5204 if (!isUPPER(value))
5205 ANYOF_BITMAP_SET(ret, value);
5212 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5214 for (value = 0; value < 256; value++)
5215 if (isXDIGIT(value))
5216 ANYOF_BITMAP_SET(ret, value);
5223 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5225 for (value = 0; value < 256; value++)
5226 if (!isXDIGIT(value))
5227 ANYOF_BITMAP_SET(ret, value);
5233 /* this is to handle \p and \P */
5236 vFAIL("Invalid [::] class");
5240 /* Strings such as "+utf8::isWord\n" */
5241 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5244 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5247 } /* end of namedclass \blah */
5250 if (prevvalue > (IV)value) /* b-a */ {
5251 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5252 RExC_parse - rangebegin,
5253 RExC_parse - rangebegin,
5255 range = 0; /* not a valid range */
5259 prevvalue = value; /* save the beginning of the range */
5260 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5261 RExC_parse[1] != ']') {
5264 /* a bad range like \w-, [:word:]- ? */
5265 if (namedclass > OOB_NAMEDCLASS) {
5266 if (ckWARN(WARN_REGEXP)) {
5268 RExC_parse >= rangebegin ?
5269 RExC_parse - rangebegin : 0;
5271 "False [] range \"%*.*s\"",
5277 ANYOF_BITMAP_SET(ret, '-');
5279 range = 1; /* yeah, it's a range! */
5280 continue; /* but do it the next time */
5284 /* now is the next time */
5288 if (prevvalue < 256) {
5289 const IV ceilvalue = value < 256 ? value : 255;
5292 /* In EBCDIC [\x89-\x91] should include
5293 * the \x8e but [i-j] should not. */
5294 if (literal_endpoint == 2 &&
5295 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5296 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5298 if (isLOWER(prevvalue)) {
5299 for (i = prevvalue; i <= ceilvalue; i++)
5301 ANYOF_BITMAP_SET(ret, i);
5303 for (i = prevvalue; i <= ceilvalue; i++)
5305 ANYOF_BITMAP_SET(ret, i);
5310 for (i = prevvalue; i <= ceilvalue; i++)
5311 ANYOF_BITMAP_SET(ret, i);
5313 if (value > 255 || UTF) {
5314 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5315 const UV natvalue = NATIVE_TO_UNI(value);
5317 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5318 if (prevnatvalue < natvalue) { /* what about > ? */
5319 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5320 prevnatvalue, natvalue);
5322 else if (prevnatvalue == natvalue) {
5323 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5325 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5327 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5329 /* If folding and foldable and a single
5330 * character, insert also the folded version
5331 * to the charclass. */
5333 if (foldlen == (STRLEN)UNISKIP(f))
5334 Perl_sv_catpvf(aTHX_ listsv,
5337 /* Any multicharacter foldings
5338 * require the following transform:
5339 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5340 * where E folds into "pq" and F folds
5341 * into "rst", all other characters
5342 * fold to single characters. We save
5343 * away these multicharacter foldings,
5344 * to be later saved as part of the
5345 * additional "s" data. */
5348 if (!unicode_alternate)
5349 unicode_alternate = newAV();
5350 sv = newSVpvn((char*)foldbuf, foldlen);
5352 av_push(unicode_alternate, sv);
5356 /* If folding and the value is one of the Greek
5357 * sigmas insert a few more sigmas to make the
5358 * folding rules of the sigmas to work right.
5359 * Note that not all the possible combinations
5360 * are handled here: some of them are handled
5361 * by the standard folding rules, and some of
5362 * them (literal or EXACTF cases) are handled
5363 * during runtime in regexec.c:S_find_byclass(). */
5364 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5365 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5366 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5367 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5368 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5370 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5371 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5372 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5377 literal_endpoint = 0;
5381 range = 0; /* this range (if it was one) is done now */
5385 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5387 RExC_size += ANYOF_CLASS_ADD_SKIP;
5389 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5392 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5394 /* If the only flag is folding (plus possibly inversion). */
5395 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5397 for (value = 0; value < 256; ++value) {
5398 if (ANYOF_BITMAP_TEST(ret, value)) {
5399 UV fold = PL_fold[value];
5402 ANYOF_BITMAP_SET(ret, fold);
5405 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5408 /* optimize inverted simple patterns (e.g. [^a-z]) */
5409 if (!SIZE_ONLY && optimize_invert &&
5410 /* If the only flag is inversion. */
5411 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5412 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5413 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5414 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5421 /* The 0th element stores the character class description
5422 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5423 * to initialize the appropriate swash (which gets stored in
5424 * the 1st element), and also useful for dumping the regnode.
5425 * The 2nd element stores the multicharacter foldings,
5426 * used later (regexec.c:S_reginclass()). */
5427 av_store(av, 0, listsv);
5428 av_store(av, 1, NULL);
5429 av_store(av, 2, (SV*)unicode_alternate);
5430 rv = newRV_noinc((SV*)av);
5431 n = add_data(pRExC_state, 1, "s");
5432 RExC_rx->data->data[n] = (void*)rv;
5440 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5443 char* retval = RExC_parse++;
5446 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5447 RExC_parse[2] == '#') {
5448 while (*RExC_parse != ')') {
5449 if (RExC_parse == RExC_end)
5450 FAIL("Sequence (?#... not terminated");
5456 if (RExC_flags & PMf_EXTENDED) {
5457 if (isSPACE(*RExC_parse)) {
5461 else if (*RExC_parse == '#') {
5462 while (RExC_parse < RExC_end)
5463 if (*RExC_parse++ == '\n') break;
5472 - reg_node - emit a node
5474 STATIC regnode * /* Location. */
5475 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5478 register regnode *ptr;
5479 regnode * const ret = RExC_emit;
5482 SIZE_ALIGN(RExC_size);
5487 NODE_ALIGN_FILL(ret);
5489 FILL_ADVANCE_NODE(ptr, op);
5490 if (RExC_offsets) { /* MJD */
5491 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5492 "reg_node", __LINE__,
5494 RExC_emit - RExC_emit_start > RExC_offsets[0]
5495 ? "Overwriting end of array!\n" : "OK",
5496 RExC_emit - RExC_emit_start,
5497 RExC_parse - RExC_start,
5499 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5508 - reganode - emit a node with an argument
5510 STATIC regnode * /* Location. */
5511 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5514 register regnode *ptr;
5515 regnode * const ret = RExC_emit;
5518 SIZE_ALIGN(RExC_size);
5523 NODE_ALIGN_FILL(ret);
5525 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5526 if (RExC_offsets) { /* MJD */
5527 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5531 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5532 "Overwriting end of array!\n" : "OK",
5533 RExC_emit - RExC_emit_start,
5534 RExC_parse - RExC_start,
5536 Set_Cur_Node_Offset;
5545 - reguni - emit (if appropriate) a Unicode character
5548 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5551 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5555 - reginsert - insert an operator in front of already-emitted operand
5557 * Means relocating the operand.
5560 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5563 register regnode *src;
5564 register regnode *dst;
5565 register regnode *place;
5566 const int offset = regarglen[(U8)op];
5568 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5571 RExC_size += NODE_STEP_REGNODE + offset;
5576 RExC_emit += NODE_STEP_REGNODE + offset;
5578 while (src > opnd) {
5579 StructCopy(--src, --dst, regnode);
5580 if (RExC_offsets) { /* MJD 20010112 */
5581 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5585 dst - RExC_emit_start > RExC_offsets[0]
5586 ? "Overwriting end of array!\n" : "OK",
5587 src - RExC_emit_start,
5588 dst - RExC_emit_start,
5590 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5591 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5596 place = opnd; /* Op node, where operand used to be. */
5597 if (RExC_offsets) { /* MJD */
5598 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5602 place - RExC_emit_start > RExC_offsets[0]
5603 ? "Overwriting end of array!\n" : "OK",
5604 place - RExC_emit_start,
5605 RExC_parse - RExC_start,
5607 Set_Node_Offset(place, RExC_parse);
5608 Set_Node_Length(place, 1);
5610 src = NEXTOPER(place);
5611 FILL_ADVANCE_NODE(place, op);
5612 Zero(src, offset, regnode);
5616 - regtail - set the next-pointer at the end of a node chain of p to val.
5619 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5622 register regnode *scan;
5627 /* Find last node. */
5630 regnode * const temp = regnext(scan);
5636 if (reg_off_by_arg[OP(scan)]) {
5637 ARG_SET(scan, val - scan);
5640 NEXT_OFF(scan) = val - scan;
5645 - regoptail - regtail on operand of first argument; nop if operandless
5648 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5651 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5652 if (p == NULL || SIZE_ONLY)
5654 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5655 regtail(pRExC_state, NEXTOPER(p), val);
5657 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5658 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5665 - regcurly - a little FSA that accepts {\d+,?\d*}
5668 S_regcurly(pTHX_ register const char *s)
5687 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5690 Perl_regdump(pTHX_ regexp *r)
5694 SV * const sv = sv_newmortal();
5696 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5698 /* Header fields of interest. */
5699 if (r->anchored_substr)
5700 PerlIO_printf(Perl_debug_log,
5701 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5703 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5704 SvPVX_const(r->anchored_substr),
5706 SvTAIL(r->anchored_substr) ? "$" : "",
5707 (IV)r->anchored_offset);
5708 else if (r->anchored_utf8)
5709 PerlIO_printf(Perl_debug_log,
5710 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5712 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5713 SvPVX_const(r->anchored_utf8),
5715 SvTAIL(r->anchored_utf8) ? "$" : "",
5716 (IV)r->anchored_offset);
5717 if (r->float_substr)
5718 PerlIO_printf(Perl_debug_log,
5719 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5721 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5722 SvPVX_const(r->float_substr),
5724 SvTAIL(r->float_substr) ? "$" : "",
5725 (IV)r->float_min_offset, (UV)r->float_max_offset);
5726 else if (r->float_utf8)
5727 PerlIO_printf(Perl_debug_log,
5728 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5730 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5731 SvPVX_const(r->float_utf8),
5733 SvTAIL(r->float_utf8) ? "$" : "",
5734 (IV)r->float_min_offset, (UV)r->float_max_offset);
5735 if (r->check_substr || r->check_utf8)
5736 PerlIO_printf(Perl_debug_log,
5737 r->check_substr == r->float_substr
5738 && r->check_utf8 == r->float_utf8
5739 ? "(checking floating" : "(checking anchored");
5740 if (r->reganch & ROPT_NOSCAN)
5741 PerlIO_printf(Perl_debug_log, " noscan");
5742 if (r->reganch & ROPT_CHECK_ALL)
5743 PerlIO_printf(Perl_debug_log, " isall");
5744 if (r->check_substr || r->check_utf8)
5745 PerlIO_printf(Perl_debug_log, ") ");
5747 if (r->regstclass) {
5748 regprop(sv, r->regstclass);
5749 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5751 if (r->reganch & ROPT_ANCH) {
5752 PerlIO_printf(Perl_debug_log, "anchored");
5753 if (r->reganch & ROPT_ANCH_BOL)
5754 PerlIO_printf(Perl_debug_log, "(BOL)");
5755 if (r->reganch & ROPT_ANCH_MBOL)
5756 PerlIO_printf(Perl_debug_log, "(MBOL)");
5757 if (r->reganch & ROPT_ANCH_SBOL)
5758 PerlIO_printf(Perl_debug_log, "(SBOL)");
5759 if (r->reganch & ROPT_ANCH_GPOS)
5760 PerlIO_printf(Perl_debug_log, "(GPOS)");
5761 PerlIO_putc(Perl_debug_log, ' ');
5763 if (r->reganch & ROPT_GPOS_SEEN)
5764 PerlIO_printf(Perl_debug_log, "GPOS ");
5765 if (r->reganch & ROPT_SKIP)
5766 PerlIO_printf(Perl_debug_log, "plus ");
5767 if (r->reganch & ROPT_IMPLICIT)
5768 PerlIO_printf(Perl_debug_log, "implicit ");
5769 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5770 if (r->reganch & ROPT_EVAL_SEEN)
5771 PerlIO_printf(Perl_debug_log, "with eval ");
5772 PerlIO_printf(Perl_debug_log, "\n");
5774 const U32 len = r->offsets[0];
5775 GET_RE_DEBUG_FLAGS_DECL;
5778 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5779 for (i = 1; i <= len; i++)
5780 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5781 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5782 PerlIO_printf(Perl_debug_log, "\n");
5787 #endif /* DEBUGGING */
5791 - regprop - printable representation of opcode
5794 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5800 sv_setpvn(sv, "", 0);
5801 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5802 /* It would be nice to FAIL() here, but this may be called from
5803 regexec.c, and it would be hard to supply pRExC_state. */
5804 Perl_croak(aTHX_ "Corrupted regexp opcode");
5805 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5807 k = PL_regkind[(U8)OP(o)];
5810 SV * const dsv = sv_2mortal(newSVpvs(""));
5811 /* Using is_utf8_string() is a crude hack but it may
5812 * be the best for now since we have no flag "this EXACTish
5813 * node was UTF-8" --jhi */
5814 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5815 const char * const s = do_utf8 ?
5816 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5817 UNI_DISPLAY_REGEX) :
5819 const int len = do_utf8 ?
5822 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5826 } else if (k == TRIE) {/*
5827 this isn't always safe, as Pl_regdata may not be for this regex yet
5828 (depending on where its called from) so its being moved to dumpuntil
5830 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5831 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5834 trie->uniquecharcount,
5837 } else if (k == CURLY) {
5838 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5839 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5840 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5842 else if (k == WHILEM && o->flags) /* Ordinal/of */
5843 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5844 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5845 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5846 else if (k == LOGICAL)
5847 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5848 else if (k == ANYOF) {
5849 int i, rangestart = -1;
5850 const U8 flags = ANYOF_FLAGS(o);
5852 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5853 static const char * const anyofs[] = {
5886 if (flags & ANYOF_LOCALE)
5887 sv_catpvs(sv, "{loc}");
5888 if (flags & ANYOF_FOLD)
5889 sv_catpvs(sv, "{i}");
5890 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5891 if (flags & ANYOF_INVERT)
5893 for (i = 0; i <= 256; i++) {
5894 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5895 if (rangestart == -1)
5897 } else if (rangestart != -1) {
5898 if (i <= rangestart + 3)
5899 for (; rangestart < i; rangestart++)
5900 put_byte(sv, rangestart);
5902 put_byte(sv, rangestart);
5904 put_byte(sv, i - 1);
5910 if (o->flags & ANYOF_CLASS)
5911 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5912 if (ANYOF_CLASS_TEST(o,i))
5913 sv_catpv(sv, anyofs[i]);
5915 if (flags & ANYOF_UNICODE)
5916 sv_catpvs(sv, "{unicode}");
5917 else if (flags & ANYOF_UNICODE_ALL)
5918 sv_catpvs(sv, "{unicode_all}");
5922 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
5926 U8 s[UTF8_MAXBYTES_CASE+1];
5928 for (i = 0; i <= 256; i++) { /* just the first 256 */
5929 uvchr_to_utf8(s, i);
5931 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5932 if (rangestart == -1)
5934 } else if (rangestart != -1) {
5935 if (i <= rangestart + 3)
5936 for (; rangestart < i; rangestart++) {
5937 const U8 * const e = uvchr_to_utf8(s,rangestart);
5939 for(p = s; p < e; p++)
5943 const U8 *e = uvchr_to_utf8(s,rangestart);
5945 for (p = s; p < e; p++)
5948 e = uvchr_to_utf8(s, i-1);
5949 for (p = s; p < e; p++)
5956 sv_catpvs(sv, "..."); /* et cetera */
5960 char *s = savesvpv(lv);
5961 char * const origs = s;
5963 while(*s && *s != '\n') s++;
5966 const char * const t = ++s;
5984 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5986 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5987 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5989 PERL_UNUSED_ARG(sv);
5991 #endif /* DEBUGGING */
5995 Perl_re_intuit_string(pTHX_ regexp *prog)
5996 { /* Assume that RE_INTUIT is set */
5998 GET_RE_DEBUG_FLAGS_DECL;
6001 const char * const s = SvPV_nolen_const(prog->check_substr
6002 ? prog->check_substr : prog->check_utf8);
6004 if (!PL_colorset) reginitcolors();
6005 PerlIO_printf(Perl_debug_log,
6006 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6008 prog->check_substr ? "" : "utf8 ",
6009 PL_colors[5],PL_colors[0],
6012 (strlen(s) > 60 ? "..." : ""));
6015 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6019 Perl_pregfree(pTHX_ struct regexp *r)
6023 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6024 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6028 if (!r || (--r->refcnt > 0))
6030 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6031 const char * const s = (r->reganch & ROPT_UTF8)
6032 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6033 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6034 const int len = SvCUR(dsv);
6037 PerlIO_printf(Perl_debug_log,
6038 "%sFreeing REx:%s %s%*.*s%s%s\n",
6039 PL_colors[4],PL_colors[5],PL_colors[0],
6042 len > 60 ? "..." : "");
6045 /* gcov results gave these as non-null 100% of the time, so there's no
6046 optimisation in checking them before calling Safefree */
6047 Safefree(r->precomp);
6048 Safefree(r->offsets); /* 20010421 MJD */
6049 RX_MATCH_COPY_FREE(r);
6050 #ifdef PERL_OLD_COPY_ON_WRITE
6052 SvREFCNT_dec(r->saved_copy);
6055 if (r->anchored_substr)
6056 SvREFCNT_dec(r->anchored_substr);
6057 if (r->anchored_utf8)
6058 SvREFCNT_dec(r->anchored_utf8);
6059 if (r->float_substr)
6060 SvREFCNT_dec(r->float_substr);
6062 SvREFCNT_dec(r->float_utf8);
6063 Safefree(r->substrs);
6066 int n = r->data->count;
6067 PAD* new_comppad = NULL;
6072 /* If you add a ->what type here, update the comment in regcomp.h */
6073 switch (r->data->what[n]) {
6075 SvREFCNT_dec((SV*)r->data->data[n]);
6078 Safefree(r->data->data[n]);
6081 new_comppad = (AV*)r->data->data[n];
6084 if (new_comppad == NULL)
6085 Perl_croak(aTHX_ "panic: pregfree comppad");
6086 PAD_SAVE_LOCAL(old_comppad,
6087 /* Watch out for global destruction's random ordering. */
6088 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6091 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6094 op_free((OP_4tree*)r->data->data[n]);
6096 PAD_RESTORE_LOCAL(old_comppad);
6097 SvREFCNT_dec((SV*)new_comppad);
6104 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6107 refcount = --trie->refcount;
6110 Safefree(trie->charmap);
6111 if (trie->widecharmap)
6112 SvREFCNT_dec((SV*)trie->widecharmap);
6113 Safefree(trie->states);
6114 Safefree(trie->trans);
6117 SvREFCNT_dec((SV*)trie->words);
6118 if (trie->revcharmap)
6119 SvREFCNT_dec((SV*)trie->revcharmap);
6121 Safefree(r->data->data[n]); /* do this last!!!! */
6126 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6129 Safefree(r->data->what);
6132 Safefree(r->startp);
6138 - regnext - dig the "next" pointer out of a node
6141 Perl_regnext(pTHX_ register regnode *p)
6144 register I32 offset;
6146 if (p == &PL_regdummy)
6149 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6157 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6160 STRLEN l1 = strlen(pat1);
6161 STRLEN l2 = strlen(pat2);
6164 const char *message;
6170 Copy(pat1, buf, l1 , char);
6171 Copy(pat2, buf + l1, l2 , char);
6172 buf[l1 + l2] = '\n';
6173 buf[l1 + l2 + 1] = '\0';
6175 /* ANSI variant takes additional second argument */
6176 va_start(args, pat2);
6180 msv = vmess(buf, &args);
6182 message = SvPV_const(msv,l1);
6185 Copy(message, buf, l1 , char);
6186 buf[l1-1] = '\0'; /* Overwrite \n */
6187 Perl_croak(aTHX_ "%s", buf);
6190 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6193 Perl_save_re_context(pTHX)
6196 SAVEI32(PL_reg_flags); /* from regexec.c */
6198 SAVEPPTR(PL_reginput); /* String-input pointer. */
6199 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6200 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6201 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6202 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6203 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6204 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6205 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6206 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6207 PL_reg_start_tmp = 0;
6208 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6209 PL_reg_start_tmpl = 0;
6210 SAVEVPTR(PL_regdata);
6211 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6212 SAVEI32(PL_regnarrate); /* from regexec.c */
6213 SAVEVPTR(PL_regprogram); /* from regexec.c */
6214 SAVEINT(PL_regindent); /* from regexec.c */
6215 SAVEVPTR(PL_regcc); /* from regexec.c */
6216 SAVEVPTR(PL_curcop);
6217 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6218 SAVEVPTR(PL_reg_re); /* from regexec.c */
6219 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6220 SAVESPTR(PL_reg_sv); /* from regexec.c */
6221 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6222 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6223 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6224 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6225 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6226 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6227 PL_reg_oldsaved = NULL;
6228 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6229 PL_reg_oldsavedlen = 0;
6230 #ifdef PERL_OLD_COPY_ON_WRITE
6234 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6236 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6237 PL_reg_leftiter = 0;
6238 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6239 PL_reg_poscache = NULL;
6240 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6241 PL_reg_poscache_size = 0;
6242 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6243 SAVEI32(PL_regnpar); /* () count. */
6244 SAVEI32(PL_regsize); /* from regexec.c */
6246 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6248 const REGEXP * const rx = PM_GETRE(PL_curpm);
6251 for (i = 1; i <= rx->nparens; i++) {
6252 char digits[TYPE_CHARS(long)];
6253 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6254 GV * const mgv = gv_fetchpvn_flags(digits, len, 0, SVt_PV);
6262 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6267 clear_re(pTHX_ void *r)
6270 ReREFCNT_dec((regexp *)r);
6276 S_put_byte(pTHX_ SV *sv, int c)
6278 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6279 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6280 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6281 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6283 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6288 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6291 register U8 op = EXACT; /* Arbitrary non-END op. */
6292 register regnode *next;
6294 while (op != END && (!last || node < last)) {
6295 /* While that wasn't END last time... */
6301 next = regnext(node);
6303 if (OP(node) == OPTIMIZED)
6306 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6307 (int)(2*l + 1), "", SvPVX_const(sv));
6308 if (next == NULL) /* Next ptr. */
6309 PerlIO_printf(Perl_debug_log, "(0)");
6311 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6312 (void)PerlIO_putc(Perl_debug_log, '\n');
6314 if (PL_regkind[(U8)op] == BRANCHJ) {
6315 register regnode *nnode = (OP(next) == LONGJMP
6318 if (last && nnode > last)
6320 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6322 else if (PL_regkind[(U8)op] == BRANCH) {
6323 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6325 else if ( PL_regkind[(U8)op] == TRIE ) {
6326 const I32 n = ARG(node);
6327 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6328 const I32 arry_len = av_len(trie->words)+1;
6330 PerlIO_printf(Perl_debug_log,
6331 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6335 (int)trie->charcount,
6336 trie->uniquecharcount,
6337 (IV)trie->laststate-1,
6338 node->flags ? " EVAL mode" : "");
6340 for (word_idx=0; word_idx < arry_len; word_idx++) {
6341 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6343 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6346 SvPV_nolen_const(*elem_ptr),
6351 PerlIO_printf(Perl_debug_log, "(0)\n");
6353 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6359 node = NEXTOPER(node);
6360 node += regarglen[(U8)op];
6363 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6364 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6365 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6367 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6368 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6371 else if ( op == PLUS || op == STAR) {
6372 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6374 else if (op == ANYOF) {
6375 /* arglen 1 + class block */
6376 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6377 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6378 node = NEXTOPER(node);
6380 else if (PL_regkind[(U8)op] == EXACT) {
6381 /* Literal string, where present. */
6382 node += NODE_SZ_STR(node) - 1;
6383 node = NEXTOPER(node);
6386 node = NEXTOPER(node);
6387 node += regarglen[(U8)op];
6389 if (op == CURLYX || op == OPEN)
6391 else if (op == WHILEM)
6397 #endif /* DEBUGGING */
6401 * c-indentation-style: bsd
6403 * indent-tabs-mode: t
6406 * ex: set ts=8 sts=4 sw=4 noet: