5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
289 #define FAIL2(pat,msg) STMT_START { \
290 const char *ellipses = ""; \
291 IV len = RExC_end - RExC_precomp; \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 #define Simple_vFAIL(m) STMT_START { \
309 const IV offset = RExC_parse - RExC_precomp; \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 #define vFAIL(m) STMT_START { \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
324 * Like Simple_vFAIL(), but accepts two arguments.
326 #define Simple_vFAIL2(m,a1) STMT_START { \
327 const IV offset = RExC_parse - RExC_precomp; \
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 #define vFAIL2(m,a1) STMT_START { \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
346 const IV offset = RExC_parse - RExC_precomp; \
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 #define vFAIL3(m,a1,a2) STMT_START { \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
361 * Like Simple_vFAIL(), but accepts four arguments.
363 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
364 const IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 const IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 const IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 const IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 const IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 const IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 const IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (int)(node), (int)(len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 SvSetMagicSV(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 SV * const sv = data->last_found;
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
506 data->flags &= ~SF_BEFORE_EOL;
509 /* Can match anything (initialization) */
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 cl->flags |= ANYOF_LOCALE;
520 /* Can match anything (initialization) */
522 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
536 /* Can match anything (initialization) */
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 Zero(cl, 1, struct regnode_charclass_class);
542 cl_anything(pRExC_state, cl);
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 Zero(cl, 1, struct regnode_charclass_class);
550 cl_anything(pRExC_state, cl);
552 cl->flags |= ANYOF_LOCALE;
555 /* 'And' a given class with another one. Can create false positives */
556 /* We assume that cl is not inverted */
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 const struct regnode_charclass_class *and_with)
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
589 cl->flags &= ~ANYOF_UNICODE;
592 /* 'OR' a given class with another one. Can create false positives */
593 /* We assume that cl is not inverted */
595 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
597 if (or_with->flags & ANYOF_INVERT) {
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
616 cl_anything(pRExC_state, cl);
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && (!(or_with->flags & ANYOF_FOLD)
622 || (cl->flags & ANYOF_FOLD)) ) {
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
635 cl_anything(pRExC_state, cl);
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
663 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
665 A trie is an N'ary tree where the branches are determined by digital
666 decomposition of the key. IE, at the root node you look up the 1st character and
667 follow that branch repeat until you find the end of the branches. Nodes can be
668 marked as "accepting" meaning they represent a complete word. Eg:
672 would convert into the following structure. Numbers represent states, letters
673 following numbers represent valid transitions on the letter from that state, if
674 the number is in square brackets it represents an accepting state, otherwise it
675 will be in parenthesis.
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
681 (1) +-i->(6)-+-s->[7]
683 +-s->(3)-+-h->(4)-+-e->[5]
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
687 This shows that when matching against the string 'hers' we will begin at state 1
688 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691 single traverse. We store a mapping from accepting to state to which word was
692 matched, and then when we have multiple possibilities we try to complete the
693 rest of the regex in the order in which they occured in the alternation.
695 The only prior NFA like behaviour that would be changed by the TRIE support is
696 the silent ignoring of duplicate alternations which are of the form:
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
700 Thus EVAL blocks follwing a trie may be called a different number of times with
701 and without the optimisation. With the optimisations dupes will be silently
702 ignored. This inconsistant behaviour of EVAL type nodes is well established as
703 the following demonstrates:
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
707 which prints out 'word' three times, but
709 'words'=~/(word|word|word)(?{ print $1 })S/
711 which doesnt print it out at all. This is due to other optimisations kicking in.
713 Example of what happens on a structural level:
715 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
717 1: CURLYM[1] {1,32767}(18)
728 This would be optimizable with startbranch=5, first=5, last=16, tail=16
729 and should turn into:
731 1: CURLYM[1] {1,32767}(18)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
741 Cases where tail != last would be like /(?foo|bar)baz/:
751 which would be optimizable with startbranch=1, first=1, last=7, tail=8
752 and would end up looking like:
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
764 #define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
768 tmp = newSVpvn( "", 0 ); \
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
773 av_push( trie->revcharmap, tmp ); \
776 #define TRIE_READ_CHAR STMT_START { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
800 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
805 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
816 #define TRIE_LIST_NEW(state) STMT_START { \
817 Newxz( trie->states[ state ].trans.list, \
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
824 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
827 /* first pass, loop through and scan words */
830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
835 /* we just use folder as a flag in utf8 */
836 const U8 * const folder = ( flags == EXACTF
844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
847 GET_RE_DEBUG_FLAGS_DECL;
849 Newxz( trie, 1, reg_trie_data );
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
852 Newxz( trie->charmap, 256, U16 );
854 trie->words = newAV();
855 trie->revcharmap = newAV();
859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
860 if (!SvIOK(re_trie_maxbuff)) {
861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
864 /* -- First loop and Setup --
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
889 regnode * const noper = NEXTOPER( cur );
890 const U8 *uc = (U8*)STRING( noper );
891 const U8 * const e = uc + STR_LEN( noper );
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
894 const U8 *scan = (U8*)NULL;
896 for ( ; uc < e ; uc += len ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
927 (int)trie->charcount, trie->uniquecharcount )
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
936 conservative but potentially much slower representation using an array
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
955 Second Pass -- Array Of Lists Representation
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
969 STRLEN transcount = 1;
971 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
977 regnode * const noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
979 const U8 * const e = uc + STR_LEN( noper );
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
986 for ( ; uc < e ; uc += len ) {
991 charid = trie->charmap[ uvc ];
993 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
997 charid=(U16)SvIV( *svpp );
1006 if ( !trie->states[ state ].trans.list ) {
1007 TRIE_LIST_NEW( state );
1009 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1010 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1011 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1016 newstate = next_alloc++;
1017 TRIE_LIST_PUSH( state, charid, newstate );
1022 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1024 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1027 if ( !trie->states[ state ].wordnum ) {
1028 /* we havent inserted this word into the structure yet. */
1029 trie->states[ state ].wordnum = ++curword;
1032 /* store the word for dumping */
1033 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1034 if ( UTF ) SvUTF8_on( tmp );
1035 av_push( trie->words, tmp );
1039 /* Its a dupe. So ignore it. */
1042 } /* end second pass */
1044 trie->laststate = next_alloc;
1045 Renew( trie->states, next_alloc, reg_trie_state );
1047 DEBUG_TRIE_COMPILE_MORE_r({
1050 /* print out the table precompression. */
1052 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1053 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1055 for( state=1 ; state < next_alloc ; state ++ ) {
1058 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1059 if ( ! trie->states[ state ].wordnum ) {
1060 PerlIO_printf( Perl_debug_log, "%5s| ","");
1062 PerlIO_printf( Perl_debug_log, "W%04x| ",
1063 trie->states[ state ].wordnum
1066 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1067 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1068 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1069 SvPV_nolen_const( *tmp ),
1070 TRIE_LIST_ITEM(state,charid).forid,
1071 (UV)TRIE_LIST_ITEM(state,charid).newstate
1076 PerlIO_printf( Perl_debug_log, "\n\n" );
1079 Newxz( trie->trans, transcount ,reg_trie_trans );
1086 for( state=1 ; state < next_alloc ; state ++ ) {
1090 DEBUG_TRIE_COMPILE_MORE_r(
1091 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1095 if (trie->states[state].trans.list) {
1096 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1100 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1101 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1102 if ( forid < minid ) {
1104 } else if ( forid > maxid ) {
1108 if ( transcount < tp + maxid - minid + 1) {
1110 Renew( trie->trans, transcount, reg_trie_trans );
1111 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1113 base = trie->uniquecharcount + tp - minid;
1114 if ( maxid == minid ) {
1116 for ( ; zp < tp ; zp++ ) {
1117 if ( ! trie->trans[ zp ].next ) {
1118 base = trie->uniquecharcount + zp - minid;
1119 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1120 trie->trans[ zp ].check = state;
1126 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1127 trie->trans[ tp ].check = state;
1132 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1133 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1134 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1135 trie->trans[ tid ].check = state;
1137 tp += ( maxid - minid + 1 );
1139 Safefree(trie->states[ state ].trans.list);
1142 DEBUG_TRIE_COMPILE_MORE_r(
1143 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1146 trie->states[ state ].trans.base=base;
1148 trie->lasttrans = tp + 1;
1152 Second Pass -- Flat Table Representation.
1154 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1155 We know that we will need Charcount+1 trans at most to store the data
1156 (one row per char at worst case) So we preallocate both structures
1157 assuming worst case.
1159 We then construct the trie using only the .next slots of the entry
1162 We use the .check field of the first entry of the node temporarily to
1163 make compression both faster and easier by keeping track of how many non
1164 zero fields are in the node.
1166 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1169 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1170 number representing the first entry of the node, and state as a
1171 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1172 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1173 are 2 entrys per node. eg:
1181 The table is internally in the right hand, idx form. However as we also
1182 have to deal with the states array which is indexed by nodenum we have to
1183 use TRIE_NODENUM() to convert.
1187 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1189 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1190 next_alloc = trie->uniquecharcount + 1;
1192 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1194 regnode * const noper = NEXTOPER( cur );
1195 const U8 *uc = (U8*)STRING( noper );
1196 const U8 * const e = uc + STR_LEN( noper );
1198 U32 state = 1; /* required init */
1200 U16 charid = 0; /* sanity init */
1201 U32 accept_state = 0; /* sanity init */
1202 U8 *scan = (U8*)NULL; /* sanity init */
1204 STRLEN foldlen = 0; /* required init */
1205 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1208 for ( ; uc < e ; uc += len ) {
1213 charid = trie->charmap[ uvc ];
1215 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1216 charid = svpp ? (U16)SvIV(*svpp) : 0;
1220 if ( !trie->trans[ state + charid ].next ) {
1221 trie->trans[ state + charid ].next = next_alloc;
1222 trie->trans[ state ].check++;
1223 next_alloc += trie->uniquecharcount;
1225 state = trie->trans[ state + charid ].next;
1227 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1229 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1232 accept_state = TRIE_NODENUM( state );
1233 if ( !trie->states[ accept_state ].wordnum ) {
1234 /* we havent inserted this word into the structure yet. */
1235 trie->states[ accept_state ].wordnum = ++curword;
1238 /* store the word for dumping */
1239 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1240 if ( UTF ) SvUTF8_on( tmp );
1241 av_push( trie->words, tmp );
1245 /* Its a dupe. So ignore it. */
1248 } /* end second pass */
1250 DEBUG_TRIE_COMPILE_MORE_r({
1252 print out the table precompression so that we can do a visual check
1253 that they are identical.
1257 PerlIO_printf( Perl_debug_log, "\nChar : " );
1259 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1260 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1262 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1266 PerlIO_printf( Perl_debug_log, "\nState+-" );
1268 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1269 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1272 PerlIO_printf( Perl_debug_log, "\n" );
1274 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1276 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1278 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1279 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1280 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1282 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1283 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1285 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1286 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1289 PerlIO_printf( Perl_debug_log, "\n\n" );
1293 * Inplace compress the table.*
1295 For sparse data sets the table constructed by the trie algorithm will
1296 be mostly 0/FAIL transitions or to put it another way mostly empty.
1297 (Note that leaf nodes will not contain any transitions.)
1299 This algorithm compresses the tables by eliminating most such
1300 transitions, at the cost of a modest bit of extra work during lookup:
1302 - Each states[] entry contains a .base field which indicates the
1303 index in the state[] array wheres its transition data is stored.
1305 - If .base is 0 there are no valid transitions from that node.
1307 - If .base is nonzero then charid is added to it to find an entry in
1310 -If trans[states[state].base+charid].check!=state then the
1311 transition is taken to be a 0/Fail transition. Thus if there are fail
1312 transitions at the front of the node then the .base offset will point
1313 somewhere inside the previous nodes data (or maybe even into a node
1314 even earlier), but the .check field determines if the transition is
1317 The following process inplace converts the table to the compressed
1318 table: We first do not compress the root node 1,and mark its all its
1319 .check pointers as 1 and set its .base pointer as 1 as well. This
1320 allows to do a DFA construction from the compressed table later, and
1321 ensures that any .base pointers we calculate later are greater than
1324 - We set 'pos' to indicate the first entry of the second node.
1326 - We then iterate over the columns of the node, finding the first and
1327 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1328 and set the .check pointers accordingly, and advance pos
1329 appropriately and repreat for the next node. Note that when we copy
1330 the next pointers we have to convert them from the original
1331 NODEIDX form to NODENUM form as the former is not valid post
1334 - If a node has no transitions used we mark its base as 0 and do not
1335 advance the pos pointer.
1337 - If a node only has one transition we use a second pointer into the
1338 structure to fill in allocated fail transitions from other states.
1339 This pointer is independent of the main pointer and scans forward
1340 looking for null transitions that are allocated to a state. When it
1341 finds one it writes the single transition into the "hole". If the
1342 pointer doesnt find one the single transition is appeneded as normal.
1344 - Once compressed we can Renew/realloc the structures to release the
1347 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1348 specifically Fig 3.47 and the associated pseudocode.
1352 const U32 laststate = TRIE_NODENUM( next_alloc );
1355 trie->laststate = laststate;
1357 for ( state = 1 ; state < laststate ; state++ ) {
1359 const U32 stateidx = TRIE_NODEIDX( state );
1360 const U32 o_used = trie->trans[ stateidx ].check;
1361 U32 used = trie->trans[ stateidx ].check;
1362 trie->trans[ stateidx ].check = 0;
1364 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1365 if ( flag || trie->trans[ stateidx + charid ].next ) {
1366 if ( trie->trans[ stateidx + charid ].next ) {
1368 for ( ; zp < pos ; zp++ ) {
1369 if ( ! trie->trans[ zp ].next ) {
1373 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1374 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1375 trie->trans[ zp ].check = state;
1376 if ( ++zp > pos ) pos = zp;
1383 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1385 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1386 trie->trans[ pos ].check = state;
1391 trie->lasttrans = pos + 1;
1392 Renew( trie->states, laststate + 1, reg_trie_state);
1393 DEBUG_TRIE_COMPILE_MORE_r(
1394 PerlIO_printf( Perl_debug_log,
1395 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1396 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1399 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1402 } /* end table compress */
1404 /* resize the trans array to remove unused space */
1405 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1407 DEBUG_TRIE_COMPILE_r({
1410 Now we print it out again, in a slightly different form as there is additional
1411 info we want to be able to see when its compressed. They are close enough for
1412 visual comparison though.
1414 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1416 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1417 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1419 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1422 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1424 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1425 PerlIO_printf( Perl_debug_log, "-----");
1426 PerlIO_printf( Perl_debug_log, "\n");
1428 for( state = 1 ; state < trie->laststate ; state++ ) {
1429 const U32 base = trie->states[ state ].trans.base;
1431 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1433 if ( trie->states[ state ].wordnum ) {
1434 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1436 PerlIO_printf( Perl_debug_log, "%6s", "" );
1439 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1444 while( ( base + ofs < trie->uniquecharcount ) ||
1445 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1446 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1449 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1451 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1452 if ( ( base + ofs >= trie->uniquecharcount ) &&
1453 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1454 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1456 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1457 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1459 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1463 PerlIO_printf( Perl_debug_log, "]");
1466 PerlIO_printf( Perl_debug_log, "\n" );
1471 /* now finally we "stitch in" the new TRIE node
1472 This means we convert either the first branch or the first Exact,
1473 depending on whether the thing following (in 'last') is a branch
1474 or not and whther first is the startbranch (ie is it a sub part of
1475 the alternation or is it the whole thing.)
1476 Assuming its a sub part we conver the EXACT otherwise we convert
1477 the whole branch sequence, including the first.
1484 if ( first == startbranch && OP( last ) != BRANCH ) {
1487 convert = NEXTOPER( first );
1488 NEXT_OFF( first ) = (U16)(last - first);
1491 OP( convert ) = TRIE + (U8)( flags - EXACT );
1492 NEXT_OFF( convert ) = (U16)(tail - convert);
1493 ARG_SET( convert, data_slot );
1495 /* tells us if we need to handle accept buffers specially */
1496 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1499 /* needed for dumping*/
1501 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1502 /* We now need to mark all of the space originally used by the
1503 branches as optimized away. This keeps the dumpuntil from
1504 throwing a wobbly as it doesnt use regnext() to traverse the
1507 while( optimize < last ) {
1508 OP( optimize ) = OPTIMIZED;
1512 } /* end node insert */
1519 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1520 * These need to be revisited when a newer toolchain becomes available.
1522 #if defined(__sparc64__) && defined(__GNUC__)
1523 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1524 # undef SPARC64_GCC_WORKAROUND
1525 # define SPARC64_GCC_WORKAROUND 1
1529 /* REx optimizer. Converts nodes into quickier variants "in place".
1530 Finds fixed substrings. */
1532 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1533 to the position after last scanned or to NULL. */
1537 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1538 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1539 /* scanp: Start here (read-write). */
1540 /* deltap: Write maxlen-minlen here. */
1541 /* last: Stop before this one. */
1543 I32 min = 0, pars = 0, code;
1544 regnode *scan = *scanp, *next;
1546 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1547 int is_inf_internal = 0; /* The studied chunk is infinite */
1548 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1549 scan_data_t data_fake;
1550 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1551 SV *re_trie_maxbuff = NULL;
1553 GET_RE_DEBUG_FLAGS_DECL;
1555 while (scan && OP(scan) != END && scan < last) {
1556 /* Peephole optimizer: */
1558 SV * const mysv=sv_newmortal();
1559 regprop( mysv, scan);
1560 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1561 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1564 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1565 /* Merge several consecutive EXACTish nodes into one. */
1566 regnode *n = regnext(scan);
1569 regnode *stop = scan;
1572 next = scan + NODE_SZ_STR(scan);
1573 /* Skip NOTHING, merge EXACT*. */
1575 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1576 (stringok && (OP(n) == OP(scan))))
1578 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1579 if (OP(n) == TAIL || n > next)
1581 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1582 NEXT_OFF(scan) += NEXT_OFF(n);
1583 next = n + NODE_STEP_REGNODE;
1590 else if (stringok) {
1591 const int oldl = STR_LEN(scan);
1592 regnode * const nnext = regnext(n);
1594 if (oldl + STR_LEN(n) > U8_MAX)
1596 NEXT_OFF(scan) += NEXT_OFF(n);
1597 STR_LEN(scan) += STR_LEN(n);
1598 next = n + NODE_SZ_STR(n);
1599 /* Now we can overwrite *n : */
1600 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1608 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1610 Two problematic code points in Unicode casefolding of EXACT nodes:
1612 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1613 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1619 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1620 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1622 This means that in case-insensitive matching (or "loose matching",
1623 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1624 length of the above casefolded versions) can match a target string
1625 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1626 This would rather mess up the minimum length computation.
1628 What we'll do is to look for the tail four bytes, and then peek
1629 at the preceding two bytes to see whether we need to decrease
1630 the minimum length by four (six minus two).
1632 Thanks to the design of UTF-8, there cannot be false matches:
1633 A sequence of valid UTF-8 bytes cannot be a subsequence of
1634 another valid sequence of UTF-8 bytes.
1637 char * const s0 = STRING(scan), *s, *t;
1638 char * const s1 = s0 + STR_LEN(scan) - 1;
1639 char * const s2 = s1 - 4;
1640 const char * const t0 = "\xcc\x88\xcc\x81";
1641 const char * const t1 = t0 + 3;
1644 s < s2 && (t = ninstr(s, s1, t0, t1));
1646 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1647 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1654 n = scan + NODE_SZ_STR(scan);
1656 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1667 /* Follow the next-chain of the current node and optimize
1668 away all the NOTHINGs from it. */
1669 if (OP(scan) != CURLYX) {
1670 const int max = (reg_off_by_arg[OP(scan)]
1672 /* I32 may be smaller than U16 on CRAYs! */
1673 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1674 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1678 /* Skip NOTHING and LONGJMP. */
1679 while ((n = regnext(n))
1680 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1681 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1682 && off + noff < max)
1684 if (reg_off_by_arg[OP(scan)])
1687 NEXT_OFF(scan) = off;
1690 /* The principal pseudo-switch. Cannot be a switch, since we
1691 look into several different things. */
1692 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1693 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1694 next = regnext(scan);
1696 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1698 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1699 I32 max1 = 0, min1 = I32_MAX, num = 0;
1700 struct regnode_charclass_class accum;
1701 regnode *startbranch=scan;
1703 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1704 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1705 if (flags & SCF_DO_STCLASS)
1706 cl_init_zero(pRExC_state, &accum);
1708 while (OP(scan) == code) {
1709 I32 deltanext, minnext, f = 0, fake;
1710 struct regnode_charclass_class this_class;
1713 data_fake.flags = 0;
1715 data_fake.whilem_c = data->whilem_c;
1716 data_fake.last_closep = data->last_closep;
1719 data_fake.last_closep = &fake;
1720 next = regnext(scan);
1721 scan = NEXTOPER(scan);
1723 scan = NEXTOPER(scan);
1724 if (flags & SCF_DO_STCLASS) {
1725 cl_init(pRExC_state, &this_class);
1726 data_fake.start_class = &this_class;
1727 f = SCF_DO_STCLASS_AND;
1729 if (flags & SCF_WHILEM_VISITED_POS)
1730 f |= SCF_WHILEM_VISITED_POS;
1732 /* we suppose the run is continuous, last=next...*/
1733 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1734 next, &data_fake, f,depth+1);
1737 if (max1 < minnext + deltanext)
1738 max1 = minnext + deltanext;
1739 if (deltanext == I32_MAX)
1740 is_inf = is_inf_internal = 1;
1742 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1744 if (data && (data_fake.flags & SF_HAS_EVAL))
1745 data->flags |= SF_HAS_EVAL;
1747 data->whilem_c = data_fake.whilem_c;
1748 if (flags & SCF_DO_STCLASS)
1749 cl_or(pRExC_state, &accum, &this_class);
1750 if (code == SUSPEND)
1753 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1755 if (flags & SCF_DO_SUBSTR) {
1756 data->pos_min += min1;
1757 data->pos_delta += max1 - min1;
1758 if (max1 != min1 || is_inf)
1759 data->longest = &(data->longest_float);
1762 delta += max1 - min1;
1763 if (flags & SCF_DO_STCLASS_OR) {
1764 cl_or(pRExC_state, data->start_class, &accum);
1766 cl_and(data->start_class, &and_with);
1767 flags &= ~SCF_DO_STCLASS;
1770 else if (flags & SCF_DO_STCLASS_AND) {
1772 cl_and(data->start_class, &accum);
1773 flags &= ~SCF_DO_STCLASS;
1776 /* Switch to OR mode: cache the old value of
1777 * data->start_class */
1778 StructCopy(data->start_class, &and_with,
1779 struct regnode_charclass_class);
1780 flags &= ~SCF_DO_STCLASS_AND;
1781 StructCopy(&accum, data->start_class,
1782 struct regnode_charclass_class);
1783 flags |= SCF_DO_STCLASS_OR;
1784 data->start_class->flags |= ANYOF_EOS;
1790 Assuming this was/is a branch we are dealing with: 'scan' now
1791 points at the item that follows the branch sequence, whatever
1792 it is. We now start at the beginning of the sequence and look
1798 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1800 If we can find such a subseqence we need to turn the first
1801 element into a trie and then add the subsequent branch exact
1802 strings to the trie.
1806 1. patterns where the whole set of branch can be converted to a trie,
1808 2. patterns where only a subset of the alternations can be
1809 converted to a trie.
1811 In case 1 we can replace the whole set with a single regop
1812 for the trie. In case 2 we need to keep the start and end
1815 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1816 becomes BRANCH TRIE; BRANCH X;
1818 Hypthetically when we know the regex isnt anchored we can
1819 turn a case 1 into a DFA and let it rip... Every time it finds a match
1820 it would just call its tail, no WHILEM/CURLY needed.
1824 if (!re_trie_maxbuff) {
1825 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1826 if (!SvIOK(re_trie_maxbuff))
1827 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1829 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1831 regnode *first = (regnode *)NULL;
1832 regnode *last = (regnode *)NULL;
1833 regnode *tail = scan;
1838 SV * const mysv = sv_newmortal(); /* for dumping */
1840 /* var tail is used because there may be a TAIL
1841 regop in the way. Ie, the exacts will point to the
1842 thing following the TAIL, but the last branch will
1843 point at the TAIL. So we advance tail. If we
1844 have nested (?:) we may have to move through several
1848 while ( OP( tail ) == TAIL ) {
1849 /* this is the TAIL generated by (?:) */
1850 tail = regnext( tail );
1854 regprop( mysv, tail );
1855 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1856 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1857 (RExC_seen_evals) ? "[EVAL]" : ""
1862 step through the branches, cur represents each
1863 branch, noper is the first thing to be matched
1864 as part of that branch and noper_next is the
1865 regnext() of that node. if noper is an EXACT
1866 and noper_next is the same as scan (our current
1867 position in the regex) then the EXACT branch is
1868 a possible optimization target. Once we have
1869 two or more consequetive such branches we can
1870 create a trie of the EXACT's contents and stich
1871 it in place. If the sequence represents all of
1872 the branches we eliminate the whole thing and
1873 replace it with a single TRIE. If it is a
1874 subsequence then we need to stitch it in. This
1875 means the first branch has to remain, and needs
1876 to be repointed at the item on the branch chain
1877 following the last branch optimized. This could
1878 be either a BRANCH, in which case the
1879 subsequence is internal, or it could be the
1880 item following the branch sequence in which
1881 case the subsequence is at the end.
1885 /* dont use tail as the end marker for this traverse */
1886 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1887 regnode * const noper = NEXTOPER( cur );
1888 regnode * const noper_next = regnext( noper );
1891 regprop( mysv, cur);
1892 PerlIO_printf( Perl_debug_log, "%*s%s",
1893 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1895 regprop( mysv, noper);
1896 PerlIO_printf( Perl_debug_log, " -> %s",
1897 SvPV_nolen_const(mysv));
1900 regprop( mysv, noper_next );
1901 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1902 SvPV_nolen_const(mysv));
1904 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1907 if ( ( first ? OP( noper ) == optype
1908 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1909 && noper_next == tail && count<U16_MAX)
1914 optype = OP( noper );
1918 regprop( mysv, first);
1919 PerlIO_printf( Perl_debug_log, "%*s%s",
1920 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1921 regprop( mysv, NEXTOPER(first) );
1922 PerlIO_printf( Perl_debug_log, " -> %s\n",
1923 SvPV_nolen_const( mysv ) );
1928 regprop( mysv, cur);
1929 PerlIO_printf( Perl_debug_log, "%*s%s",
1930 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1931 regprop( mysv, noper );
1932 PerlIO_printf( Perl_debug_log, " -> %s\n",
1933 SvPV_nolen_const( mysv ) );
1939 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1940 (int)depth * 2 + 2, "E:", "**END**" );
1942 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1944 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1945 && noper_next == tail )
1949 optype = OP( noper );
1959 regprop( mysv, cur);
1960 PerlIO_printf( Perl_debug_log,
1961 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1962 " ", SvPV_nolen_const( mysv ), first, last, cur);
1967 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1968 (int)depth * 2 + 2, "E:", "==END==" );
1970 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1975 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1976 scan = NEXTOPER(NEXTOPER(scan));
1977 } else /* single branch is optimized. */
1978 scan = NEXTOPER(scan);
1981 else if (OP(scan) == EXACT) {
1982 I32 l = STR_LEN(scan);
1985 const U8 * const s = (U8*)STRING(scan);
1986 l = utf8_length(s, s + l);
1987 uc = utf8_to_uvchr(s, NULL);
1989 uc = *((U8*)STRING(scan));
1992 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1993 /* The code below prefers earlier match for fixed
1994 offset, later match for variable offset. */
1995 if (data->last_end == -1) { /* Update the start info. */
1996 data->last_start_min = data->pos_min;
1997 data->last_start_max = is_inf
1998 ? I32_MAX : data->pos_min + data->pos_delta;
2000 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2002 SV * const sv = data->last_found;
2003 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2004 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2005 if (mg && mg->mg_len >= 0)
2006 mg->mg_len += utf8_length((U8*)STRING(scan),
2007 (U8*)STRING(scan)+STR_LEN(scan));
2010 SvUTF8_on(data->last_found);
2011 data->last_end = data->pos_min + l;
2012 data->pos_min += l; /* As in the first entry. */
2013 data->flags &= ~SF_BEFORE_EOL;
2015 if (flags & SCF_DO_STCLASS_AND) {
2016 /* Check whether it is compatible with what we know already! */
2020 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2021 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2022 && (!(data->start_class->flags & ANYOF_FOLD)
2023 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2026 ANYOF_CLASS_ZERO(data->start_class);
2027 ANYOF_BITMAP_ZERO(data->start_class);
2029 ANYOF_BITMAP_SET(data->start_class, uc);
2030 data->start_class->flags &= ~ANYOF_EOS;
2032 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2034 else if (flags & SCF_DO_STCLASS_OR) {
2035 /* false positive possible if the class is case-folded */
2037 ANYOF_BITMAP_SET(data->start_class, uc);
2039 data->start_class->flags |= ANYOF_UNICODE_ALL;
2040 data->start_class->flags &= ~ANYOF_EOS;
2041 cl_and(data->start_class, &and_with);
2043 flags &= ~SCF_DO_STCLASS;
2045 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2046 I32 l = STR_LEN(scan);
2047 UV uc = *((U8*)STRING(scan));
2049 /* Search for fixed substrings supports EXACT only. */
2050 if (flags & SCF_DO_SUBSTR)
2051 scan_commit(pRExC_state, data);
2053 U8 *s = (U8 *)STRING(scan);
2054 l = utf8_length(s, s + l);
2055 uc = utf8_to_uvchr(s, NULL);
2058 if (data && (flags & SCF_DO_SUBSTR))
2060 if (flags & SCF_DO_STCLASS_AND) {
2061 /* Check whether it is compatible with what we know already! */
2065 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2066 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2067 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2069 ANYOF_CLASS_ZERO(data->start_class);
2070 ANYOF_BITMAP_ZERO(data->start_class);
2072 ANYOF_BITMAP_SET(data->start_class, uc);
2073 data->start_class->flags &= ~ANYOF_EOS;
2074 data->start_class->flags |= ANYOF_FOLD;
2075 if (OP(scan) == EXACTFL)
2076 data->start_class->flags |= ANYOF_LOCALE;
2079 else if (flags & SCF_DO_STCLASS_OR) {
2080 if (data->start_class->flags & ANYOF_FOLD) {
2081 /* false positive possible if the class is case-folded.
2082 Assume that the locale settings are the same... */
2084 ANYOF_BITMAP_SET(data->start_class, uc);
2085 data->start_class->flags &= ~ANYOF_EOS;
2087 cl_and(data->start_class, &and_with);
2089 flags &= ~SCF_DO_STCLASS;
2091 else if (strchr((const char*)PL_varies,OP(scan))) {
2092 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2093 I32 f = flags, pos_before = 0;
2094 regnode *oscan = scan;
2095 struct regnode_charclass_class this_class;
2096 struct regnode_charclass_class *oclass = NULL;
2097 I32 next_is_eval = 0;
2099 switch (PL_regkind[(U8)OP(scan)]) {
2100 case WHILEM: /* End of (?:...)* . */
2101 scan = NEXTOPER(scan);
2104 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2105 next = NEXTOPER(scan);
2106 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2108 maxcount = REG_INFTY;
2109 next = regnext(scan);
2110 scan = NEXTOPER(scan);
2114 if (flags & SCF_DO_SUBSTR)
2119 if (flags & SCF_DO_STCLASS) {
2121 maxcount = REG_INFTY;
2122 next = regnext(scan);
2123 scan = NEXTOPER(scan);
2126 is_inf = is_inf_internal = 1;
2127 scan = regnext(scan);
2128 if (flags & SCF_DO_SUBSTR) {
2129 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2130 data->longest = &(data->longest_float);
2132 goto optimize_curly_tail;
2134 mincount = ARG1(scan);
2135 maxcount = ARG2(scan);
2136 next = regnext(scan);
2137 if (OP(scan) == CURLYX) {
2138 I32 lp = (data ? *(data->last_closep) : 0);
2139 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2141 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2142 next_is_eval = (OP(scan) == EVAL);
2144 if (flags & SCF_DO_SUBSTR) {
2145 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2146 pos_before = data->pos_min;
2150 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2152 data->flags |= SF_IS_INF;
2154 if (flags & SCF_DO_STCLASS) {
2155 cl_init(pRExC_state, &this_class);
2156 oclass = data->start_class;
2157 data->start_class = &this_class;
2158 f |= SCF_DO_STCLASS_AND;
2159 f &= ~SCF_DO_STCLASS_OR;
2161 /* These are the cases when once a subexpression
2162 fails at a particular position, it cannot succeed
2163 even after backtracking at the enclosing scope.
2165 XXXX what if minimal match and we are at the
2166 initial run of {n,m}? */
2167 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2168 f &= ~SCF_WHILEM_VISITED_POS;
2170 /* This will finish on WHILEM, setting scan, or on NULL: */
2171 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2173 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2175 if (flags & SCF_DO_STCLASS)
2176 data->start_class = oclass;
2177 if (mincount == 0 || minnext == 0) {
2178 if (flags & SCF_DO_STCLASS_OR) {
2179 cl_or(pRExC_state, data->start_class, &this_class);
2181 else if (flags & SCF_DO_STCLASS_AND) {
2182 /* Switch to OR mode: cache the old value of
2183 * data->start_class */
2184 StructCopy(data->start_class, &and_with,
2185 struct regnode_charclass_class);
2186 flags &= ~SCF_DO_STCLASS_AND;
2187 StructCopy(&this_class, data->start_class,
2188 struct regnode_charclass_class);
2189 flags |= SCF_DO_STCLASS_OR;
2190 data->start_class->flags |= ANYOF_EOS;
2192 } else { /* Non-zero len */
2193 if (flags & SCF_DO_STCLASS_OR) {
2194 cl_or(pRExC_state, data->start_class, &this_class);
2195 cl_and(data->start_class, &and_with);
2197 else if (flags & SCF_DO_STCLASS_AND)
2198 cl_and(data->start_class, &this_class);
2199 flags &= ~SCF_DO_STCLASS;
2201 if (!scan) /* It was not CURLYX, but CURLY. */
2203 if ( /* ? quantifier ok, except for (?{ ... }) */
2204 (next_is_eval || !(mincount == 0 && maxcount == 1))
2205 && (minnext == 0) && (deltanext == 0)
2206 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2207 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2208 && ckWARN(WARN_REGEXP))
2211 "Quantifier unexpected on zero-length expression");
2214 min += minnext * mincount;
2215 is_inf_internal |= ((maxcount == REG_INFTY
2216 && (minnext + deltanext) > 0)
2217 || deltanext == I32_MAX);
2218 is_inf |= is_inf_internal;
2219 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2221 /* Try powerful optimization CURLYX => CURLYN. */
2222 if ( OP(oscan) == CURLYX && data
2223 && data->flags & SF_IN_PAR
2224 && !(data->flags & SF_HAS_EVAL)
2225 && !deltanext && minnext == 1 ) {
2226 /* Try to optimize to CURLYN. */
2227 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2228 regnode *nxt1 = nxt;
2235 if (!strchr((const char*)PL_simple,OP(nxt))
2236 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2237 && STR_LEN(nxt) == 1))
2243 if (OP(nxt) != CLOSE)
2245 /* Now we know that nxt2 is the only contents: */
2246 oscan->flags = (U8)ARG(nxt);
2248 OP(nxt1) = NOTHING; /* was OPEN. */
2250 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2251 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2252 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2253 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2254 OP(nxt + 1) = OPTIMIZED; /* was count. */
2255 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2260 /* Try optimization CURLYX => CURLYM. */
2261 if ( OP(oscan) == CURLYX && data
2262 && !(data->flags & SF_HAS_PAR)
2263 && !(data->flags & SF_HAS_EVAL)
2264 && !deltanext /* atom is fixed width */
2265 && minnext != 0 /* CURLYM can't handle zero width */
2267 /* XXXX How to optimize if data == 0? */
2268 /* Optimize to a simpler form. */
2269 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2273 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2274 && (OP(nxt2) != WHILEM))
2276 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2277 /* Need to optimize away parenths. */
2278 if (data->flags & SF_IN_PAR) {
2279 /* Set the parenth number. */
2280 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2282 if (OP(nxt) != CLOSE)
2283 FAIL("Panic opt close");
2284 oscan->flags = (U8)ARG(nxt);
2285 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2286 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2288 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2289 OP(nxt + 1) = OPTIMIZED; /* was count. */
2290 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2291 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2294 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2295 regnode *nnxt = regnext(nxt1);
2298 if (reg_off_by_arg[OP(nxt1)])
2299 ARG_SET(nxt1, nxt2 - nxt1);
2300 else if (nxt2 - nxt1 < U16_MAX)
2301 NEXT_OFF(nxt1) = nxt2 - nxt1;
2303 OP(nxt) = NOTHING; /* Cannot beautify */
2308 /* Optimize again: */
2309 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2315 else if ((OP(oscan) == CURLYX)
2316 && (flags & SCF_WHILEM_VISITED_POS)
2317 /* See the comment on a similar expression above.
2318 However, this time it not a subexpression
2319 we care about, but the expression itself. */
2320 && (maxcount == REG_INFTY)
2321 && data && ++data->whilem_c < 16) {
2322 /* This stays as CURLYX, we can put the count/of pair. */
2323 /* Find WHILEM (as in regexec.c) */
2324 regnode *nxt = oscan + NEXT_OFF(oscan);
2326 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2328 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2329 | (RExC_whilem_seen << 4)); /* On WHILEM */
2331 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2333 if (flags & SCF_DO_SUBSTR) {
2334 SV *last_str = NULL;
2335 int counted = mincount != 0;
2337 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2338 #if defined(SPARC64_GCC_WORKAROUND)
2341 const char *s = NULL;
2344 if (pos_before >= data->last_start_min)
2347 b = data->last_start_min;
2350 s = SvPV_const(data->last_found, l);
2351 old = b - data->last_start_min;
2354 I32 b = pos_before >= data->last_start_min
2355 ? pos_before : data->last_start_min;
2357 const char *s = SvPV_const(data->last_found, l);
2358 I32 old = b - data->last_start_min;
2362 old = utf8_hop((U8*)s, old) - (U8*)s;
2365 /* Get the added string: */
2366 last_str = newSVpvn(s + old, l);
2368 SvUTF8_on(last_str);
2369 if (deltanext == 0 && pos_before == b) {
2370 /* What was added is a constant string */
2372 SvGROW(last_str, (mincount * l) + 1);
2373 repeatcpy(SvPVX(last_str) + l,
2374 SvPVX_const(last_str), l, mincount - 1);
2375 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2376 /* Add additional parts. */
2377 SvCUR_set(data->last_found,
2378 SvCUR(data->last_found) - l);
2379 sv_catsv(data->last_found, last_str);
2381 SV * sv = data->last_found;
2383 SvUTF8(sv) && SvMAGICAL(sv) ?
2384 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2385 if (mg && mg->mg_len >= 0)
2386 mg->mg_len += CHR_SVLEN(last_str);
2388 data->last_end += l * (mincount - 1);
2391 /* start offset must point into the last copy */
2392 data->last_start_min += minnext * (mincount - 1);
2393 data->last_start_max += is_inf ? I32_MAX
2394 : (maxcount - 1) * (minnext + data->pos_delta);
2397 /* It is counted once already... */
2398 data->pos_min += minnext * (mincount - counted);
2399 data->pos_delta += - counted * deltanext +
2400 (minnext + deltanext) * maxcount - minnext * mincount;
2401 if (mincount != maxcount) {
2402 /* Cannot extend fixed substrings found inside
2404 scan_commit(pRExC_state,data);
2405 if (mincount && last_str) {
2406 sv_setsv(data->last_found, last_str);
2407 data->last_end = data->pos_min;
2408 data->last_start_min =
2409 data->pos_min - CHR_SVLEN(last_str);
2410 data->last_start_max = is_inf
2412 : data->pos_min + data->pos_delta
2413 - CHR_SVLEN(last_str);
2415 data->longest = &(data->longest_float);
2417 SvREFCNT_dec(last_str);
2419 if (data && (fl & SF_HAS_EVAL))
2420 data->flags |= SF_HAS_EVAL;
2421 optimize_curly_tail:
2422 if (OP(oscan) != CURLYX) {
2423 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2425 NEXT_OFF(oscan) += NEXT_OFF(next);
2428 default: /* REF and CLUMP only? */
2429 if (flags & SCF_DO_SUBSTR) {
2430 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2431 data->longest = &(data->longest_float);
2433 is_inf = is_inf_internal = 1;
2434 if (flags & SCF_DO_STCLASS_OR)
2435 cl_anything(pRExC_state, data->start_class);
2436 flags &= ~SCF_DO_STCLASS;
2440 else if (strchr((const char*)PL_simple,OP(scan))) {
2443 if (flags & SCF_DO_SUBSTR) {
2444 scan_commit(pRExC_state,data);
2448 if (flags & SCF_DO_STCLASS) {
2449 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2451 /* Some of the logic below assumes that switching
2452 locale on will only add false positives. */
2453 switch (PL_regkind[(U8)OP(scan)]) {
2457 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2458 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2459 cl_anything(pRExC_state, data->start_class);
2462 if (OP(scan) == SANY)
2464 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2465 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2466 || (data->start_class->flags & ANYOF_CLASS));
2467 cl_anything(pRExC_state, data->start_class);
2469 if (flags & SCF_DO_STCLASS_AND || !value)
2470 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2473 if (flags & SCF_DO_STCLASS_AND)
2474 cl_and(data->start_class,
2475 (struct regnode_charclass_class*)scan);
2477 cl_or(pRExC_state, data->start_class,
2478 (struct regnode_charclass_class*)scan);
2481 if (flags & SCF_DO_STCLASS_AND) {
2482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2484 for (value = 0; value < 256; value++)
2485 if (!isALNUM(value))
2486 ANYOF_BITMAP_CLEAR(data->start_class, value);
2490 if (data->start_class->flags & ANYOF_LOCALE)
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2493 for (value = 0; value < 256; value++)
2495 ANYOF_BITMAP_SET(data->start_class, value);
2500 if (flags & SCF_DO_STCLASS_AND) {
2501 if (data->start_class->flags & ANYOF_LOCALE)
2502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2505 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2506 data->start_class->flags |= ANYOF_LOCALE;
2510 if (flags & SCF_DO_STCLASS_AND) {
2511 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2513 for (value = 0; value < 256; value++)
2515 ANYOF_BITMAP_CLEAR(data->start_class, value);
2519 if (data->start_class->flags & ANYOF_LOCALE)
2520 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2522 for (value = 0; value < 256; value++)
2523 if (!isALNUM(value))
2524 ANYOF_BITMAP_SET(data->start_class, value);
2529 if (flags & SCF_DO_STCLASS_AND) {
2530 if (data->start_class->flags & ANYOF_LOCALE)
2531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2534 data->start_class->flags |= ANYOF_LOCALE;
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2539 if (flags & SCF_DO_STCLASS_AND) {
2540 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2542 for (value = 0; value < 256; value++)
2543 if (!isSPACE(value))
2544 ANYOF_BITMAP_CLEAR(data->start_class, value);
2548 if (data->start_class->flags & ANYOF_LOCALE)
2549 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2551 for (value = 0; value < 256; value++)
2553 ANYOF_BITMAP_SET(data->start_class, value);
2558 if (flags & SCF_DO_STCLASS_AND) {
2559 if (data->start_class->flags & ANYOF_LOCALE)
2560 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2563 data->start_class->flags |= ANYOF_LOCALE;
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2568 if (flags & SCF_DO_STCLASS_AND) {
2569 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2571 for (value = 0; value < 256; value++)
2573 ANYOF_BITMAP_CLEAR(data->start_class, value);
2577 if (data->start_class->flags & ANYOF_LOCALE)
2578 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2580 for (value = 0; value < 256; value++)
2581 if (!isSPACE(value))
2582 ANYOF_BITMAP_SET(data->start_class, value);
2587 if (flags & SCF_DO_STCLASS_AND) {
2588 if (data->start_class->flags & ANYOF_LOCALE) {
2589 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2590 for (value = 0; value < 256; value++)
2591 if (!isSPACE(value))
2592 ANYOF_BITMAP_CLEAR(data->start_class, value);
2596 data->start_class->flags |= ANYOF_LOCALE;
2597 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2601 if (flags & SCF_DO_STCLASS_AND) {
2602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2603 for (value = 0; value < 256; value++)
2604 if (!isDIGIT(value))
2605 ANYOF_BITMAP_CLEAR(data->start_class, value);
2608 if (data->start_class->flags & ANYOF_LOCALE)
2609 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2611 for (value = 0; value < 256; value++)
2613 ANYOF_BITMAP_SET(data->start_class, value);
2618 if (flags & SCF_DO_STCLASS_AND) {
2619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2620 for (value = 0; value < 256; value++)
2622 ANYOF_BITMAP_CLEAR(data->start_class, value);
2625 if (data->start_class->flags & ANYOF_LOCALE)
2626 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2628 for (value = 0; value < 256; value++)
2629 if (!isDIGIT(value))
2630 ANYOF_BITMAP_SET(data->start_class, value);
2635 if (flags & SCF_DO_STCLASS_OR)
2636 cl_and(data->start_class, &and_with);
2637 flags &= ~SCF_DO_STCLASS;
2640 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2641 data->flags |= (OP(scan) == MEOL
2645 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2646 /* Lookbehind, or need to calculate parens/evals/stclass: */
2647 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2648 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2649 /* Lookahead/lookbehind */
2650 I32 deltanext, minnext, fake = 0;
2652 struct regnode_charclass_class intrnl;
2655 data_fake.flags = 0;
2657 data_fake.whilem_c = data->whilem_c;
2658 data_fake.last_closep = data->last_closep;
2661 data_fake.last_closep = &fake;
2662 if ( flags & SCF_DO_STCLASS && !scan->flags
2663 && OP(scan) == IFMATCH ) { /* Lookahead */
2664 cl_init(pRExC_state, &intrnl);
2665 data_fake.start_class = &intrnl;
2666 f |= SCF_DO_STCLASS_AND;
2668 if (flags & SCF_WHILEM_VISITED_POS)
2669 f |= SCF_WHILEM_VISITED_POS;
2670 next = regnext(scan);
2671 nscan = NEXTOPER(NEXTOPER(scan));
2672 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2675 vFAIL("Variable length lookbehind not implemented");
2677 else if (minnext > U8_MAX) {
2678 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2680 scan->flags = (U8)minnext;
2682 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2684 if (data && (data_fake.flags & SF_HAS_EVAL))
2685 data->flags |= SF_HAS_EVAL;
2687 data->whilem_c = data_fake.whilem_c;
2688 if (f & SCF_DO_STCLASS_AND) {
2689 const int was = (data->start_class->flags & ANYOF_EOS);
2691 cl_and(data->start_class, &intrnl);
2693 data->start_class->flags |= ANYOF_EOS;
2696 else if (OP(scan) == OPEN) {
2699 else if (OP(scan) == CLOSE) {
2700 if ((I32)ARG(scan) == is_par) {
2701 next = regnext(scan);
2703 if ( next && (OP(next) != WHILEM) && next < last)
2704 is_par = 0; /* Disable optimization */
2707 *(data->last_closep) = ARG(scan);
2709 else if (OP(scan) == EVAL) {
2711 data->flags |= SF_HAS_EVAL;
2713 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2714 if (flags & SCF_DO_SUBSTR) {
2715 scan_commit(pRExC_state,data);
2716 data->longest = &(data->longest_float);
2718 is_inf = is_inf_internal = 1;
2719 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2720 cl_anything(pRExC_state, data->start_class);
2721 flags &= ~SCF_DO_STCLASS;
2723 /* Else: zero-length, ignore. */
2724 scan = regnext(scan);
2729 *deltap = is_inf_internal ? I32_MAX : delta;
2730 if (flags & SCF_DO_SUBSTR && is_inf)
2731 data->pos_delta = I32_MAX - data->pos_min;
2732 if (is_par > U8_MAX)
2734 if (is_par && pars==1 && data) {
2735 data->flags |= SF_IN_PAR;
2736 data->flags &= ~SF_HAS_PAR;
2738 else if (pars && data) {
2739 data->flags |= SF_HAS_PAR;
2740 data->flags &= ~SF_IN_PAR;
2742 if (flags & SCF_DO_STCLASS_OR)
2743 cl_and(data->start_class, &and_with);
2748 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2750 if (RExC_rx->data) {
2751 Renewc(RExC_rx->data,
2752 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2753 char, struct reg_data);
2754 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2755 RExC_rx->data->count += n;
2758 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2759 char, struct reg_data);
2760 Newx(RExC_rx->data->what, n, U8);
2761 RExC_rx->data->count = n;
2763 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2764 return RExC_rx->data->count - n;
2768 Perl_reginitcolors(pTHX)
2770 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2772 char *t = savepv(s);
2776 t = strchr(t, '\t');
2782 PL_colors[i] = t = (char *)"";
2787 PL_colors[i++] = (char *)"";
2794 - pregcomp - compile a regular expression into internal code
2796 * We can't allocate space until we know how big the compiled form will be,
2797 * but we can't compile it (and thus know how big it is) until we've got a
2798 * place to put the code. So we cheat: we compile it twice, once with code
2799 * generation turned off and size counting turned on, and once "for real".
2800 * This also means that we don't allocate space until we are sure that the
2801 * thing really will compile successfully, and we never have to move the
2802 * code and thus invalidate pointers into it. (Note that it has to be in
2803 * one piece because free() must be able to free it all.) [NB: not true in perl]
2805 * Beware that the optimization-preparation code in here knows about some
2806 * of the structure of the compiled regexp. [I'll say.]
2809 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2819 RExC_state_t RExC_state;
2820 RExC_state_t *pRExC_state = &RExC_state;
2822 GET_RE_DEBUG_FLAGS_DECL;
2825 FAIL("NULL regexp argument");
2827 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2830 DEBUG_r(if (!PL_colorset) reginitcolors());
2832 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2833 PL_colors[4],PL_colors[5],PL_colors[0],
2834 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2836 RExC_flags = pm->op_pmflags;
2840 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2841 RExC_seen_evals = 0;
2844 /* First pass: determine size, legality. */
2851 RExC_emit = &PL_regdummy;
2852 RExC_whilem_seen = 0;
2853 #if 0 /* REGC() is (currently) a NOP at the first pass.
2854 * Clever compilers notice this and complain. --jhi */
2855 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2857 if (reg(pRExC_state, 0, &flags) == NULL) {
2858 RExC_precomp = NULL;
2861 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2863 /* Small enough for pointer-storage convention?
2864 If extralen==0, this means that we will not need long jumps. */
2865 if (RExC_size >= 0x10000L && RExC_extralen)
2866 RExC_size += RExC_extralen;
2869 if (RExC_whilem_seen > 15)
2870 RExC_whilem_seen = 15;
2872 /* Allocate space and initialize. */
2873 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2876 FAIL("Regexp out of space");
2879 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2880 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2883 r->prelen = xend - exp;
2884 r->precomp = savepvn(RExC_precomp, r->prelen);
2886 #ifdef PERL_OLD_COPY_ON_WRITE
2887 r->saved_copy = NULL;
2889 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2890 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2892 r->substrs = 0; /* Useful during FAIL. */
2893 r->startp = 0; /* Useful during FAIL. */
2894 r->endp = 0; /* Useful during FAIL. */
2896 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2898 r->offsets[0] = RExC_size;
2900 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2901 "%s %"UVuf" bytes for offset annotations.\n",
2902 r->offsets ? "Got" : "Couldn't get",
2903 (UV)((2*RExC_size+1) * sizeof(U32))));
2907 /* Second pass: emit code. */
2908 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2913 RExC_emit_start = r->program;
2914 RExC_emit = r->program;
2915 /* Store the count of eval-groups for security checks: */
2916 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2917 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2919 if (reg(pRExC_state, 0, &flags) == NULL)
2923 /* Dig out information for optimizations. */
2924 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2925 pm->op_pmflags = RExC_flags;
2927 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2928 r->regstclass = NULL;
2929 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2930 r->reganch |= ROPT_NAUGHTY;
2931 scan = r->program + 1; /* First BRANCH. */
2933 /* XXXX To minimize changes to RE engine we always allocate
2934 3-units-long substrs field. */
2935 Newxz(r->substrs, 1, struct reg_substr_data);
2937 StructCopy(&zero_scan_data, &data, scan_data_t);
2938 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2939 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2941 STRLEN longest_float_length, longest_fixed_length;
2942 struct regnode_charclass_class ch_class;
2947 /* Skip introductions and multiplicators >= 1. */
2948 while ((OP(first) == OPEN && (sawopen = 1)) ||
2949 /* An OR of *one* alternative - should not happen now. */
2950 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2951 (OP(first) == PLUS) ||
2952 (OP(first) == MINMOD) ||
2953 /* An {n,m} with n>0 */
2954 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2955 if (OP(first) == PLUS)
2958 first += regarglen[(U8)OP(first)];
2959 first = NEXTOPER(first);
2962 /* Starting-point info. */
2964 if (PL_regkind[(U8)OP(first)] == EXACT) {
2965 if (OP(first) == EXACT)
2966 ; /* Empty, get anchored substr later. */
2967 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2968 r->regstclass = first;
2970 else if (strchr((const char*)PL_simple,OP(first)))
2971 r->regstclass = first;
2972 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2973 PL_regkind[(U8)OP(first)] == NBOUND)
2974 r->regstclass = first;
2975 else if (PL_regkind[(U8)OP(first)] == BOL) {
2976 r->reganch |= (OP(first) == MBOL
2978 : (OP(first) == SBOL
2981 first = NEXTOPER(first);
2984 else if (OP(first) == GPOS) {
2985 r->reganch |= ROPT_ANCH_GPOS;
2986 first = NEXTOPER(first);
2989 else if (!sawopen && (OP(first) == STAR &&
2990 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2991 !(r->reganch & ROPT_ANCH) )
2993 /* turn .* into ^.* with an implied $*=1 */
2995 (OP(NEXTOPER(first)) == REG_ANY)
2998 r->reganch |= type | ROPT_IMPLICIT;
2999 first = NEXTOPER(first);
3002 if (sawplus && (!sawopen || !RExC_sawback)
3003 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3004 /* x+ must match at the 1st pos of run of x's */
3005 r->reganch |= ROPT_SKIP;
3007 /* Scan is after the zeroth branch, first is atomic matcher. */
3008 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3009 (IV)(first - scan + 1)));
3011 * If there's something expensive in the r.e., find the
3012 * longest literal string that must appear and make it the
3013 * regmust. Resolve ties in favor of later strings, since
3014 * the regstart check works with the beginning of the r.e.
3015 * and avoiding duplication strengthens checking. Not a
3016 * strong reason, but sufficient in the absence of others.
3017 * [Now we resolve ties in favor of the earlier string if
3018 * it happens that c_offset_min has been invalidated, since the
3019 * earlier string may buy us something the later one won't.]
3023 data.longest_fixed = newSVpvs("");
3024 data.longest_float = newSVpvs("");
3025 data.last_found = newSVpvs("");
3026 data.longest = &(data.longest_fixed);
3028 if (!r->regstclass) {
3029 cl_init(pRExC_state, &ch_class);
3030 data.start_class = &ch_class;
3031 stclass_flag = SCF_DO_STCLASS_AND;
3032 } else /* XXXX Check for BOUND? */
3034 data.last_closep = &last_close;
3036 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3037 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3038 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3039 && data.last_start_min == 0 && data.last_end > 0
3040 && !RExC_seen_zerolen
3041 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3042 r->reganch |= ROPT_CHECK_ALL;
3043 scan_commit(pRExC_state, &data);
3044 SvREFCNT_dec(data.last_found);
3046 longest_float_length = CHR_SVLEN(data.longest_float);
3047 if (longest_float_length
3048 || (data.flags & SF_FL_BEFORE_EOL
3049 && (!(data.flags & SF_FL_BEFORE_MEOL)
3050 || (RExC_flags & PMf_MULTILINE)))) {
3053 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3054 && data.offset_fixed == data.offset_float_min
3055 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3056 goto remove_float; /* As in (a)+. */
3058 if (SvUTF8(data.longest_float)) {
3059 r->float_utf8 = data.longest_float;
3060 r->float_substr = NULL;
3062 r->float_substr = data.longest_float;
3063 r->float_utf8 = NULL;
3065 r->float_min_offset = data.offset_float_min;
3066 r->float_max_offset = data.offset_float_max;
3067 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3068 && (!(data.flags & SF_FL_BEFORE_MEOL)
3069 || (RExC_flags & PMf_MULTILINE)));
3070 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3074 r->float_substr = r->float_utf8 = NULL;
3075 SvREFCNT_dec(data.longest_float);
3076 longest_float_length = 0;
3079 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3080 if (longest_fixed_length
3081 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3082 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3083 || (RExC_flags & PMf_MULTILINE)))) {
3086 if (SvUTF8(data.longest_fixed)) {
3087 r->anchored_utf8 = data.longest_fixed;
3088 r->anchored_substr = NULL;
3090 r->anchored_substr = data.longest_fixed;
3091 r->anchored_utf8 = NULL;
3093 r->anchored_offset = data.offset_fixed;
3094 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3095 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3096 || (RExC_flags & PMf_MULTILINE)));
3097 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3100 r->anchored_substr = r->anchored_utf8 = NULL;
3101 SvREFCNT_dec(data.longest_fixed);
3102 longest_fixed_length = 0;
3105 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3106 r->regstclass = NULL;
3107 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3109 && !(data.start_class->flags & ANYOF_EOS)
3110 && !cl_is_anything(data.start_class))
3112 const I32 n = add_data(pRExC_state, 1, "f");
3114 Newx(RExC_rx->data->data[n], 1,
3115 struct regnode_charclass_class);
3116 StructCopy(data.start_class,
3117 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3118 struct regnode_charclass_class);
3119 r->regstclass = (regnode*)RExC_rx->data->data[n];
3120 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3121 PL_regdata = r->data; /* for regprop() */
3122 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3123 regprop(sv, (regnode*)data.start_class);
3124 PerlIO_printf(Perl_debug_log,
3125 "synthetic stclass \"%s\".\n",
3126 SvPVX_const(sv));});
3129 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3130 if (longest_fixed_length > longest_float_length) {
3131 r->check_substr = r->anchored_substr;
3132 r->check_utf8 = r->anchored_utf8;
3133 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3134 if (r->reganch & ROPT_ANCH_SINGLE)
3135 r->reganch |= ROPT_NOSCAN;
3138 r->check_substr = r->float_substr;
3139 r->check_utf8 = r->float_utf8;
3140 r->check_offset_min = data.offset_float_min;
3141 r->check_offset_max = data.offset_float_max;
3143 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3144 This should be changed ASAP! */
3145 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3146 r->reganch |= RE_USE_INTUIT;
3147 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3148 r->reganch |= RE_INTUIT_TAIL;
3152 /* Several toplevels. Best we can is to set minlen. */
3154 struct regnode_charclass_class ch_class;
3157 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3158 scan = r->program + 1;
3159 cl_init(pRExC_state, &ch_class);
3160 data.start_class = &ch_class;
3161 data.last_closep = &last_close;
3162 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3163 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3164 = r->float_substr = r->float_utf8 = NULL;
3165 if (!(data.start_class->flags & ANYOF_EOS)
3166 && !cl_is_anything(data.start_class))
3168 const I32 n = add_data(pRExC_state, 1, "f");
3170 Newx(RExC_rx->data->data[n], 1,
3171 struct regnode_charclass_class);
3172 StructCopy(data.start_class,
3173 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3174 struct regnode_charclass_class);
3175 r->regstclass = (regnode*)RExC_rx->data->data[n];
3176 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3177 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3178 regprop(sv, (regnode*)data.start_class);
3179 PerlIO_printf(Perl_debug_log,
3180 "synthetic stclass \"%s\".\n",
3181 SvPVX_const(sv));});
3186 if (RExC_seen & REG_SEEN_GPOS)
3187 r->reganch |= ROPT_GPOS_SEEN;
3188 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3189 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3190 if (RExC_seen & REG_SEEN_EVAL)
3191 r->reganch |= ROPT_EVAL_SEEN;
3192 if (RExC_seen & REG_SEEN_CANY)
3193 r->reganch |= ROPT_CANY_SEEN;
3194 Newxz(r->startp, RExC_npar, I32);
3195 Newxz(r->endp, RExC_npar, I32);
3196 PL_regdata = r->data; /* for regprop() */
3197 DEBUG_COMPILE_r(regdump(r));
3202 - reg - regular expression, i.e. main body or parenthesized thing
3204 * Caller must absorb opening parenthesis.
3206 * Combining parenthesis handling with the base level of regular expression
3207 * is a trifle forced, but the need to tie the tails of the branches to what
3208 * follows makes it hard to avoid.
3211 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3212 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3215 register regnode *ret; /* Will be the head of the group. */
3216 register regnode *br;
3217 register regnode *lastbr;
3218 register regnode *ender = NULL;
3219 register I32 parno = 0;
3221 const I32 oregflags = RExC_flags;
3222 I32 have_branch = 0;
3225 /* for (?g), (?gc), and (?o) warnings; warning
3226 about (?c) will warn about (?g) -- japhy */
3228 I32 wastedflags = 0x00;
3229 const I32 wasted_o = 0x01;
3230 const I32 wasted_g = 0x02;
3231 const I32 wasted_gc = 0x02 | 0x04;
3232 const I32 wasted_c = 0x04;
3234 char * parse_start = RExC_parse; /* MJD */
3235 char * const oregcomp_parse = RExC_parse;
3238 *flagp = 0; /* Tentatively. */
3241 /* Make an OPEN node, if parenthesized. */
3243 if (*RExC_parse == '?') { /* (?...) */
3244 U32 posflags = 0, negflags = 0;
3245 U32 *flagsp = &posflags;
3247 const char * const seqstart = RExC_parse;
3250 paren = *RExC_parse++;
3251 ret = NULL; /* For look-ahead/behind. */
3253 case '<': /* (?<...) */
3254 RExC_seen |= REG_SEEN_LOOKBEHIND;
3255 if (*RExC_parse == '!')
3257 if (*RExC_parse != '=' && *RExC_parse != '!')
3260 case '=': /* (?=...) */
3261 case '!': /* (?!...) */
3262 RExC_seen_zerolen++;
3263 case ':': /* (?:...) */
3264 case '>': /* (?>...) */
3266 case '$': /* (?$...) */
3267 case '@': /* (?@...) */
3268 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3270 case '#': /* (?#...) */
3271 while (*RExC_parse && *RExC_parse != ')')
3273 if (*RExC_parse != ')')
3274 FAIL("Sequence (?#... not terminated");
3275 nextchar(pRExC_state);
3278 case 'p': /* (?p...) */
3279 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3280 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3282 case '?': /* (??...) */
3284 if (*RExC_parse != '{')
3286 paren = *RExC_parse++;
3288 case '{': /* (?{...}) */
3290 I32 count = 1, n = 0;
3292 char *s = RExC_parse;
3294 OP_4tree *sop, *rop;
3296 RExC_seen_zerolen++;
3297 RExC_seen |= REG_SEEN_EVAL;
3298 while (count && (c = *RExC_parse)) {
3299 if (c == '\\' && RExC_parse[1])
3307 if (*RExC_parse != ')')
3310 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3315 if (RExC_parse - 1 - s)
3316 sv = newSVpvn(s, RExC_parse - 1 - s);
3321 Perl_save_re_context(aTHX);
3322 rop = sv_compile_2op(sv, &sop, "re", &pad);
3323 sop->op_private |= OPpREFCOUNTED;
3324 /* re_dup will OpREFCNT_inc */
3325 OpREFCNT_set(sop, 1);
3328 n = add_data(pRExC_state, 3, "nop");
3329 RExC_rx->data->data[n] = (void*)rop;
3330 RExC_rx->data->data[n+1] = (void*)sop;
3331 RExC_rx->data->data[n+2] = (void*)pad;
3334 else { /* First pass */
3335 if (PL_reginterp_cnt < ++RExC_seen_evals
3337 /* No compiled RE interpolated, has runtime
3338 components ===> unsafe. */
3339 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3340 if (PL_tainting && PL_tainted)
3341 FAIL("Eval-group in insecure regular expression");
3342 if (IN_PERL_COMPILETIME)
3346 nextchar(pRExC_state);
3348 ret = reg_node(pRExC_state, LOGICAL);
3351 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3352 /* deal with the length of this later - MJD */
3355 ret = reganode(pRExC_state, EVAL, n);
3356 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3357 Set_Node_Offset(ret, parse_start);
3360 case '(': /* (?(?{...})...) and (?(?=...)...) */
3362 if (RExC_parse[0] == '?') { /* (?(?...)) */
3363 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3364 || RExC_parse[1] == '<'
3365 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3368 ret = reg_node(pRExC_state, LOGICAL);
3371 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3375 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3377 parno = atoi(RExC_parse++);
3379 while (isDIGIT(*RExC_parse))
3381 ret = reganode(pRExC_state, GROUPP, parno);
3383 if ((c = *nextchar(pRExC_state)) != ')')
3384 vFAIL("Switch condition not recognized");
3386 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3387 br = regbranch(pRExC_state, &flags, 1);
3389 br = reganode(pRExC_state, LONGJMP, 0);
3391 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3392 c = *nextchar(pRExC_state);
3396 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3397 regbranch(pRExC_state, &flags, 1);
3398 regtail(pRExC_state, ret, lastbr);
3401 c = *nextchar(pRExC_state);
3406 vFAIL("Switch (?(condition)... contains too many branches");
3407 ender = reg_node(pRExC_state, TAIL);
3408 regtail(pRExC_state, br, ender);
3410 regtail(pRExC_state, lastbr, ender);
3411 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3414 regtail(pRExC_state, ret, ender);
3418 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3422 RExC_parse--; /* for vFAIL to print correctly */
3423 vFAIL("Sequence (? incomplete");
3427 parse_flags: /* (?i) */
3428 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3429 /* (?g), (?gc) and (?o) are useless here
3430 and must be globally applied -- japhy */
3432 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3434 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3435 if (! (wastedflags & wflagbit) ) {
3436 wastedflags |= wflagbit;
3439 "Useless (%s%c) - %suse /%c modifier",
3440 flagsp == &negflags ? "?-" : "?",
3442 flagsp == &negflags ? "don't " : "",
3448 else if (*RExC_parse == 'c') {
3449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3450 if (! (wastedflags & wasted_c) ) {
3451 wastedflags |= wasted_gc;
3454 "Useless (%sc) - %suse /gc modifier",
3455 flagsp == &negflags ? "?-" : "?",
3456 flagsp == &negflags ? "don't " : ""
3461 else { pmflag(flagsp, *RExC_parse); }
3465 if (*RExC_parse == '-') {
3467 wastedflags = 0; /* reset so (?g-c) warns twice */
3471 RExC_flags |= posflags;
3472 RExC_flags &= ~negflags;
3473 if (*RExC_parse == ':') {
3479 if (*RExC_parse != ')') {
3481 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3483 nextchar(pRExC_state);
3491 ret = reganode(pRExC_state, OPEN, parno);
3492 Set_Node_Length(ret, 1); /* MJD */
3493 Set_Node_Offset(ret, RExC_parse); /* MJD */
3500 /* Pick up the branches, linking them together. */
3501 parse_start = RExC_parse; /* MJD */
3502 br = regbranch(pRExC_state, &flags, 1);
3503 /* branch_len = (paren != 0); */
3507 if (*RExC_parse == '|') {
3508 if (!SIZE_ONLY && RExC_extralen) {
3509 reginsert(pRExC_state, BRANCHJ, br);
3512 reginsert(pRExC_state, BRANCH, br);
3513 Set_Node_Length(br, paren != 0);
3514 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3518 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3520 else if (paren == ':') {
3521 *flagp |= flags&SIMPLE;
3523 if (open) { /* Starts with OPEN. */
3524 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3526 else if (paren != '?') /* Not Conditional */
3528 *flagp |= flags & (SPSTART | HASWIDTH);
3530 while (*RExC_parse == '|') {
3531 if (!SIZE_ONLY && RExC_extralen) {
3532 ender = reganode(pRExC_state, LONGJMP,0);
3533 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3536 RExC_extralen += 2; /* Account for LONGJMP. */
3537 nextchar(pRExC_state);
3538 br = regbranch(pRExC_state, &flags, 0);
3542 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3546 *flagp |= flags&SPSTART;
3549 if (have_branch || paren != ':') {
3550 /* Make a closing node, and hook it on the end. */
3553 ender = reg_node(pRExC_state, TAIL);
3556 ender = reganode(pRExC_state, CLOSE, parno);
3557 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3558 Set_Node_Length(ender,1); /* MJD */
3564 *flagp &= ~HASWIDTH;
3567 ender = reg_node(pRExC_state, SUCCEED);
3570 ender = reg_node(pRExC_state, END);
3573 regtail(pRExC_state, lastbr, ender);
3576 /* Hook the tails of the branches to the closing node. */
3577 for (br = ret; br != NULL; br = regnext(br)) {
3578 regoptail(pRExC_state, br, ender);
3585 static const char parens[] = "=!<,>";
3587 if (paren && (p = strchr(parens, paren))) {
3588 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3589 int flag = (p - parens) > 1;
3592 node = SUSPEND, flag = 0;
3593 reginsert(pRExC_state, node,ret);
3594 Set_Node_Cur_Length(ret);
3595 Set_Node_Offset(ret, parse_start + 1);
3597 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3601 /* Check for proper termination. */
3603 RExC_flags = oregflags;
3604 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3605 RExC_parse = oregcomp_parse;
3606 vFAIL("Unmatched (");
3609 else if (!paren && RExC_parse < RExC_end) {
3610 if (*RExC_parse == ')') {
3612 vFAIL("Unmatched )");
3615 FAIL("Junk on end of regexp"); /* "Can't happen". */
3623 - regbranch - one alternative of an | operator
3625 * Implements the concatenation operator.
3628 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3630 register regnode *ret;
3631 register regnode *chain = NULL;
3632 register regnode *latest;
3633 I32 flags = 0, c = 0;
3638 if (!SIZE_ONLY && RExC_extralen)
3639 ret = reganode(pRExC_state, BRANCHJ,0);
3641 ret = reg_node(pRExC_state, BRANCH);
3642 Set_Node_Length(ret, 1);
3646 if (!first && SIZE_ONLY)
3647 RExC_extralen += 1; /* BRANCHJ */
3649 *flagp = WORST; /* Tentatively. */
3652 nextchar(pRExC_state);
3653 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3655 latest = regpiece(pRExC_state, &flags);
3656 if (latest == NULL) {
3657 if (flags & TRYAGAIN)
3661 else if (ret == NULL)
3663 *flagp |= flags&HASWIDTH;
3664 if (chain == NULL) /* First piece. */
3665 *flagp |= flags&SPSTART;
3668 regtail(pRExC_state, chain, latest);
3673 if (chain == NULL) { /* Loop ran zero times. */
3674 chain = reg_node(pRExC_state, NOTHING);
3679 *flagp |= flags&SIMPLE;
3686 - regpiece - something followed by possible [*+?]
3688 * Note that the branching code sequences used for ? and the general cases
3689 * of * and + are somewhat optimized: they use the same NOTHING node as
3690 * both the endmarker for their branch list and the body of the last branch.
3691 * It might seem that this node could be dispensed with entirely, but the
3692 * endmarker role is not redundant.
3695 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3697 register regnode *ret;
3699 register char *next;
3701 const char * const origparse = RExC_parse;
3704 I32 max = REG_INFTY;
3707 ret = regatom(pRExC_state, &flags);
3709 if (flags & TRYAGAIN)
3716 if (op == '{' && regcurly(RExC_parse)) {
3717 parse_start = RExC_parse; /* MJD */
3718 next = RExC_parse + 1;
3720 while (isDIGIT(*next) || *next == ',') {
3729 if (*next == '}') { /* got one */
3733 min = atoi(RExC_parse);
3737 maxpos = RExC_parse;
3739 if (!max && *maxpos != '0')
3740 max = REG_INFTY; /* meaning "infinity" */
3741 else if (max >= REG_INFTY)
3742 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3744 nextchar(pRExC_state);
3747 if ((flags&SIMPLE)) {
3748 RExC_naughty += 2 + RExC_naughty / 2;
3749 reginsert(pRExC_state, CURLY, ret);
3750 Set_Node_Offset(ret, parse_start+1); /* MJD */
3751 Set_Node_Cur_Length(ret);
3754 regnode *w = reg_node(pRExC_state, WHILEM);
3757 regtail(pRExC_state, ret, w);
3758 if (!SIZE_ONLY && RExC_extralen) {
3759 reginsert(pRExC_state, LONGJMP,ret);
3760 reginsert(pRExC_state, NOTHING,ret);
3761 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3763 reginsert(pRExC_state, CURLYX,ret);
3765 Set_Node_Offset(ret, parse_start+1);
3766 Set_Node_Length(ret,
3767 op == '{' ? (RExC_parse - parse_start) : 1);
3769 if (!SIZE_ONLY && RExC_extralen)
3770 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3771 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3773 RExC_whilem_seen++, RExC_extralen += 3;
3774 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3782 if (max && max < min)
3783 vFAIL("Can't do {n,m} with n > m");
3785 ARG1_SET(ret, (U16)min);
3786 ARG2_SET(ret, (U16)max);
3798 #if 0 /* Now runtime fix should be reliable. */
3800 /* if this is reinstated, don't forget to put this back into perldiag:
3802 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3804 (F) The part of the regexp subject to either the * or + quantifier
3805 could match an empty string. The {#} shows in the regular
3806 expression about where the problem was discovered.
3810 if (!(flags&HASWIDTH) && op != '?')
3811 vFAIL("Regexp *+ operand could be empty");
3814 parse_start = RExC_parse;
3815 nextchar(pRExC_state);
3817 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3819 if (op == '*' && (flags&SIMPLE)) {
3820 reginsert(pRExC_state, STAR, ret);
3824 else if (op == '*') {
3828 else if (op == '+' && (flags&SIMPLE)) {
3829 reginsert(pRExC_state, PLUS, ret);
3833 else if (op == '+') {
3837 else if (op == '?') {
3842 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3844 "%.*s matches null string many times",
3845 RExC_parse - origparse,
3849 if (*RExC_parse == '?') {
3850 nextchar(pRExC_state);
3851 reginsert(pRExC_state, MINMOD, ret);
3852 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3854 if (ISMULT2(RExC_parse)) {
3856 vFAIL("Nested quantifiers");
3863 - regatom - the lowest level
3865 * Optimization: gobbles an entire sequence of ordinary characters so that
3866 * it can turn them into a single node, which is smaller to store and
3867 * faster to run. Backslashed characters are exceptions, each becoming a
3868 * separate node; the code is simpler that way and it's not worth fixing.
3870 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3872 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3874 register regnode *ret = NULL;
3876 char *parse_start = RExC_parse;
3878 *flagp = WORST; /* Tentatively. */
3881 switch (*RExC_parse) {
3883 RExC_seen_zerolen++;
3884 nextchar(pRExC_state);
3885 if (RExC_flags & PMf_MULTILINE)
3886 ret = reg_node(pRExC_state, MBOL);
3887 else if (RExC_flags & PMf_SINGLELINE)
3888 ret = reg_node(pRExC_state, SBOL);
3890 ret = reg_node(pRExC_state, BOL);
3891 Set_Node_Length(ret, 1); /* MJD */
3894 nextchar(pRExC_state);
3896 RExC_seen_zerolen++;
3897 if (RExC_flags & PMf_MULTILINE)
3898 ret = reg_node(pRExC_state, MEOL);
3899 else if (RExC_flags & PMf_SINGLELINE)
3900 ret = reg_node(pRExC_state, SEOL);
3902 ret = reg_node(pRExC_state, EOL);
3903 Set_Node_Length(ret, 1); /* MJD */
3906 nextchar(pRExC_state);
3907 if (RExC_flags & PMf_SINGLELINE)
3908 ret = reg_node(pRExC_state, SANY);
3910 ret = reg_node(pRExC_state, REG_ANY);
3911 *flagp |= HASWIDTH|SIMPLE;
3913 Set_Node_Length(ret, 1); /* MJD */
3917 char *oregcomp_parse = ++RExC_parse;
3918 ret = regclass(pRExC_state);
3919 if (*RExC_parse != ']') {
3920 RExC_parse = oregcomp_parse;
3921 vFAIL("Unmatched [");
3923 nextchar(pRExC_state);
3924 *flagp |= HASWIDTH|SIMPLE;
3925 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3929 nextchar(pRExC_state);
3930 ret = reg(pRExC_state, 1, &flags);
3932 if (flags & TRYAGAIN) {
3933 if (RExC_parse == RExC_end) {
3934 /* Make parent create an empty node if needed. */
3942 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3946 if (flags & TRYAGAIN) {
3950 vFAIL("Internal urp");
3951 /* Supposed to be caught earlier. */
3954 if (!regcurly(RExC_parse)) {
3963 vFAIL("Quantifier follows nothing");
3966 switch (*++RExC_parse) {
3968 RExC_seen_zerolen++;
3969 ret = reg_node(pRExC_state, SBOL);
3971 nextchar(pRExC_state);
3972 Set_Node_Length(ret, 2); /* MJD */
3975 ret = reg_node(pRExC_state, GPOS);
3976 RExC_seen |= REG_SEEN_GPOS;
3978 nextchar(pRExC_state);
3979 Set_Node_Length(ret, 2); /* MJD */
3982 ret = reg_node(pRExC_state, SEOL);
3984 RExC_seen_zerolen++; /* Do not optimize RE away */
3985 nextchar(pRExC_state);
3988 ret = reg_node(pRExC_state, EOS);
3990 RExC_seen_zerolen++; /* Do not optimize RE away */
3991 nextchar(pRExC_state);
3992 Set_Node_Length(ret, 2); /* MJD */
3995 ret = reg_node(pRExC_state, CANY);
3996 RExC_seen |= REG_SEEN_CANY;
3997 *flagp |= HASWIDTH|SIMPLE;
3998 nextchar(pRExC_state);
3999 Set_Node_Length(ret, 2); /* MJD */
4002 ret = reg_node(pRExC_state, CLUMP);
4004 nextchar(pRExC_state);
4005 Set_Node_Length(ret, 2); /* MJD */
4008 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4009 *flagp |= HASWIDTH|SIMPLE;
4010 nextchar(pRExC_state);
4011 Set_Node_Length(ret, 2); /* MJD */
4014 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4015 *flagp |= HASWIDTH|SIMPLE;
4016 nextchar(pRExC_state);
4017 Set_Node_Length(ret, 2); /* MJD */
4020 RExC_seen_zerolen++;
4021 RExC_seen |= REG_SEEN_LOOKBEHIND;
4022 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4024 nextchar(pRExC_state);
4025 Set_Node_Length(ret, 2); /* MJD */
4028 RExC_seen_zerolen++;
4029 RExC_seen |= REG_SEEN_LOOKBEHIND;
4030 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4032 nextchar(pRExC_state);
4033 Set_Node_Length(ret, 2); /* MJD */
4036 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4037 *flagp |= HASWIDTH|SIMPLE;
4038 nextchar(pRExC_state);
4039 Set_Node_Length(ret, 2); /* MJD */
4042 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4043 *flagp |= HASWIDTH|SIMPLE;
4044 nextchar(pRExC_state);
4045 Set_Node_Length(ret, 2); /* MJD */
4048 ret = reg_node(pRExC_state, DIGIT);
4049 *flagp |= HASWIDTH|SIMPLE;
4050 nextchar(pRExC_state);
4051 Set_Node_Length(ret, 2); /* MJD */
4054 ret = reg_node(pRExC_state, NDIGIT);
4055 *flagp |= HASWIDTH|SIMPLE;
4056 nextchar(pRExC_state);
4057 Set_Node_Length(ret, 2); /* MJD */
4062 char* oldregxend = RExC_end;
4063 char* parse_start = RExC_parse - 2;
4065 if (RExC_parse[1] == '{') {
4066 /* a lovely hack--pretend we saw [\pX] instead */
4067 RExC_end = strchr(RExC_parse, '}');
4069 U8 c = (U8)*RExC_parse;
4071 RExC_end = oldregxend;
4072 vFAIL2("Missing right brace on \\%c{}", c);
4077 RExC_end = RExC_parse + 2;
4078 if (RExC_end > oldregxend)
4079 RExC_end = oldregxend;
4083 ret = regclass(pRExC_state);
4085 RExC_end = oldregxend;
4088 Set_Node_Offset(ret, parse_start + 2);
4089 Set_Node_Cur_Length(ret);
4090 nextchar(pRExC_state);
4091 *flagp |= HASWIDTH|SIMPLE;
4104 case '1': case '2': case '3': case '4':
4105 case '5': case '6': case '7': case '8': case '9':
4107 const I32 num = atoi(RExC_parse);
4109 if (num > 9 && num >= RExC_npar)
4112 char * parse_start = RExC_parse - 1; /* MJD */
4113 while (isDIGIT(*RExC_parse))
4116 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4117 vFAIL("Reference to nonexistent group");
4119 ret = reganode(pRExC_state,
4120 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4124 /* override incorrect value set in reganode MJD */
4125 Set_Node_Offset(ret, parse_start+1);
4126 Set_Node_Cur_Length(ret); /* MJD */
4128 nextchar(pRExC_state);
4133 if (RExC_parse >= RExC_end)
4134 FAIL("Trailing \\");
4137 /* Do not generate "unrecognized" warnings here, we fall
4138 back into the quick-grab loop below */
4145 if (RExC_flags & PMf_EXTENDED) {
4146 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4147 if (RExC_parse < RExC_end)
4153 register STRLEN len;
4158 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4160 parse_start = RExC_parse - 1;
4166 ret = reg_node(pRExC_state,
4167 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4169 for (len = 0, p = RExC_parse - 1;
4170 len < 127 && p < RExC_end;
4175 if (RExC_flags & PMf_EXTENDED)
4176 p = regwhite(p, RExC_end);
4223 ender = ASCII_TO_NATIVE('\033');
4227 ender = ASCII_TO_NATIVE('\007');
4232 char* const e = strchr(p, '}');
4236 vFAIL("Missing right brace on \\x{}");
4239 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4240 | PERL_SCAN_DISALLOW_PREFIX;
4241 STRLEN numlen = e - p - 1;
4242 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4249 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4251 ender = grok_hex(p, &numlen, &flags, NULL);
4257 ender = UCHARAT(p++);
4258 ender = toCTRL(ender);
4260 case '0': case '1': case '2': case '3':case '4':
4261 case '5': case '6': case '7': case '8':case '9':
4263 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4266 ender = grok_oct(p, &numlen, &flags, NULL);
4276 FAIL("Trailing \\");
4279 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4280 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4281 goto normal_default;
4286 if (UTF8_IS_START(*p) && UTF) {
4288 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4296 if (RExC_flags & PMf_EXTENDED)
4297 p = regwhite(p, RExC_end);
4299 /* Prime the casefolded buffer. */
4300 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4302 if (ISMULT2(p)) { /* Back off on ?+*. */
4309 /* Emit all the Unicode characters. */
4311 for (foldbuf = tmpbuf;
4313 foldlen -= numlen) {
4314 ender = utf8_to_uvchr(foldbuf, &numlen);
4316 reguni(pRExC_state, ender, s, &unilen);
4319 /* In EBCDIC the numlen
4320 * and unilen can differ. */
4322 if (numlen >= foldlen)
4326 break; /* "Can't happen." */
4330 reguni(pRExC_state, ender, s, &unilen);
4339 REGC((char)ender, s++);
4347 /* Emit all the Unicode characters. */
4349 for (foldbuf = tmpbuf;
4351 foldlen -= numlen) {
4352 ender = utf8_to_uvchr(foldbuf, &numlen);
4354 reguni(pRExC_state, ender, s, &unilen);
4357 /* In EBCDIC the numlen
4358 * and unilen can differ. */
4360 if (numlen >= foldlen)
4368 reguni(pRExC_state, ender, s, &unilen);
4377 REGC((char)ender, s++);
4381 Set_Node_Cur_Length(ret); /* MJD */
4382 nextchar(pRExC_state);
4384 /* len is STRLEN which is unsigned, need to copy to signed */
4387 vFAIL("Internal disaster");
4391 if (len == 1 && UNI_IS_INVARIANT(ender))
4396 RExC_size += STR_SZ(len);
4398 RExC_emit += STR_SZ(len);
4403 /* If the encoding pragma is in effect recode the text of
4404 * any EXACT-kind nodes. */
4405 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4406 STRLEN oldlen = STR_LEN(ret);
4407 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4411 if (sv_utf8_downgrade(sv, TRUE)) {
4412 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4413 const STRLEN newlen = SvCUR(sv);
4418 GET_RE_DEBUG_FLAGS_DECL;
4419 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4420 (int)oldlen, STRING(ret),
4422 Copy(s, STRING(ret), newlen, char);
4423 STR_LEN(ret) += newlen - oldlen;
4424 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4426 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4434 S_regwhite(pTHX_ char *p, const char *e)
4439 else if (*p == '#') {
4442 } while (p < e && *p != '\n');
4450 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4451 Character classes ([:foo:]) can also be negated ([:^foo:]).
4452 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4453 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4454 but trigger failures because they are currently unimplemented. */
4456 #define POSIXCC_DONE(c) ((c) == ':')
4457 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4458 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4461 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4463 I32 namedclass = OOB_NAMEDCLASS;
4465 if (value == '[' && RExC_parse + 1 < RExC_end &&
4466 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4467 POSIXCC(UCHARAT(RExC_parse))) {
4468 const char c = UCHARAT(RExC_parse);
4469 char* s = RExC_parse++;
4471 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4473 if (RExC_parse == RExC_end)
4474 /* Grandfather lone [:, [=, [. */
4477 const char* t = RExC_parse++; /* skip over the c */
4478 const char *posixcc;
4482 if (UCHARAT(RExC_parse) == ']') {
4483 RExC_parse++; /* skip over the ending ] */
4486 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4487 const I32 skip = t - posixcc;
4489 /* Initially switch on the length of the name. */
4492 if (memEQ(posixcc, "word", 4)) {
4493 /* this is not POSIX, this is the Perl \w */;
4495 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4499 /* Names all of length 5. */
4500 /* alnum alpha ascii blank cntrl digit graph lower
4501 print punct space upper */
4502 /* Offset 4 gives the best switch position. */
4503 switch (posixcc[4]) {
4505 if (memEQ(posixcc, "alph", 4)) {
4508 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4512 if (memEQ(posixcc, "spac", 4)) {
4515 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4519 if (memEQ(posixcc, "grap", 4)) {
4522 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4526 if (memEQ(posixcc, "asci", 4)) {
4529 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4533 if (memEQ(posixcc, "blan", 4)) {
4536 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4540 if (memEQ(posixcc, "cntr", 4)) {
4543 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4547 if (memEQ(posixcc, "alnu", 4)) {
4550 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4554 if (memEQ(posixcc, "lowe", 4)) {
4557 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4559 if (memEQ(posixcc, "uppe", 4)) {
4562 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4566 if (memEQ(posixcc, "digi", 4)) {
4569 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4571 if (memEQ(posixcc, "prin", 4)) {
4574 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4576 if (memEQ(posixcc, "punc", 4)) {
4579 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4585 if (memEQ(posixcc, "xdigit", 6)) {
4587 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4592 if (namedclass == OOB_NAMEDCLASS)
4594 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4597 assert (posixcc[skip] == ':');
4598 assert (posixcc[skip+1] == ']');
4599 } else if (!SIZE_ONLY) {
4600 /* [[=foo=]] and [[.foo.]] are still future. */
4602 /* adjust RExC_parse so the warning shows after
4604 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4606 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4609 /* Maternal grandfather:
4610 * "[:" ending in ":" but not in ":]" */
4620 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4622 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4623 const char *s = RExC_parse;
4624 const char c = *s++;
4626 while(*s && isALNUM(*s))
4628 if (*s && c == *s && s[1] == ']') {
4629 if (ckWARN(WARN_REGEXP))
4631 "POSIX syntax [%c %c] belongs inside character classes",
4634 /* [[=foo=]] and [[.foo.]] are still future. */
4635 if (POSIXCC_NOTYET(c)) {
4636 /* adjust RExC_parse so the error shows after
4638 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4640 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4647 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4650 register UV nextvalue;
4651 register IV prevvalue = OOB_UNICODE;
4652 register IV range = 0;
4653 register regnode *ret;
4656 char *rangebegin = NULL;
4657 bool need_class = 0;
4661 bool optimize_invert = TRUE;
4662 AV* unicode_alternate = NULL;
4664 UV literal_endpoint = 0;
4667 ret = reganode(pRExC_state, ANYOF, 0);
4670 ANYOF_FLAGS(ret) = 0;
4672 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4676 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4680 RExC_size += ANYOF_SKIP;
4682 RExC_emit += ANYOF_SKIP;
4684 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4686 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4687 ANYOF_BITMAP_ZERO(ret);
4688 listsv = newSVpvs("# comment\n");
4691 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4693 if (!SIZE_ONLY && POSIXCC(nextvalue))
4694 checkposixcc(pRExC_state);
4696 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4697 if (UCHARAT(RExC_parse) == ']')
4700 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4704 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4707 rangebegin = RExC_parse;
4709 value = utf8n_to_uvchr((U8*)RExC_parse,
4710 RExC_end - RExC_parse,
4712 RExC_parse += numlen;
4715 value = UCHARAT(RExC_parse++);
4716 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4717 if (value == '[' && POSIXCC(nextvalue))
4718 namedclass = regpposixcc(pRExC_state, value);
4719 else if (value == '\\') {
4721 value = utf8n_to_uvchr((U8*)RExC_parse,
4722 RExC_end - RExC_parse,
4724 RExC_parse += numlen;
4727 value = UCHARAT(RExC_parse++);
4728 /* Some compilers cannot handle switching on 64-bit integer
4729 * values, therefore value cannot be an UV. Yes, this will
4730 * be a problem later if we want switch on Unicode.
4731 * A similar issue a little bit later when switching on
4732 * namedclass. --jhi */
4733 switch ((I32)value) {
4734 case 'w': namedclass = ANYOF_ALNUM; break;
4735 case 'W': namedclass = ANYOF_NALNUM; break;
4736 case 's': namedclass = ANYOF_SPACE; break;
4737 case 'S': namedclass = ANYOF_NSPACE; break;
4738 case 'd': namedclass = ANYOF_DIGIT; break;
4739 case 'D': namedclass = ANYOF_NDIGIT; break;
4742 if (RExC_parse >= RExC_end)
4743 vFAIL2("Empty \\%c{}", (U8)value);
4744 if (*RExC_parse == '{') {
4745 const U8 c = (U8)value;
4746 e = strchr(RExC_parse++, '}');
4748 vFAIL2("Missing right brace on \\%c{}", c);
4749 while (isSPACE(UCHARAT(RExC_parse)))
4751 if (e == RExC_parse)
4752 vFAIL2("Empty \\%c{}", c);
4754 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4762 if (UCHARAT(RExC_parse) == '^') {
4765 value = value == 'p' ? 'P' : 'p'; /* toggle */
4766 while (isSPACE(UCHARAT(RExC_parse))) {
4772 Perl_sv_catpvf(aTHX_ listsv,
4773 "+utf8::%.*s\n", (int)n, RExC_parse);
4775 Perl_sv_catpvf(aTHX_ listsv,
4776 "!utf8::%.*s\n", (int)n, RExC_parse);
4779 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4780 namedclass = ANYOF_MAX; /* no official name, but it's named */
4782 case 'n': value = '\n'; break;
4783 case 'r': value = '\r'; break;
4784 case 't': value = '\t'; break;
4785 case 'f': value = '\f'; break;
4786 case 'b': value = '\b'; break;
4787 case 'e': value = ASCII_TO_NATIVE('\033');break;
4788 case 'a': value = ASCII_TO_NATIVE('\007');break;
4790 if (*RExC_parse == '{') {
4791 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4792 | PERL_SCAN_DISALLOW_PREFIX;
4793 e = strchr(RExC_parse++, '}');
4795 vFAIL("Missing right brace on \\x{}");
4797 numlen = e - RExC_parse;
4798 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4802 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4804 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4805 RExC_parse += numlen;
4809 value = UCHARAT(RExC_parse++);
4810 value = toCTRL(value);
4812 case '0': case '1': case '2': case '3': case '4':
4813 case '5': case '6': case '7': case '8': case '9':
4817 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4818 RExC_parse += numlen;
4822 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4824 "Unrecognized escape \\%c in character class passed through",
4828 } /* end of \blah */
4834 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4836 if (!SIZE_ONLY && !need_class)
4837 ANYOF_CLASS_ZERO(ret);
4841 /* a bad range like a-\d, a-[:digit:] ? */
4844 if (ckWARN(WARN_REGEXP))
4846 "False [] range \"%*.*s\"",
4847 RExC_parse - rangebegin,
4848 RExC_parse - rangebegin,
4850 if (prevvalue < 256) {
4851 ANYOF_BITMAP_SET(ret, prevvalue);
4852 ANYOF_BITMAP_SET(ret, '-');
4855 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4856 Perl_sv_catpvf(aTHX_ listsv,
4857 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4861 range = 0; /* this was not a true range */
4865 const char *what = NULL;
4868 if (namedclass > OOB_NAMEDCLASS)
4869 optimize_invert = FALSE;
4870 /* Possible truncation here but in some 64-bit environments
4871 * the compiler gets heartburn about switch on 64-bit values.
4872 * A similar issue a little earlier when switching on value.
4874 switch ((I32)namedclass) {
4877 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4879 for (value = 0; value < 256; value++)
4881 ANYOF_BITMAP_SET(ret, value);
4888 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4890 for (value = 0; value < 256; value++)
4891 if (!isALNUM(value))
4892 ANYOF_BITMAP_SET(ret, value);
4899 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4901 for (value = 0; value < 256; value++)
4902 if (isALNUMC(value))
4903 ANYOF_BITMAP_SET(ret, value);
4910 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4912 for (value = 0; value < 256; value++)
4913 if (!isALNUMC(value))
4914 ANYOF_BITMAP_SET(ret, value);
4921 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4923 for (value = 0; value < 256; value++)
4925 ANYOF_BITMAP_SET(ret, value);
4932 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4934 for (value = 0; value < 256; value++)
4935 if (!isALPHA(value))
4936 ANYOF_BITMAP_SET(ret, value);
4943 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4946 for (value = 0; value < 128; value++)
4947 ANYOF_BITMAP_SET(ret, value);
4949 for (value = 0; value < 256; value++) {
4951 ANYOF_BITMAP_SET(ret, value);
4960 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4963 for (value = 128; value < 256; value++)
4964 ANYOF_BITMAP_SET(ret, value);
4966 for (value = 0; value < 256; value++) {
4967 if (!isASCII(value))
4968 ANYOF_BITMAP_SET(ret, value);
4977 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4979 for (value = 0; value < 256; value++)
4981 ANYOF_BITMAP_SET(ret, value);
4988 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4990 for (value = 0; value < 256; value++)
4991 if (!isBLANK(value))
4992 ANYOF_BITMAP_SET(ret, value);
4999 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5001 for (value = 0; value < 256; value++)
5003 ANYOF_BITMAP_SET(ret, value);
5010 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5012 for (value = 0; value < 256; value++)
5013 if (!isCNTRL(value))
5014 ANYOF_BITMAP_SET(ret, value);
5021 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5023 /* consecutive digits assumed */
5024 for (value = '0'; value <= '9'; value++)
5025 ANYOF_BITMAP_SET(ret, value);
5032 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5034 /* consecutive digits assumed */
5035 for (value = 0; value < '0'; value++)
5036 ANYOF_BITMAP_SET(ret, value);
5037 for (value = '9' + 1; value < 256; value++)
5038 ANYOF_BITMAP_SET(ret, value);
5045 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5047 for (value = 0; value < 256; value++)
5049 ANYOF_BITMAP_SET(ret, value);
5056 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5058 for (value = 0; value < 256; value++)
5059 if (!isGRAPH(value))
5060 ANYOF_BITMAP_SET(ret, value);
5067 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5069 for (value = 0; value < 256; value++)
5071 ANYOF_BITMAP_SET(ret, value);
5078 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5080 for (value = 0; value < 256; value++)
5081 if (!isLOWER(value))
5082 ANYOF_BITMAP_SET(ret, value);
5089 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5091 for (value = 0; value < 256; value++)
5093 ANYOF_BITMAP_SET(ret, value);
5100 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5102 for (value = 0; value < 256; value++)
5103 if (!isPRINT(value))
5104 ANYOF_BITMAP_SET(ret, value);
5111 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5113 for (value = 0; value < 256; value++)
5114 if (isPSXSPC(value))
5115 ANYOF_BITMAP_SET(ret, value);
5122 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5124 for (value = 0; value < 256; value++)
5125 if (!isPSXSPC(value))
5126 ANYOF_BITMAP_SET(ret, value);
5133 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5135 for (value = 0; value < 256; value++)
5137 ANYOF_BITMAP_SET(ret, value);
5144 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5146 for (value = 0; value < 256; value++)
5147 if (!isPUNCT(value))
5148 ANYOF_BITMAP_SET(ret, value);
5155 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5157 for (value = 0; value < 256; value++)
5159 ANYOF_BITMAP_SET(ret, value);
5166 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5168 for (value = 0; value < 256; value++)
5169 if (!isSPACE(value))
5170 ANYOF_BITMAP_SET(ret, value);
5177 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5179 for (value = 0; value < 256; value++)
5181 ANYOF_BITMAP_SET(ret, value);
5188 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5190 for (value = 0; value < 256; value++)
5191 if (!isUPPER(value))
5192 ANYOF_BITMAP_SET(ret, value);
5199 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5201 for (value = 0; value < 256; value++)
5202 if (isXDIGIT(value))
5203 ANYOF_BITMAP_SET(ret, value);
5210 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5212 for (value = 0; value < 256; value++)
5213 if (!isXDIGIT(value))
5214 ANYOF_BITMAP_SET(ret, value);
5220 /* this is to handle \p and \P */
5223 vFAIL("Invalid [::] class");
5227 /* Strings such as "+utf8::isWord\n" */
5228 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5231 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5234 } /* end of namedclass \blah */
5237 if (prevvalue > (IV)value) /* b-a */ {
5238 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5239 RExC_parse - rangebegin,
5240 RExC_parse - rangebegin,
5242 range = 0; /* not a valid range */
5246 prevvalue = value; /* save the beginning of the range */
5247 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5248 RExC_parse[1] != ']') {
5251 /* a bad range like \w-, [:word:]- ? */
5252 if (namedclass > OOB_NAMEDCLASS) {
5253 if (ckWARN(WARN_REGEXP))
5255 "False [] range \"%*.*s\"",
5256 RExC_parse - rangebegin,
5257 RExC_parse - rangebegin,
5260 ANYOF_BITMAP_SET(ret, '-');
5262 range = 1; /* yeah, it's a range! */
5263 continue; /* but do it the next time */
5267 /* now is the next time */
5271 if (prevvalue < 256) {
5272 const IV ceilvalue = value < 256 ? value : 255;
5275 /* In EBCDIC [\x89-\x91] should include
5276 * the \x8e but [i-j] should not. */
5277 if (literal_endpoint == 2 &&
5278 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5279 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5281 if (isLOWER(prevvalue)) {
5282 for (i = prevvalue; i <= ceilvalue; i++)
5284 ANYOF_BITMAP_SET(ret, i);
5286 for (i = prevvalue; i <= ceilvalue; i++)
5288 ANYOF_BITMAP_SET(ret, i);
5293 for (i = prevvalue; i <= ceilvalue; i++)
5294 ANYOF_BITMAP_SET(ret, i);
5296 if (value > 255 || UTF) {
5297 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5298 const UV natvalue = NATIVE_TO_UNI(value);
5300 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5301 if (prevnatvalue < natvalue) { /* what about > ? */
5302 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5303 prevnatvalue, natvalue);
5305 else if (prevnatvalue == natvalue) {
5306 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5308 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5310 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5312 /* If folding and foldable and a single
5313 * character, insert also the folded version
5314 * to the charclass. */
5316 if (foldlen == (STRLEN)UNISKIP(f))
5317 Perl_sv_catpvf(aTHX_ listsv,
5320 /* Any multicharacter foldings
5321 * require the following transform:
5322 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5323 * where E folds into "pq" and F folds
5324 * into "rst", all other characters
5325 * fold to single characters. We save
5326 * away these multicharacter foldings,
5327 * to be later saved as part of the
5328 * additional "s" data. */
5331 if (!unicode_alternate)
5332 unicode_alternate = newAV();
5333 sv = newSVpvn((char*)foldbuf, foldlen);
5335 av_push(unicode_alternate, sv);
5339 /* If folding and the value is one of the Greek
5340 * sigmas insert a few more sigmas to make the
5341 * folding rules of the sigmas to work right.
5342 * Note that not all the possible combinations
5343 * are handled here: some of them are handled
5344 * by the standard folding rules, and some of
5345 * them (literal or EXACTF cases) are handled
5346 * during runtime in regexec.c:S_find_byclass(). */
5347 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5348 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5349 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5350 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5351 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5353 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5354 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5355 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5360 literal_endpoint = 0;
5364 range = 0; /* this range (if it was one) is done now */
5368 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5370 RExC_size += ANYOF_CLASS_ADD_SKIP;
5372 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5375 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5377 /* If the only flag is folding (plus possibly inversion). */
5378 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5380 for (value = 0; value < 256; ++value) {
5381 if (ANYOF_BITMAP_TEST(ret, value)) {
5382 UV fold = PL_fold[value];
5385 ANYOF_BITMAP_SET(ret, fold);
5388 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5391 /* optimize inverted simple patterns (e.g. [^a-z]) */
5392 if (!SIZE_ONLY && optimize_invert &&
5393 /* If the only flag is inversion. */
5394 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5395 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5396 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5397 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5404 /* The 0th element stores the character class description
5405 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5406 * to initialize the appropriate swash (which gets stored in
5407 * the 1st element), and also useful for dumping the regnode.
5408 * The 2nd element stores the multicharacter foldings,
5409 * used later (regexec.c:S_reginclass()). */
5410 av_store(av, 0, listsv);
5411 av_store(av, 1, NULL);
5412 av_store(av, 2, (SV*)unicode_alternate);
5413 rv = newRV_noinc((SV*)av);
5414 n = add_data(pRExC_state, 1, "s");
5415 RExC_rx->data->data[n] = (void*)rv;
5423 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5425 char* retval = RExC_parse++;
5428 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5429 RExC_parse[2] == '#') {
5430 while (*RExC_parse != ')') {
5431 if (RExC_parse == RExC_end)
5432 FAIL("Sequence (?#... not terminated");
5438 if (RExC_flags & PMf_EXTENDED) {
5439 if (isSPACE(*RExC_parse)) {
5443 else if (*RExC_parse == '#') {
5444 while (RExC_parse < RExC_end)
5445 if (*RExC_parse++ == '\n') break;
5454 - reg_node - emit a node
5456 STATIC regnode * /* Location. */
5457 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5459 register regnode *ptr;
5460 regnode * const ret = RExC_emit;
5463 SIZE_ALIGN(RExC_size);
5468 NODE_ALIGN_FILL(ret);
5470 FILL_ADVANCE_NODE(ptr, op);
5471 if (RExC_offsets) { /* MJD */
5472 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5473 "reg_node", __LINE__,
5475 RExC_emit - RExC_emit_start > RExC_offsets[0]
5476 ? "Overwriting end of array!\n" : "OK",
5477 RExC_emit - RExC_emit_start,
5478 RExC_parse - RExC_start,
5480 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5489 - reganode - emit a node with an argument
5491 STATIC regnode * /* Location. */
5492 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5494 register regnode *ptr;
5495 regnode * const ret = RExC_emit;
5498 SIZE_ALIGN(RExC_size);
5503 NODE_ALIGN_FILL(ret);
5505 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5506 if (RExC_offsets) { /* MJD */
5507 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5511 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5512 "Overwriting end of array!\n" : "OK",
5513 RExC_emit - RExC_emit_start,
5514 RExC_parse - RExC_start,
5516 Set_Cur_Node_Offset;
5525 - reguni - emit (if appropriate) a Unicode character
5528 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5530 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5534 - reginsert - insert an operator in front of already-emitted operand
5536 * Means relocating the operand.
5539 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5541 register regnode *src;
5542 register regnode *dst;
5543 register regnode *place;
5544 const int offset = regarglen[(U8)op];
5546 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5549 RExC_size += NODE_STEP_REGNODE + offset;
5554 RExC_emit += NODE_STEP_REGNODE + offset;
5556 while (src > opnd) {
5557 StructCopy(--src, --dst, regnode);
5558 if (RExC_offsets) { /* MJD 20010112 */
5559 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5563 dst - RExC_emit_start > RExC_offsets[0]
5564 ? "Overwriting end of array!\n" : "OK",
5565 src - RExC_emit_start,
5566 dst - RExC_emit_start,
5568 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5569 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5574 place = opnd; /* Op node, where operand used to be. */
5575 if (RExC_offsets) { /* MJD */
5576 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5580 place - RExC_emit_start > RExC_offsets[0]
5581 ? "Overwriting end of array!\n" : "OK",
5582 place - RExC_emit_start,
5583 RExC_parse - RExC_start,
5585 Set_Node_Offset(place, RExC_parse);
5586 Set_Node_Length(place, 1);
5588 src = NEXTOPER(place);
5589 FILL_ADVANCE_NODE(place, op);
5590 Zero(src, offset, regnode);
5594 - regtail - set the next-pointer at the end of a node chain of p to val.
5597 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5599 register regnode *scan;
5604 /* Find last node. */
5607 regnode * const temp = regnext(scan);
5613 if (reg_off_by_arg[OP(scan)]) {
5614 ARG_SET(scan, val - scan);
5617 NEXT_OFF(scan) = val - scan;
5622 - regoptail - regtail on operand of first argument; nop if operandless
5625 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5627 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5628 if (p == NULL || SIZE_ONLY)
5630 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5631 regtail(pRExC_state, NEXTOPER(p), val);
5633 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5634 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5641 - regcurly - a little FSA that accepts {\d+,?\d*}
5644 S_regcurly(pTHX_ register const char *s)
5663 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5666 Perl_regdump(pTHX_ regexp *r)
5669 SV * const sv = sv_newmortal();
5671 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5673 /* Header fields of interest. */
5674 if (r->anchored_substr)
5675 PerlIO_printf(Perl_debug_log,
5676 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5678 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5679 SvPVX_const(r->anchored_substr),
5681 SvTAIL(r->anchored_substr) ? "$" : "",
5682 (IV)r->anchored_offset);
5683 else if (r->anchored_utf8)
5684 PerlIO_printf(Perl_debug_log,
5685 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5687 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5688 SvPVX_const(r->anchored_utf8),
5690 SvTAIL(r->anchored_utf8) ? "$" : "",
5691 (IV)r->anchored_offset);
5692 if (r->float_substr)
5693 PerlIO_printf(Perl_debug_log,
5694 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5696 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5697 SvPVX_const(r->float_substr),
5699 SvTAIL(r->float_substr) ? "$" : "",
5700 (IV)r->float_min_offset, (UV)r->float_max_offset);
5701 else if (r->float_utf8)
5702 PerlIO_printf(Perl_debug_log,
5703 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5705 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5706 SvPVX_const(r->float_utf8),
5708 SvTAIL(r->float_utf8) ? "$" : "",
5709 (IV)r->float_min_offset, (UV)r->float_max_offset);
5710 if (r->check_substr || r->check_utf8)
5711 PerlIO_printf(Perl_debug_log,
5712 r->check_substr == r->float_substr
5713 && r->check_utf8 == r->float_utf8
5714 ? "(checking floating" : "(checking anchored");
5715 if (r->reganch & ROPT_NOSCAN)
5716 PerlIO_printf(Perl_debug_log, " noscan");
5717 if (r->reganch & ROPT_CHECK_ALL)
5718 PerlIO_printf(Perl_debug_log, " isall");
5719 if (r->check_substr || r->check_utf8)
5720 PerlIO_printf(Perl_debug_log, ") ");
5722 if (r->regstclass) {
5723 regprop(sv, r->regstclass);
5724 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5726 if (r->reganch & ROPT_ANCH) {
5727 PerlIO_printf(Perl_debug_log, "anchored");
5728 if (r->reganch & ROPT_ANCH_BOL)
5729 PerlIO_printf(Perl_debug_log, "(BOL)");
5730 if (r->reganch & ROPT_ANCH_MBOL)
5731 PerlIO_printf(Perl_debug_log, "(MBOL)");
5732 if (r->reganch & ROPT_ANCH_SBOL)
5733 PerlIO_printf(Perl_debug_log, "(SBOL)");
5734 if (r->reganch & ROPT_ANCH_GPOS)
5735 PerlIO_printf(Perl_debug_log, "(GPOS)");
5736 PerlIO_putc(Perl_debug_log, ' ');
5738 if (r->reganch & ROPT_GPOS_SEEN)
5739 PerlIO_printf(Perl_debug_log, "GPOS ");
5740 if (r->reganch & ROPT_SKIP)
5741 PerlIO_printf(Perl_debug_log, "plus ");
5742 if (r->reganch & ROPT_IMPLICIT)
5743 PerlIO_printf(Perl_debug_log, "implicit ");
5744 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5745 if (r->reganch & ROPT_EVAL_SEEN)
5746 PerlIO_printf(Perl_debug_log, "with eval ");
5747 PerlIO_printf(Perl_debug_log, "\n");
5749 const U32 len = r->offsets[0];
5750 GET_RE_DEBUG_FLAGS_DECL;
5753 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5754 for (i = 1; i <= len; i++)
5755 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5756 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5757 PerlIO_printf(Perl_debug_log, "\n");
5762 #endif /* DEBUGGING */
5766 - regprop - printable representation of opcode
5769 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5774 sv_setpvn(sv, "", 0);
5775 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5776 /* It would be nice to FAIL() here, but this may be called from
5777 regexec.c, and it would be hard to supply pRExC_state. */
5778 Perl_croak(aTHX_ "Corrupted regexp opcode");
5779 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5781 k = PL_regkind[(U8)OP(o)];
5784 SV * const dsv = sv_2mortal(newSVpvs(""));
5785 /* Using is_utf8_string() is a crude hack but it may
5786 * be the best for now since we have no flag "this EXACTish
5787 * node was UTF-8" --jhi */
5788 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5789 const char * const s = do_utf8 ?
5790 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5791 UNI_DISPLAY_REGEX) :
5793 const int len = do_utf8 ?
5796 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5800 } else if (k == TRIE) {/*
5801 this isn't always safe, as Pl_regdata may not be for this regex yet
5802 (depending on where its called from) so its being moved to dumpuntil
5804 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5805 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5808 trie->uniquecharcount,
5811 } else if (k == CURLY) {
5812 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5813 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5814 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5816 else if (k == WHILEM && o->flags) /* Ordinal/of */
5817 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5818 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5819 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5820 else if (k == LOGICAL)
5821 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5822 else if (k == ANYOF) {
5823 int i, rangestart = -1;
5824 const U8 flags = ANYOF_FLAGS(o);
5826 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5827 static const char * const anyofs[] = {
5860 if (flags & ANYOF_LOCALE)
5861 sv_catpvs(sv, "{loc}");
5862 if (flags & ANYOF_FOLD)
5863 sv_catpvs(sv, "{i}");
5864 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5865 if (flags & ANYOF_INVERT)
5867 for (i = 0; i <= 256; i++) {
5868 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5869 if (rangestart == -1)
5871 } else if (rangestart != -1) {
5872 if (i <= rangestart + 3)
5873 for (; rangestart < i; rangestart++)
5874 put_byte(sv, rangestart);
5876 put_byte(sv, rangestart);
5878 put_byte(sv, i - 1);
5884 if (o->flags & ANYOF_CLASS)
5885 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5886 if (ANYOF_CLASS_TEST(o,i))
5887 sv_catpv(sv, anyofs[i]);
5889 if (flags & ANYOF_UNICODE)
5890 sv_catpvs(sv, "{unicode}");
5891 else if (flags & ANYOF_UNICODE_ALL)
5892 sv_catpvs(sv, "{unicode_all}");
5896 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
5900 U8 s[UTF8_MAXBYTES_CASE+1];
5902 for (i = 0; i <= 256; i++) { /* just the first 256 */
5903 uvchr_to_utf8(s, i);
5905 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5906 if (rangestart == -1)
5908 } else if (rangestart != -1) {
5909 if (i <= rangestart + 3)
5910 for (; rangestart < i; rangestart++) {
5911 const U8 * const e = uvchr_to_utf8(s,rangestart);
5913 for(p = s; p < e; p++)
5917 const U8 *e = uvchr_to_utf8(s,rangestart);
5919 for (p = s; p < e; p++)
5922 e = uvchr_to_utf8(s, i-1);
5923 for (p = s; p < e; p++)
5930 sv_catpvs(sv, "..."); /* et cetera */
5934 char *s = savesvpv(lv);
5935 char * const origs = s;
5937 while(*s && *s != '\n') s++;
5940 const char * const t = ++s;
5958 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5960 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5961 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5963 PERL_UNUSED_ARG(sv);
5965 #endif /* DEBUGGING */
5969 Perl_re_intuit_string(pTHX_ regexp *prog)
5970 { /* Assume that RE_INTUIT is set */
5971 GET_RE_DEBUG_FLAGS_DECL;
5974 const char * const s = SvPV_nolen_const(prog->check_substr
5975 ? prog->check_substr : prog->check_utf8);
5977 if (!PL_colorset) reginitcolors();
5978 PerlIO_printf(Perl_debug_log,
5979 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5981 prog->check_substr ? "" : "utf8 ",
5982 PL_colors[5],PL_colors[0],
5985 (strlen(s) > 60 ? "..." : ""));
5988 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5992 Perl_pregfree(pTHX_ struct regexp *r)
5996 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5997 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
6001 if (!r || (--r->refcnt > 0))
6003 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6004 const char * const s = (r->reganch & ROPT_UTF8)
6005 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6006 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6007 const int len = SvCUR(dsv);
6010 PerlIO_printf(Perl_debug_log,
6011 "%sFreeing REx:%s %s%*.*s%s%s\n",
6012 PL_colors[4],PL_colors[5],PL_colors[0],
6015 len > 60 ? "..." : "");
6018 /* gcov results gave these as non-null 100% of the time, so there's no
6019 optimisation in checking them before calling Safefree */
6020 Safefree(r->precomp);
6021 Safefree(r->offsets); /* 20010421 MJD */
6022 RX_MATCH_COPY_FREE(r);
6023 #ifdef PERL_OLD_COPY_ON_WRITE
6025 SvREFCNT_dec(r->saved_copy);
6028 if (r->anchored_substr)
6029 SvREFCNT_dec(r->anchored_substr);
6030 if (r->anchored_utf8)
6031 SvREFCNT_dec(r->anchored_utf8);
6032 if (r->float_substr)
6033 SvREFCNT_dec(r->float_substr);
6035 SvREFCNT_dec(r->float_utf8);
6036 Safefree(r->substrs);
6039 int n = r->data->count;
6040 PAD* new_comppad = NULL;
6045 /* If you add a ->what type here, update the comment in regcomp.h */
6046 switch (r->data->what[n]) {
6048 SvREFCNT_dec((SV*)r->data->data[n]);
6051 Safefree(r->data->data[n]);
6054 new_comppad = (AV*)r->data->data[n];
6057 if (new_comppad == NULL)
6058 Perl_croak(aTHX_ "panic: pregfree comppad");
6059 PAD_SAVE_LOCAL(old_comppad,
6060 /* Watch out for global destruction's random ordering. */
6061 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6064 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6067 op_free((OP_4tree*)r->data->data[n]);
6069 PAD_RESTORE_LOCAL(old_comppad);
6070 SvREFCNT_dec((SV*)new_comppad);
6077 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6080 refcount = --trie->refcount;
6083 Safefree(trie->charmap);
6084 if (trie->widecharmap)
6085 SvREFCNT_dec((SV*)trie->widecharmap);
6086 Safefree(trie->states);
6087 Safefree(trie->trans);
6090 SvREFCNT_dec((SV*)trie->words);
6091 if (trie->revcharmap)
6092 SvREFCNT_dec((SV*)trie->revcharmap);
6094 Safefree(r->data->data[n]); /* do this last!!!! */
6099 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6102 Safefree(r->data->what);
6105 Safefree(r->startp);
6111 - regnext - dig the "next" pointer out of a node
6114 Perl_regnext(pTHX_ register regnode *p)
6116 register I32 offset;
6118 if (p == &PL_regdummy)
6121 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6129 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6132 STRLEN l1 = strlen(pat1);
6133 STRLEN l2 = strlen(pat2);
6136 const char *message;
6142 Copy(pat1, buf, l1 , char);
6143 Copy(pat2, buf + l1, l2 , char);
6144 buf[l1 + l2] = '\n';
6145 buf[l1 + l2 + 1] = '\0';
6147 /* ANSI variant takes additional second argument */
6148 va_start(args, pat2);
6152 msv = vmess(buf, &args);
6154 message = SvPV_const(msv,l1);
6157 Copy(message, buf, l1 , char);
6158 buf[l1-1] = '\0'; /* Overwrite \n */
6159 Perl_croak(aTHX_ "%s", buf);
6162 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6165 Perl_save_re_context(pTHX)
6167 SAVEI32(PL_reg_flags); /* from regexec.c */
6169 SAVEPPTR(PL_reginput); /* String-input pointer. */
6170 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6171 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6172 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6173 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6174 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6175 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6176 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6177 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6178 PL_reg_start_tmp = 0;
6179 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6180 PL_reg_start_tmpl = 0;
6181 SAVEVPTR(PL_regdata);
6182 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6183 SAVEI32(PL_regnarrate); /* from regexec.c */
6184 SAVEVPTR(PL_regprogram); /* from regexec.c */
6185 SAVEINT(PL_regindent); /* from regexec.c */
6186 SAVEVPTR(PL_regcc); /* from regexec.c */
6187 SAVEVPTR(PL_curcop);
6188 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6189 SAVEVPTR(PL_reg_re); /* from regexec.c */
6190 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6191 SAVESPTR(PL_reg_sv); /* from regexec.c */
6192 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6193 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6194 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6195 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6196 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6197 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6198 PL_reg_oldsaved = NULL;
6199 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6200 PL_reg_oldsavedlen = 0;
6201 #ifdef PERL_OLD_COPY_ON_WRITE
6205 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6207 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6208 PL_reg_leftiter = 0;
6209 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6210 PL_reg_poscache = NULL;
6211 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6212 PL_reg_poscache_size = 0;
6213 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6214 SAVEI32(PL_regnpar); /* () count. */
6215 SAVEI32(PL_regsize); /* from regexec.c */
6217 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6219 const REGEXP * const rx = PM_GETRE(PL_curpm);
6222 for (i = 1; i <= rx->nparens; i++) {
6223 char digits[TYPE_CHARS(long)];
6224 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
6225 GV * const mgv = gv_fetchpvn_flags(digits, len, 0, SVt_PV);
6233 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6238 clear_re(pTHX_ void *r)
6240 ReREFCNT_dec((regexp *)r);
6246 S_put_byte(pTHX_ SV *sv, int c)
6248 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6249 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6250 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6251 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6253 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6258 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6260 register U8 op = EXACT; /* Arbitrary non-END op. */
6261 register regnode *next;
6263 while (op != END && (!last || node < last)) {
6264 /* While that wasn't END last time... */
6270 next = regnext(node);
6272 if (OP(node) == OPTIMIZED)
6275 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6276 (int)(2*l + 1), "", SvPVX_const(sv));
6277 if (next == NULL) /* Next ptr. */
6278 PerlIO_printf(Perl_debug_log, "(0)");
6280 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6281 (void)PerlIO_putc(Perl_debug_log, '\n');
6283 if (PL_regkind[(U8)op] == BRANCHJ) {
6284 register regnode *nnode = (OP(next) == LONGJMP
6287 if (last && nnode > last)
6289 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6291 else if (PL_regkind[(U8)op] == BRANCH) {
6292 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6294 else if ( PL_regkind[(U8)op] == TRIE ) {
6295 const I32 n = ARG(node);
6296 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6297 const I32 arry_len = av_len(trie->words)+1;
6299 PerlIO_printf(Perl_debug_log,
6300 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6304 (int)trie->charcount,
6305 trie->uniquecharcount,
6306 (IV)trie->laststate-1,
6307 node->flags ? " EVAL mode" : "");
6309 for (word_idx=0; word_idx < arry_len; word_idx++) {
6310 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6312 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6315 SvPV_nolen_const(*elem_ptr),
6320 PerlIO_printf(Perl_debug_log, "(0)\n");
6322 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6328 node = NEXTOPER(node);
6329 node += regarglen[(U8)op];
6332 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6333 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6334 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6336 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6337 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6340 else if ( op == PLUS || op == STAR) {
6341 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6343 else if (op == ANYOF) {
6344 /* arglen 1 + class block */
6345 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6346 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6347 node = NEXTOPER(node);
6349 else if (PL_regkind[(U8)op] == EXACT) {
6350 /* Literal string, where present. */
6351 node += NODE_SZ_STR(node) - 1;
6352 node = NEXTOPER(node);
6355 node = NEXTOPER(node);
6356 node += regarglen[(U8)op];
6358 if (op == CURLYX || op == OPEN)
6360 else if (op == WHILEM)
6366 #endif /* DEBUGGING */
6370 * c-indentation-style: bsd
6372 * indent-tabs-mode: t
6375 * ex: set ts=8 sts=4 sw=4 noet: