5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
289 #define FAIL2(pat,msg) STMT_START { \
290 const char *ellipses = ""; \
291 IV len = RExC_end - RExC_precomp; \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 #define Simple_vFAIL(m) STMT_START { \
309 const IV offset = RExC_parse - RExC_precomp; \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 #define vFAIL(m) STMT_START { \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
324 * Like Simple_vFAIL(), but accepts two arguments.
326 #define Simple_vFAIL2(m,a1) STMT_START { \
327 const IV offset = RExC_parse - RExC_precomp; \
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 #define vFAIL2(m,a1) STMT_START { \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
346 const IV offset = RExC_parse - RExC_precomp; \
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 #define vFAIL3(m,a1,a2) STMT_START { \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
361 * Like Simple_vFAIL(), but accepts four arguments.
363 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
364 const IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 const IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 const IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 const IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 const IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 const IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 const IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 SvSetMagicSV(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 SV * const sv = data->last_found;
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
506 data->flags &= ~SF_BEFORE_EOL;
509 /* Can match anything (initialization) */
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 cl->flags |= ANYOF_LOCALE;
520 /* Can match anything (initialization) */
522 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
536 /* Can match anything (initialization) */
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 Zero(cl, 1, struct regnode_charclass_class);
542 cl_anything(pRExC_state, cl);
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 Zero(cl, 1, struct regnode_charclass_class);
550 cl_anything(pRExC_state, cl);
552 cl->flags |= ANYOF_LOCALE;
555 /* 'And' a given class with another one. Can create false positives */
556 /* We assume that cl is not inverted */
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 const struct regnode_charclass_class *and_with)
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
589 cl->flags &= ~ANYOF_UNICODE;
592 /* 'OR' a given class with another one. Can create false positives */
593 /* We assume that cl is not inverted */
595 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
597 if (or_with->flags & ANYOF_INVERT) {
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
616 cl_anything(pRExC_state, cl);
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && (!(or_with->flags & ANYOF_FOLD)
622 || (cl->flags & ANYOF_FOLD)) ) {
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
635 cl_anything(pRExC_state, cl);
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
663 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
665 A trie is an N'ary tree where the branches are determined by digital
666 decomposition of the key. IE, at the root node you look up the 1st character and
667 follow that branch repeat until you find the end of the branches. Nodes can be
668 marked as "accepting" meaning they represent a complete word. Eg:
672 would convert into the following structure. Numbers represent states, letters
673 following numbers represent valid transitions on the letter from that state, if
674 the number is in square brackets it represents an accepting state, otherwise it
675 will be in parenthesis.
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
681 (1) +-i->(6)-+-s->[7]
683 +-s->(3)-+-h->(4)-+-e->[5]
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
687 This shows that when matching against the string 'hers' we will begin at state 1
688 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691 single traverse. We store a mapping from accepting to state to which word was
692 matched, and then when we have multiple possibilities we try to complete the
693 rest of the regex in the order in which they occured in the alternation.
695 The only prior NFA like behaviour that would be changed by the TRIE support is
696 the silent ignoring of duplicate alternations which are of the form:
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
700 Thus EVAL blocks follwing a trie may be called a different number of times with
701 and without the optimisation. With the optimisations dupes will be silently
702 ignored. This inconsistant behaviour of EVAL type nodes is well established as
703 the following demonstrates:
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
707 which prints out 'word' three times, but
709 'words'=~/(word|word|word)(?{ print $1 })S/
711 which doesnt print it out at all. This is due to other optimisations kicking in.
713 Example of what happens on a structural level:
715 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
717 1: CURLYM[1] {1,32767}(18)
728 This would be optimizable with startbranch=5, first=5, last=16, tail=16
729 and should turn into:
731 1: CURLYM[1] {1,32767}(18)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
741 Cases where tail != last would be like /(?foo|bar)baz/:
751 which would be optimizable with startbranch=1, first=1, last=7, tail=8
752 and would end up looking like:
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
764 #define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
768 tmp = newSVpvn( "", 0 ); \
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
773 av_push( trie->revcharmap, tmp ); \
776 #define TRIE_READ_CHAR STMT_START { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
800 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
805 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
816 #define TRIE_LIST_NEW(state) STMT_START { \
817 Newxz( trie->states[ state ].trans.list, \
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
824 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
827 /* first pass, loop through and scan words */
830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
835 /* we just use folder as a flag in utf8 */
836 const U8 * const folder = ( flags == EXACTF
844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
847 GET_RE_DEBUG_FLAGS_DECL;
849 Newxz( trie, 1, reg_trie_data );
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
852 Newxz( trie->charmap, 256, U16 );
854 trie->words = newAV();
855 trie->revcharmap = newAV();
859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
860 if (!SvIOK(re_trie_maxbuff)) {
861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
864 /* -- First loop and Setup --
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
889 regnode *noper = NEXTOPER( cur );
890 const U8 *uc = (U8*)STRING( noper );
891 const U8 * const e = uc + STR_LEN( noper );
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
894 const U8 *scan = (U8*)NULL;
896 for ( ; uc < e ; uc += len ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
927 (int)trie->charcount, trie->uniquecharcount )
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
936 conservative but potentially much slower representation using an array
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
955 Second Pass -- Array Of Lists Representation
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
969 STRLEN transcount = 1;
971 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
977 regnode *noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
979 const U8 * const e = uc + STR_LEN( noper );
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
987 for ( ; uc < e ; uc += len ) {
992 charid = trie->charmap[ uvc ];
994 SV** svpp=(SV**)NULL;
995 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
999 charid=(U16)SvIV( *svpp );
1008 if ( !trie->states[ state ].trans.list ) {
1009 TRIE_LIST_NEW( state );
1011 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1012 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1013 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1018 newstate = next_alloc++;
1019 TRIE_LIST_PUSH( state, charid, newstate );
1024 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1026 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1029 if ( !trie->states[ state ].wordnum ) {
1030 /* we havent inserted this word into the structure yet. */
1031 trie->states[ state ].wordnum = ++curword;
1034 /* store the word for dumping */
1035 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1036 if ( UTF ) SvUTF8_on( tmp );
1037 av_push( trie->words, tmp );
1041 /* Its a dupe. So ignore it. */
1044 } /* end second pass */
1046 trie->laststate = next_alloc;
1047 Renew( trie->states, next_alloc, reg_trie_state );
1049 DEBUG_TRIE_COMPILE_MORE_r({
1052 /* print out the table precompression. */
1054 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1055 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1057 for( state=1 ; state < next_alloc ; state ++ ) {
1060 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1061 if ( ! trie->states[ state ].wordnum ) {
1062 PerlIO_printf( Perl_debug_log, "%5s| ","");
1064 PerlIO_printf( Perl_debug_log, "W%04x| ",
1065 trie->states[ state ].wordnum
1068 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1069 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1070 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1071 SvPV_nolen_const( *tmp ),
1072 TRIE_LIST_ITEM(state,charid).forid,
1073 (UV)TRIE_LIST_ITEM(state,charid).newstate
1078 PerlIO_printf( Perl_debug_log, "\n\n" );
1081 Newxz( trie->trans, transcount ,reg_trie_trans );
1088 for( state=1 ; state < next_alloc ; state ++ ) {
1092 DEBUG_TRIE_COMPILE_MORE_r(
1093 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1097 if (trie->states[state].trans.list) {
1098 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1102 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1103 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1104 minid=TRIE_LIST_ITEM( state, idx).forid;
1105 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1106 maxid=TRIE_LIST_ITEM( state, idx).forid;
1109 if ( transcount < tp + maxid - minid + 1) {
1111 Renew( trie->trans, transcount, reg_trie_trans );
1112 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1114 base = trie->uniquecharcount + tp - minid;
1115 if ( maxid == minid ) {
1117 for ( ; zp < tp ; zp++ ) {
1118 if ( ! trie->trans[ zp ].next ) {
1119 base = trie->uniquecharcount + zp - minid;
1120 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1121 trie->trans[ zp ].check = state;
1127 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1128 trie->trans[ tp ].check = state;
1133 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1134 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1135 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1136 trie->trans[ tid ].check = state;
1138 tp += ( maxid - minid + 1 );
1140 Safefree(trie->states[ state ].trans.list);
1143 DEBUG_TRIE_COMPILE_MORE_r(
1144 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1147 trie->states[ state ].trans.base=base;
1149 trie->lasttrans = tp + 1;
1153 Second Pass -- Flat Table Representation.
1155 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1156 We know that we will need Charcount+1 trans at most to store the data
1157 (one row per char at worst case) So we preallocate both structures
1158 assuming worst case.
1160 We then construct the trie using only the .next slots of the entry
1163 We use the .check field of the first entry of the node temporarily to
1164 make compression both faster and easier by keeping track of how many non
1165 zero fields are in the node.
1167 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1170 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1171 number representing the first entry of the node, and state as a
1172 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1173 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1174 are 2 entrys per node. eg:
1182 The table is internally in the right hand, idx form. However as we also
1183 have to deal with the states array which is indexed by nodenum we have to
1184 use TRIE_NODENUM() to convert.
1188 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1190 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
1191 next_alloc = trie->uniquecharcount + 1;
1193 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1195 regnode *noper = NEXTOPER( cur );
1196 const U8 *uc = (U8*)STRING( noper );
1197 const U8 * const e = uc + STR_LEN( noper );
1199 U32 state = 1; /* required init */
1201 U16 charid = 0; /* sanity init */
1202 U32 accept_state = 0; /* sanity init */
1203 U8 *scan = (U8*)NULL; /* sanity init */
1205 STRLEN foldlen = 0; /* required init */
1206 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1209 for ( ; uc < e ; uc += len ) {
1214 charid = trie->charmap[ uvc ];
1216 SV** svpp=(SV**)NULL;
1217 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1221 charid=(U16)SvIV( *svpp );
1226 if ( !trie->trans[ state + charid ].next ) {
1227 trie->trans[ state + charid ].next = next_alloc;
1228 trie->trans[ state ].check++;
1229 next_alloc += trie->uniquecharcount;
1231 state = trie->trans[ state + charid ].next;
1233 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1235 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1238 accept_state = TRIE_NODENUM( state );
1239 if ( !trie->states[ accept_state ].wordnum ) {
1240 /* we havent inserted this word into the structure yet. */
1241 trie->states[ accept_state ].wordnum = ++curword;
1244 /* store the word for dumping */
1245 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1246 if ( UTF ) SvUTF8_on( tmp );
1247 av_push( trie->words, tmp );
1251 /* Its a dupe. So ignore it. */
1254 } /* end second pass */
1256 DEBUG_TRIE_COMPILE_MORE_r({
1258 print out the table precompression so that we can do a visual check
1259 that they are identical.
1263 PerlIO_printf( Perl_debug_log, "\nChar : " );
1265 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1266 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1268 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1272 PerlIO_printf( Perl_debug_log, "\nState+-" );
1274 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1275 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1278 PerlIO_printf( Perl_debug_log, "\n" );
1280 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1282 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1284 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1285 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1286 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1288 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1289 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1291 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1292 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1295 PerlIO_printf( Perl_debug_log, "\n\n" );
1299 * Inplace compress the table.*
1301 For sparse data sets the table constructed by the trie algorithm will
1302 be mostly 0/FAIL transitions or to put it another way mostly empty.
1303 (Note that leaf nodes will not contain any transitions.)
1305 This algorithm compresses the tables by eliminating most such
1306 transitions, at the cost of a modest bit of extra work during lookup:
1308 - Each states[] entry contains a .base field which indicates the
1309 index in the state[] array wheres its transition data is stored.
1311 - If .base is 0 there are no valid transitions from that node.
1313 - If .base is nonzero then charid is added to it to find an entry in
1316 -If trans[states[state].base+charid].check!=state then the
1317 transition is taken to be a 0/Fail transition. Thus if there are fail
1318 transitions at the front of the node then the .base offset will point
1319 somewhere inside the previous nodes data (or maybe even into a node
1320 even earlier), but the .check field determines if the transition is
1323 The following process inplace converts the table to the compressed
1324 table: We first do not compress the root node 1,and mark its all its
1325 .check pointers as 1 and set its .base pointer as 1 as well. This
1326 allows to do a DFA construction from the compressed table later, and
1327 ensures that any .base pointers we calculate later are greater than
1330 - We set 'pos' to indicate the first entry of the second node.
1332 - We then iterate over the columns of the node, finding the first and
1333 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1334 and set the .check pointers accordingly, and advance pos
1335 appropriately and repreat for the next node. Note that when we copy
1336 the next pointers we have to convert them from the original
1337 NODEIDX form to NODENUM form as the former is not valid post
1340 - If a node has no transitions used we mark its base as 0 and do not
1341 advance the pos pointer.
1343 - If a node only has one transition we use a second pointer into the
1344 structure to fill in allocated fail transitions from other states.
1345 This pointer is independent of the main pointer and scans forward
1346 looking for null transitions that are allocated to a state. When it
1347 finds one it writes the single transition into the "hole". If the
1348 pointer doesnt find one the single transition is appeneded as normal.
1350 - Once compressed we can Renew/realloc the structures to release the
1353 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1354 specifically Fig 3.47 and the associated pseudocode.
1358 const U32 laststate = TRIE_NODENUM( next_alloc );
1361 trie->laststate = laststate;
1363 for ( state = 1 ; state < laststate ; state++ ) {
1365 const U32 stateidx = TRIE_NODEIDX( state );
1366 const U32 o_used = trie->trans[ stateidx ].check;
1367 U32 used = trie->trans[ stateidx ].check;
1368 trie->trans[ stateidx ].check = 0;
1370 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1371 if ( flag || trie->trans[ stateidx + charid ].next ) {
1372 if ( trie->trans[ stateidx + charid ].next ) {
1374 for ( ; zp < pos ; zp++ ) {
1375 if ( ! trie->trans[ zp ].next ) {
1379 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1380 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1381 trie->trans[ zp ].check = state;
1382 if ( ++zp > pos ) pos = zp;
1389 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1391 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1392 trie->trans[ pos ].check = state;
1397 trie->lasttrans = pos + 1;
1398 Renew( trie->states, laststate + 1, reg_trie_state);
1399 DEBUG_TRIE_COMPILE_MORE_r(
1400 PerlIO_printf( Perl_debug_log,
1401 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1402 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1405 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1408 } /* end table compress */
1410 /* resize the trans array to remove unused space */
1411 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1413 DEBUG_TRIE_COMPILE_r({
1416 Now we print it out again, in a slightly different form as there is additional
1417 info we want to be able to see when its compressed. They are close enough for
1418 visual comparison though.
1420 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1422 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1423 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1425 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1428 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1430 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1431 PerlIO_printf( Perl_debug_log, "-----");
1432 PerlIO_printf( Perl_debug_log, "\n");
1434 for( state = 1 ; state < trie->laststate ; state++ ) {
1435 const U32 base = trie->states[ state ].trans.base;
1437 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1439 if ( trie->states[ state ].wordnum ) {
1440 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1442 PerlIO_printf( Perl_debug_log, "%6s", "" );
1445 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1450 while( ( base + ofs < trie->uniquecharcount ) ||
1451 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1452 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1455 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1457 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1458 if ( ( base + ofs >= trie->uniquecharcount ) &&
1459 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1460 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1462 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1463 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1465 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1469 PerlIO_printf( Perl_debug_log, "]");
1472 PerlIO_printf( Perl_debug_log, "\n" );
1477 /* now finally we "stitch in" the new TRIE node
1478 This means we convert either the first branch or the first Exact,
1479 depending on whether the thing following (in 'last') is a branch
1480 or not and whther first is the startbranch (ie is it a sub part of
1481 the alternation or is it the whole thing.)
1482 Assuming its a sub part we conver the EXACT otherwise we convert
1483 the whole branch sequence, including the first.
1490 if ( first == startbranch && OP( last ) != BRANCH ) {
1493 convert = NEXTOPER( first );
1494 NEXT_OFF( first ) = (U16)(last - first);
1497 OP( convert ) = TRIE + (U8)( flags - EXACT );
1498 NEXT_OFF( convert ) = (U16)(tail - convert);
1499 ARG_SET( convert, data_slot );
1501 /* tells us if we need to handle accept buffers specially */
1502 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1505 /* needed for dumping*/
1507 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1508 /* We now need to mark all of the space originally used by the
1509 branches as optimized away. This keeps the dumpuntil from
1510 throwing a wobbly as it doesnt use regnext() to traverse the
1513 while( optimize < last ) {
1514 OP( optimize ) = OPTIMIZED;
1518 } /* end node insert */
1525 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1526 * These need to be revisited when a newer toolchain becomes available.
1528 #if defined(__sparc64__) && defined(__GNUC__)
1529 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1530 # undef SPARC64_GCC_WORKAROUND
1531 # define SPARC64_GCC_WORKAROUND 1
1535 /* REx optimizer. Converts nodes into quickier variants "in place".
1536 Finds fixed substrings. */
1538 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1539 to the position after last scanned or to NULL. */
1543 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1544 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1545 /* scanp: Start here (read-write). */
1546 /* deltap: Write maxlen-minlen here. */
1547 /* last: Stop before this one. */
1549 I32 min = 0, pars = 0, code;
1550 regnode *scan = *scanp, *next;
1552 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1553 int is_inf_internal = 0; /* The studied chunk is infinite */
1554 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1555 scan_data_t data_fake;
1556 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1557 SV *re_trie_maxbuff = NULL;
1559 GET_RE_DEBUG_FLAGS_DECL;
1561 while (scan && OP(scan) != END && scan < last) {
1562 /* Peephole optimizer: */
1564 SV *mysv=sv_newmortal();
1565 regprop( mysv, scan);
1566 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1567 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1570 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1571 /* Merge several consecutive EXACTish nodes into one. */
1572 regnode *n = regnext(scan);
1575 regnode *stop = scan;
1578 next = scan + NODE_SZ_STR(scan);
1579 /* Skip NOTHING, merge EXACT*. */
1581 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1582 (stringok && (OP(n) == OP(scan))))
1584 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1585 if (OP(n) == TAIL || n > next)
1587 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1588 NEXT_OFF(scan) += NEXT_OFF(n);
1589 next = n + NODE_STEP_REGNODE;
1596 else if (stringok) {
1597 const int oldl = STR_LEN(scan);
1598 regnode *nnext = regnext(n);
1600 if (oldl + STR_LEN(n) > U8_MAX)
1602 NEXT_OFF(scan) += NEXT_OFF(n);
1603 STR_LEN(scan) += STR_LEN(n);
1604 next = n + NODE_SZ_STR(n);
1605 /* Now we can overwrite *n : */
1606 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1614 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1616 Two problematic code points in Unicode casefolding of EXACT nodes:
1618 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1619 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1625 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1626 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1628 This means that in case-insensitive matching (or "loose matching",
1629 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1630 length of the above casefolded versions) can match a target string
1631 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1632 This would rather mess up the minimum length computation.
1634 What we'll do is to look for the tail four bytes, and then peek
1635 at the preceding two bytes to see whether we need to decrease
1636 the minimum length by four (six minus two).
1638 Thanks to the design of UTF-8, there cannot be false matches:
1639 A sequence of valid UTF-8 bytes cannot be a subsequence of
1640 another valid sequence of UTF-8 bytes.
1643 char *s0 = STRING(scan), *s, *t;
1644 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1645 const char * const t0 = "\xcc\x88\xcc\x81";
1646 const char * const t1 = t0 + 3;
1649 s < s2 && (t = ninstr(s, s1, t0, t1));
1651 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1652 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1659 n = scan + NODE_SZ_STR(scan);
1661 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1672 /* Follow the next-chain of the current node and optimize
1673 away all the NOTHINGs from it. */
1674 if (OP(scan) != CURLYX) {
1675 const int max = (reg_off_by_arg[OP(scan)]
1677 /* I32 may be smaller than U16 on CRAYs! */
1678 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1679 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1683 /* Skip NOTHING and LONGJMP. */
1684 while ((n = regnext(n))
1685 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1686 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1687 && off + noff < max)
1689 if (reg_off_by_arg[OP(scan)])
1692 NEXT_OFF(scan) = off;
1695 /* The principal pseudo-switch. Cannot be a switch, since we
1696 look into several different things. */
1697 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1698 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1699 next = regnext(scan);
1701 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1703 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1704 I32 max1 = 0, min1 = I32_MAX, num = 0;
1705 struct regnode_charclass_class accum;
1706 regnode *startbranch=scan;
1708 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1709 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1710 if (flags & SCF_DO_STCLASS)
1711 cl_init_zero(pRExC_state, &accum);
1713 while (OP(scan) == code) {
1714 I32 deltanext, minnext, f = 0, fake;
1715 struct regnode_charclass_class this_class;
1718 data_fake.flags = 0;
1720 data_fake.whilem_c = data->whilem_c;
1721 data_fake.last_closep = data->last_closep;
1724 data_fake.last_closep = &fake;
1725 next = regnext(scan);
1726 scan = NEXTOPER(scan);
1728 scan = NEXTOPER(scan);
1729 if (flags & SCF_DO_STCLASS) {
1730 cl_init(pRExC_state, &this_class);
1731 data_fake.start_class = &this_class;
1732 f = SCF_DO_STCLASS_AND;
1734 if (flags & SCF_WHILEM_VISITED_POS)
1735 f |= SCF_WHILEM_VISITED_POS;
1737 /* we suppose the run is continuous, last=next...*/
1738 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1739 next, &data_fake, f,depth+1);
1742 if (max1 < minnext + deltanext)
1743 max1 = minnext + deltanext;
1744 if (deltanext == I32_MAX)
1745 is_inf = is_inf_internal = 1;
1747 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1749 if (data && (data_fake.flags & SF_HAS_EVAL))
1750 data->flags |= SF_HAS_EVAL;
1752 data->whilem_c = data_fake.whilem_c;
1753 if (flags & SCF_DO_STCLASS)
1754 cl_or(pRExC_state, &accum, &this_class);
1755 if (code == SUSPEND)
1758 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1760 if (flags & SCF_DO_SUBSTR) {
1761 data->pos_min += min1;
1762 data->pos_delta += max1 - min1;
1763 if (max1 != min1 || is_inf)
1764 data->longest = &(data->longest_float);
1767 delta += max1 - min1;
1768 if (flags & SCF_DO_STCLASS_OR) {
1769 cl_or(pRExC_state, data->start_class, &accum);
1771 cl_and(data->start_class, &and_with);
1772 flags &= ~SCF_DO_STCLASS;
1775 else if (flags & SCF_DO_STCLASS_AND) {
1777 cl_and(data->start_class, &accum);
1778 flags &= ~SCF_DO_STCLASS;
1781 /* Switch to OR mode: cache the old value of
1782 * data->start_class */
1783 StructCopy(data->start_class, &and_with,
1784 struct regnode_charclass_class);
1785 flags &= ~SCF_DO_STCLASS_AND;
1786 StructCopy(&accum, data->start_class,
1787 struct regnode_charclass_class);
1788 flags |= SCF_DO_STCLASS_OR;
1789 data->start_class->flags |= ANYOF_EOS;
1795 Assuming this was/is a branch we are dealing with: 'scan' now
1796 points at the item that follows the branch sequence, whatever
1797 it is. We now start at the beginning of the sequence and look
1803 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1805 If we can find such a subseqence we need to turn the first
1806 element into a trie and then add the subsequent branch exact
1807 strings to the trie.
1811 1. patterns where the whole set of branch can be converted to a trie,
1813 2. patterns where only a subset of the alternations can be
1814 converted to a trie.
1816 In case 1 we can replace the whole set with a single regop
1817 for the trie. In case 2 we need to keep the start and end
1820 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1821 becomes BRANCH TRIE; BRANCH X;
1823 Hypthetically when we know the regex isnt anchored we can
1824 turn a case 1 into a DFA and let it rip... Every time it finds a match
1825 it would just call its tail, no WHILEM/CURLY needed.
1829 if (!re_trie_maxbuff) {
1830 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1831 if (!SvIOK(re_trie_maxbuff))
1832 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1834 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1836 regnode *first = (regnode *)NULL;
1837 regnode *last = (regnode *)NULL;
1838 regnode *tail = scan;
1843 SV *mysv = sv_newmortal(); /* for dumping */
1845 /* var tail is used because there may be a TAIL
1846 regop in the way. Ie, the exacts will point to the
1847 thing following the TAIL, but the last branch will
1848 point at the TAIL. So we advance tail. If we
1849 have nested (?:) we may have to move through several
1853 while ( OP( tail ) == TAIL ) {
1854 /* this is the TAIL generated by (?:) */
1855 tail = regnext( tail );
1859 regprop( mysv, tail );
1860 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1861 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1862 (RExC_seen_evals) ? "[EVAL]" : ""
1867 step through the branches, cur represents each
1868 branch, noper is the first thing to be matched
1869 as part of that branch and noper_next is the
1870 regnext() of that node. if noper is an EXACT
1871 and noper_next is the same as scan (our current
1872 position in the regex) then the EXACT branch is
1873 a possible optimization target. Once we have
1874 two or more consequetive such branches we can
1875 create a trie of the EXACT's contents and stich
1876 it in place. If the sequence represents all of
1877 the branches we eliminate the whole thing and
1878 replace it with a single TRIE. If it is a
1879 subsequence then we need to stitch it in. This
1880 means the first branch has to remain, and needs
1881 to be repointed at the item on the branch chain
1882 following the last branch optimized. This could
1883 be either a BRANCH, in which case the
1884 subsequence is internal, or it could be the
1885 item following the branch sequence in which
1886 case the subsequence is at the end.
1890 /* dont use tail as the end marker for this traverse */
1891 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1892 regnode * const noper = NEXTOPER( cur );
1893 regnode * const noper_next = regnext( noper );
1896 regprop( mysv, cur);
1897 PerlIO_printf( Perl_debug_log, "%*s%s",
1898 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1900 regprop( mysv, noper);
1901 PerlIO_printf( Perl_debug_log, " -> %s",
1902 SvPV_nolen_const(mysv));
1905 regprop( mysv, noper_next );
1906 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1907 SvPV_nolen_const(mysv));
1909 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1912 if ( ( first ? OP( noper ) == optype
1913 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1914 && noper_next == tail && count<U16_MAX)
1919 optype = OP( noper );
1923 regprop( mysv, first);
1924 PerlIO_printf( Perl_debug_log, "%*s%s",
1925 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1926 regprop( mysv, NEXTOPER(first) );
1927 PerlIO_printf( Perl_debug_log, " -> %s\n",
1928 SvPV_nolen_const( mysv ) );
1933 regprop( mysv, cur);
1934 PerlIO_printf( Perl_debug_log, "%*s%s",
1935 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1936 regprop( mysv, noper );
1937 PerlIO_printf( Perl_debug_log, " -> %s\n",
1938 SvPV_nolen_const( mysv ) );
1944 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1945 (int)depth * 2 + 2, "E:", "**END**" );
1947 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1949 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1950 && noper_next == tail )
1954 optype = OP( noper );
1964 regprop( mysv, cur);
1965 PerlIO_printf( Perl_debug_log,
1966 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1967 " ", SvPV_nolen_const( mysv ), first, last, cur);
1972 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1973 (int)depth * 2 + 2, "E:", "==END==" );
1975 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1980 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1981 scan = NEXTOPER(NEXTOPER(scan));
1982 } else /* single branch is optimized. */
1983 scan = NEXTOPER(scan);
1986 else if (OP(scan) == EXACT) {
1987 I32 l = STR_LEN(scan);
1988 UV uc = *((U8*)STRING(scan));
1990 const U8 * const s = (U8*)STRING(scan);
1991 l = utf8_length(s, s + l);
1992 uc = utf8_to_uvchr(s, NULL);
1995 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1996 /* The code below prefers earlier match for fixed
1997 offset, later match for variable offset. */
1998 if (data->last_end == -1) { /* Update the start info. */
1999 data->last_start_min = data->pos_min;
2000 data->last_start_max = is_inf
2001 ? I32_MAX : data->pos_min + data->pos_delta;
2003 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2005 SV * const sv = data->last_found;
2006 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2007 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2008 if (mg && mg->mg_len >= 0)
2009 mg->mg_len += utf8_length((U8*)STRING(scan),
2010 (U8*)STRING(scan)+STR_LEN(scan));
2013 SvUTF8_on(data->last_found);
2014 data->last_end = data->pos_min + l;
2015 data->pos_min += l; /* As in the first entry. */
2016 data->flags &= ~SF_BEFORE_EOL;
2018 if (flags & SCF_DO_STCLASS_AND) {
2019 /* Check whether it is compatible with what we know already! */
2023 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2024 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2025 && (!(data->start_class->flags & ANYOF_FOLD)
2026 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2029 ANYOF_CLASS_ZERO(data->start_class);
2030 ANYOF_BITMAP_ZERO(data->start_class);
2032 ANYOF_BITMAP_SET(data->start_class, uc);
2033 data->start_class->flags &= ~ANYOF_EOS;
2035 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2037 else if (flags & SCF_DO_STCLASS_OR) {
2038 /* false positive possible if the class is case-folded */
2040 ANYOF_BITMAP_SET(data->start_class, uc);
2042 data->start_class->flags |= ANYOF_UNICODE_ALL;
2043 data->start_class->flags &= ~ANYOF_EOS;
2044 cl_and(data->start_class, &and_with);
2046 flags &= ~SCF_DO_STCLASS;
2048 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2049 I32 l = STR_LEN(scan);
2050 UV uc = *((U8*)STRING(scan));
2052 /* Search for fixed substrings supports EXACT only. */
2053 if (flags & SCF_DO_SUBSTR)
2054 scan_commit(pRExC_state, data);
2056 U8 *s = (U8 *)STRING(scan);
2057 l = utf8_length(s, s + l);
2058 uc = utf8_to_uvchr(s, NULL);
2061 if (data && (flags & SCF_DO_SUBSTR))
2063 if (flags & SCF_DO_STCLASS_AND) {
2064 /* Check whether it is compatible with what we know already! */
2068 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2069 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2070 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2072 ANYOF_CLASS_ZERO(data->start_class);
2073 ANYOF_BITMAP_ZERO(data->start_class);
2075 ANYOF_BITMAP_SET(data->start_class, uc);
2076 data->start_class->flags &= ~ANYOF_EOS;
2077 data->start_class->flags |= ANYOF_FOLD;
2078 if (OP(scan) == EXACTFL)
2079 data->start_class->flags |= ANYOF_LOCALE;
2082 else if (flags & SCF_DO_STCLASS_OR) {
2083 if (data->start_class->flags & ANYOF_FOLD) {
2084 /* false positive possible if the class is case-folded.
2085 Assume that the locale settings are the same... */
2087 ANYOF_BITMAP_SET(data->start_class, uc);
2088 data->start_class->flags &= ~ANYOF_EOS;
2090 cl_and(data->start_class, &and_with);
2092 flags &= ~SCF_DO_STCLASS;
2094 else if (strchr((const char*)PL_varies,OP(scan))) {
2095 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2096 I32 f = flags, pos_before = 0;
2097 regnode *oscan = scan;
2098 struct regnode_charclass_class this_class;
2099 struct regnode_charclass_class *oclass = NULL;
2100 I32 next_is_eval = 0;
2102 switch (PL_regkind[(U8)OP(scan)]) {
2103 case WHILEM: /* End of (?:...)* . */
2104 scan = NEXTOPER(scan);
2107 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2108 next = NEXTOPER(scan);
2109 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2111 maxcount = REG_INFTY;
2112 next = regnext(scan);
2113 scan = NEXTOPER(scan);
2117 if (flags & SCF_DO_SUBSTR)
2122 if (flags & SCF_DO_STCLASS) {
2124 maxcount = REG_INFTY;
2125 next = regnext(scan);
2126 scan = NEXTOPER(scan);
2129 is_inf = is_inf_internal = 1;
2130 scan = regnext(scan);
2131 if (flags & SCF_DO_SUBSTR) {
2132 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2133 data->longest = &(data->longest_float);
2135 goto optimize_curly_tail;
2137 mincount = ARG1(scan);
2138 maxcount = ARG2(scan);
2139 next = regnext(scan);
2140 if (OP(scan) == CURLYX) {
2141 I32 lp = (data ? *(data->last_closep) : 0);
2142 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2144 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2145 next_is_eval = (OP(scan) == EVAL);
2147 if (flags & SCF_DO_SUBSTR) {
2148 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2149 pos_before = data->pos_min;
2153 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2155 data->flags |= SF_IS_INF;
2157 if (flags & SCF_DO_STCLASS) {
2158 cl_init(pRExC_state, &this_class);
2159 oclass = data->start_class;
2160 data->start_class = &this_class;
2161 f |= SCF_DO_STCLASS_AND;
2162 f &= ~SCF_DO_STCLASS_OR;
2164 /* These are the cases when once a subexpression
2165 fails at a particular position, it cannot succeed
2166 even after backtracking at the enclosing scope.
2168 XXXX what if minimal match and we are at the
2169 initial run of {n,m}? */
2170 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2171 f &= ~SCF_WHILEM_VISITED_POS;
2173 /* This will finish on WHILEM, setting scan, or on NULL: */
2174 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2176 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2178 if (flags & SCF_DO_STCLASS)
2179 data->start_class = oclass;
2180 if (mincount == 0 || minnext == 0) {
2181 if (flags & SCF_DO_STCLASS_OR) {
2182 cl_or(pRExC_state, data->start_class, &this_class);
2184 else if (flags & SCF_DO_STCLASS_AND) {
2185 /* Switch to OR mode: cache the old value of
2186 * data->start_class */
2187 StructCopy(data->start_class, &and_with,
2188 struct regnode_charclass_class);
2189 flags &= ~SCF_DO_STCLASS_AND;
2190 StructCopy(&this_class, data->start_class,
2191 struct regnode_charclass_class);
2192 flags |= SCF_DO_STCLASS_OR;
2193 data->start_class->flags |= ANYOF_EOS;
2195 } else { /* Non-zero len */
2196 if (flags & SCF_DO_STCLASS_OR) {
2197 cl_or(pRExC_state, data->start_class, &this_class);
2198 cl_and(data->start_class, &and_with);
2200 else if (flags & SCF_DO_STCLASS_AND)
2201 cl_and(data->start_class, &this_class);
2202 flags &= ~SCF_DO_STCLASS;
2204 if (!scan) /* It was not CURLYX, but CURLY. */
2206 if ( /* ? quantifier ok, except for (?{ ... }) */
2207 (next_is_eval || !(mincount == 0 && maxcount == 1))
2208 && (minnext == 0) && (deltanext == 0)
2209 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2210 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2211 && ckWARN(WARN_REGEXP))
2214 "Quantifier unexpected on zero-length expression");
2217 min += minnext * mincount;
2218 is_inf_internal |= ((maxcount == REG_INFTY
2219 && (minnext + deltanext) > 0)
2220 || deltanext == I32_MAX);
2221 is_inf |= is_inf_internal;
2222 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2224 /* Try powerful optimization CURLYX => CURLYN. */
2225 if ( OP(oscan) == CURLYX && data
2226 && data->flags & SF_IN_PAR
2227 && !(data->flags & SF_HAS_EVAL)
2228 && !deltanext && minnext == 1 ) {
2229 /* Try to optimize to CURLYN. */
2230 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2231 regnode *nxt1 = nxt;
2238 if (!strchr((const char*)PL_simple,OP(nxt))
2239 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2240 && STR_LEN(nxt) == 1))
2246 if (OP(nxt) != CLOSE)
2248 /* Now we know that nxt2 is the only contents: */
2249 oscan->flags = (U8)ARG(nxt);
2251 OP(nxt1) = NOTHING; /* was OPEN. */
2253 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2254 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2255 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2256 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2257 OP(nxt + 1) = OPTIMIZED; /* was count. */
2258 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2263 /* Try optimization CURLYX => CURLYM. */
2264 if ( OP(oscan) == CURLYX && data
2265 && !(data->flags & SF_HAS_PAR)
2266 && !(data->flags & SF_HAS_EVAL)
2267 && !deltanext /* atom is fixed width */
2268 && minnext != 0 /* CURLYM can't handle zero width */
2270 /* XXXX How to optimize if data == 0? */
2271 /* Optimize to a simpler form. */
2272 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2276 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2277 && (OP(nxt2) != WHILEM))
2279 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2280 /* Need to optimize away parenths. */
2281 if (data->flags & SF_IN_PAR) {
2282 /* Set the parenth number. */
2283 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2285 if (OP(nxt) != CLOSE)
2286 FAIL("Panic opt close");
2287 oscan->flags = (U8)ARG(nxt);
2288 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2289 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2291 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2292 OP(nxt + 1) = OPTIMIZED; /* was count. */
2293 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2294 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2297 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2298 regnode *nnxt = regnext(nxt1);
2301 if (reg_off_by_arg[OP(nxt1)])
2302 ARG_SET(nxt1, nxt2 - nxt1);
2303 else if (nxt2 - nxt1 < U16_MAX)
2304 NEXT_OFF(nxt1) = nxt2 - nxt1;
2306 OP(nxt) = NOTHING; /* Cannot beautify */
2311 /* Optimize again: */
2312 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2318 else if ((OP(oscan) == CURLYX)
2319 && (flags & SCF_WHILEM_VISITED_POS)
2320 /* See the comment on a similar expression above.
2321 However, this time it not a subexpression
2322 we care about, but the expression itself. */
2323 && (maxcount == REG_INFTY)
2324 && data && ++data->whilem_c < 16) {
2325 /* This stays as CURLYX, we can put the count/of pair. */
2326 /* Find WHILEM (as in regexec.c) */
2327 regnode *nxt = oscan + NEXT_OFF(oscan);
2329 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2331 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2332 | (RExC_whilem_seen << 4)); /* On WHILEM */
2334 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2336 if (flags & SCF_DO_SUBSTR) {
2337 SV *last_str = Nullsv;
2338 int counted = mincount != 0;
2340 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2341 #if defined(SPARC64_GCC_WORKAROUND)
2344 const char *s = NULL;
2347 if (pos_before >= data->last_start_min)
2350 b = data->last_start_min;
2353 s = SvPV_const(data->last_found, l);
2354 old = b - data->last_start_min;
2357 I32 b = pos_before >= data->last_start_min
2358 ? pos_before : data->last_start_min;
2360 const char *s = SvPV_const(data->last_found, l);
2361 I32 old = b - data->last_start_min;
2365 old = utf8_hop((U8*)s, old) - (U8*)s;
2368 /* Get the added string: */
2369 last_str = newSVpvn(s + old, l);
2371 SvUTF8_on(last_str);
2372 if (deltanext == 0 && pos_before == b) {
2373 /* What was added is a constant string */
2375 SvGROW(last_str, (mincount * l) + 1);
2376 repeatcpy(SvPVX(last_str) + l,
2377 SvPVX_const(last_str), l, mincount - 1);
2378 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2379 /* Add additional parts. */
2380 SvCUR_set(data->last_found,
2381 SvCUR(data->last_found) - l);
2382 sv_catsv(data->last_found, last_str);
2384 SV * sv = data->last_found;
2386 SvUTF8(sv) && SvMAGICAL(sv) ?
2387 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2388 if (mg && mg->mg_len >= 0)
2389 mg->mg_len += CHR_SVLEN(last_str);
2391 data->last_end += l * (mincount - 1);
2394 /* start offset must point into the last copy */
2395 data->last_start_min += minnext * (mincount - 1);
2396 data->last_start_max += is_inf ? I32_MAX
2397 : (maxcount - 1) * (minnext + data->pos_delta);
2400 /* It is counted once already... */
2401 data->pos_min += minnext * (mincount - counted);
2402 data->pos_delta += - counted * deltanext +
2403 (minnext + deltanext) * maxcount - minnext * mincount;
2404 if (mincount != maxcount) {
2405 /* Cannot extend fixed substrings found inside
2407 scan_commit(pRExC_state,data);
2408 if (mincount && last_str) {
2409 sv_setsv(data->last_found, last_str);
2410 data->last_end = data->pos_min;
2411 data->last_start_min =
2412 data->pos_min - CHR_SVLEN(last_str);
2413 data->last_start_max = is_inf
2415 : data->pos_min + data->pos_delta
2416 - CHR_SVLEN(last_str);
2418 data->longest = &(data->longest_float);
2420 SvREFCNT_dec(last_str);
2422 if (data && (fl & SF_HAS_EVAL))
2423 data->flags |= SF_HAS_EVAL;
2424 optimize_curly_tail:
2425 if (OP(oscan) != CURLYX) {
2426 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2428 NEXT_OFF(oscan) += NEXT_OFF(next);
2431 default: /* REF and CLUMP only? */
2432 if (flags & SCF_DO_SUBSTR) {
2433 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2434 data->longest = &(data->longest_float);
2436 is_inf = is_inf_internal = 1;
2437 if (flags & SCF_DO_STCLASS_OR)
2438 cl_anything(pRExC_state, data->start_class);
2439 flags &= ~SCF_DO_STCLASS;
2443 else if (strchr((const char*)PL_simple,OP(scan))) {
2446 if (flags & SCF_DO_SUBSTR) {
2447 scan_commit(pRExC_state,data);
2451 if (flags & SCF_DO_STCLASS) {
2452 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2454 /* Some of the logic below assumes that switching
2455 locale on will only add false positives. */
2456 switch (PL_regkind[(U8)OP(scan)]) {
2460 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2461 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2462 cl_anything(pRExC_state, data->start_class);
2465 if (OP(scan) == SANY)
2467 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2468 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2469 || (data->start_class->flags & ANYOF_CLASS));
2470 cl_anything(pRExC_state, data->start_class);
2472 if (flags & SCF_DO_STCLASS_AND || !value)
2473 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2476 if (flags & SCF_DO_STCLASS_AND)
2477 cl_and(data->start_class,
2478 (struct regnode_charclass_class*)scan);
2480 cl_or(pRExC_state, data->start_class,
2481 (struct regnode_charclass_class*)scan);
2484 if (flags & SCF_DO_STCLASS_AND) {
2485 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2486 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2487 for (value = 0; value < 256; value++)
2488 if (!isALNUM(value))
2489 ANYOF_BITMAP_CLEAR(data->start_class, value);
2493 if (data->start_class->flags & ANYOF_LOCALE)
2494 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2496 for (value = 0; value < 256; value++)
2498 ANYOF_BITMAP_SET(data->start_class, value);
2503 if (flags & SCF_DO_STCLASS_AND) {
2504 if (data->start_class->flags & ANYOF_LOCALE)
2505 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2508 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2509 data->start_class->flags |= ANYOF_LOCALE;
2513 if (flags & SCF_DO_STCLASS_AND) {
2514 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2516 for (value = 0; value < 256; value++)
2518 ANYOF_BITMAP_CLEAR(data->start_class, value);
2522 if (data->start_class->flags & ANYOF_LOCALE)
2523 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2525 for (value = 0; value < 256; value++)
2526 if (!isALNUM(value))
2527 ANYOF_BITMAP_SET(data->start_class, value);
2532 if (flags & SCF_DO_STCLASS_AND) {
2533 if (data->start_class->flags & ANYOF_LOCALE)
2534 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2537 data->start_class->flags |= ANYOF_LOCALE;
2538 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2542 if (flags & SCF_DO_STCLASS_AND) {
2543 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2545 for (value = 0; value < 256; value++)
2546 if (!isSPACE(value))
2547 ANYOF_BITMAP_CLEAR(data->start_class, value);
2551 if (data->start_class->flags & ANYOF_LOCALE)
2552 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2554 for (value = 0; value < 256; value++)
2556 ANYOF_BITMAP_SET(data->start_class, value);
2561 if (flags & SCF_DO_STCLASS_AND) {
2562 if (data->start_class->flags & ANYOF_LOCALE)
2563 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2566 data->start_class->flags |= ANYOF_LOCALE;
2567 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2571 if (flags & SCF_DO_STCLASS_AND) {
2572 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2574 for (value = 0; value < 256; value++)
2576 ANYOF_BITMAP_CLEAR(data->start_class, value);
2580 if (data->start_class->flags & ANYOF_LOCALE)
2581 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2583 for (value = 0; value < 256; value++)
2584 if (!isSPACE(value))
2585 ANYOF_BITMAP_SET(data->start_class, value);
2590 if (flags & SCF_DO_STCLASS_AND) {
2591 if (data->start_class->flags & ANYOF_LOCALE) {
2592 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2593 for (value = 0; value < 256; value++)
2594 if (!isSPACE(value))
2595 ANYOF_BITMAP_CLEAR(data->start_class, value);
2599 data->start_class->flags |= ANYOF_LOCALE;
2600 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2606 for (value = 0; value < 256; value++)
2607 if (!isDIGIT(value))
2608 ANYOF_BITMAP_CLEAR(data->start_class, value);
2611 if (data->start_class->flags & ANYOF_LOCALE)
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2614 for (value = 0; value < 256; value++)
2616 ANYOF_BITMAP_SET(data->start_class, value);
2621 if (flags & SCF_DO_STCLASS_AND) {
2622 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2623 for (value = 0; value < 256; value++)
2625 ANYOF_BITMAP_CLEAR(data->start_class, value);
2628 if (data->start_class->flags & ANYOF_LOCALE)
2629 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2631 for (value = 0; value < 256; value++)
2632 if (!isDIGIT(value))
2633 ANYOF_BITMAP_SET(data->start_class, value);
2638 if (flags & SCF_DO_STCLASS_OR)
2639 cl_and(data->start_class, &and_with);
2640 flags &= ~SCF_DO_STCLASS;
2643 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2644 data->flags |= (OP(scan) == MEOL
2648 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2649 /* Lookbehind, or need to calculate parens/evals/stclass: */
2650 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2651 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2652 /* Lookahead/lookbehind */
2653 I32 deltanext, minnext, fake = 0;
2655 struct regnode_charclass_class intrnl;
2658 data_fake.flags = 0;
2660 data_fake.whilem_c = data->whilem_c;
2661 data_fake.last_closep = data->last_closep;
2664 data_fake.last_closep = &fake;
2665 if ( flags & SCF_DO_STCLASS && !scan->flags
2666 && OP(scan) == IFMATCH ) { /* Lookahead */
2667 cl_init(pRExC_state, &intrnl);
2668 data_fake.start_class = &intrnl;
2669 f |= SCF_DO_STCLASS_AND;
2671 if (flags & SCF_WHILEM_VISITED_POS)
2672 f |= SCF_WHILEM_VISITED_POS;
2673 next = regnext(scan);
2674 nscan = NEXTOPER(NEXTOPER(scan));
2675 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2678 vFAIL("Variable length lookbehind not implemented");
2680 else if (minnext > U8_MAX) {
2681 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2683 scan->flags = (U8)minnext;
2685 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2687 if (data && (data_fake.flags & SF_HAS_EVAL))
2688 data->flags |= SF_HAS_EVAL;
2690 data->whilem_c = data_fake.whilem_c;
2691 if (f & SCF_DO_STCLASS_AND) {
2692 const int was = (data->start_class->flags & ANYOF_EOS);
2694 cl_and(data->start_class, &intrnl);
2696 data->start_class->flags |= ANYOF_EOS;
2699 else if (OP(scan) == OPEN) {
2702 else if (OP(scan) == CLOSE) {
2703 if ((I32)ARG(scan) == is_par) {
2704 next = regnext(scan);
2706 if ( next && (OP(next) != WHILEM) && next < last)
2707 is_par = 0; /* Disable optimization */
2710 *(data->last_closep) = ARG(scan);
2712 else if (OP(scan) == EVAL) {
2714 data->flags |= SF_HAS_EVAL;
2716 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2717 if (flags & SCF_DO_SUBSTR) {
2718 scan_commit(pRExC_state,data);
2719 data->longest = &(data->longest_float);
2721 is_inf = is_inf_internal = 1;
2722 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2723 cl_anything(pRExC_state, data->start_class);
2724 flags &= ~SCF_DO_STCLASS;
2726 /* Else: zero-length, ignore. */
2727 scan = regnext(scan);
2732 *deltap = is_inf_internal ? I32_MAX : delta;
2733 if (flags & SCF_DO_SUBSTR && is_inf)
2734 data->pos_delta = I32_MAX - data->pos_min;
2735 if (is_par > U8_MAX)
2737 if (is_par && pars==1 && data) {
2738 data->flags |= SF_IN_PAR;
2739 data->flags &= ~SF_HAS_PAR;
2741 else if (pars && data) {
2742 data->flags |= SF_HAS_PAR;
2743 data->flags &= ~SF_IN_PAR;
2745 if (flags & SCF_DO_STCLASS_OR)
2746 cl_and(data->start_class, &and_with);
2751 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2753 if (RExC_rx->data) {
2754 Renewc(RExC_rx->data,
2755 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2756 char, struct reg_data);
2757 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2758 RExC_rx->data->count += n;
2761 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2762 char, struct reg_data);
2763 Newx(RExC_rx->data->what, n, U8);
2764 RExC_rx->data->count = n;
2766 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2767 return RExC_rx->data->count - n;
2771 Perl_reginitcolors(pTHX)
2773 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2775 char *t = savepv(s);
2779 t = strchr(t, '\t');
2785 PL_colors[i] = t = (char *)"";
2790 PL_colors[i++] = (char *)"";
2797 - pregcomp - compile a regular expression into internal code
2799 * We can't allocate space until we know how big the compiled form will be,
2800 * but we can't compile it (and thus know how big it is) until we've got a
2801 * place to put the code. So we cheat: we compile it twice, once with code
2802 * generation turned off and size counting turned on, and once "for real".
2803 * This also means that we don't allocate space until we are sure that the
2804 * thing really will compile successfully, and we never have to move the
2805 * code and thus invalidate pointers into it. (Note that it has to be in
2806 * one piece because free() must be able to free it all.) [NB: not true in perl]
2808 * Beware that the optimization-preparation code in here knows about some
2809 * of the structure of the compiled regexp. [I'll say.]
2812 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2822 RExC_state_t RExC_state;
2823 RExC_state_t *pRExC_state = &RExC_state;
2825 GET_RE_DEBUG_FLAGS_DECL;
2828 FAIL("NULL regexp argument");
2830 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2833 DEBUG_r(if (!PL_colorset) reginitcolors());
2835 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2836 PL_colors[4],PL_colors[5],PL_colors[0],
2837 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2839 RExC_flags = pm->op_pmflags;
2843 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2844 RExC_seen_evals = 0;
2847 /* First pass: determine size, legality. */
2854 RExC_emit = &PL_regdummy;
2855 RExC_whilem_seen = 0;
2856 #if 0 /* REGC() is (currently) a NOP at the first pass.
2857 * Clever compilers notice this and complain. --jhi */
2858 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2860 if (reg(pRExC_state, 0, &flags) == NULL) {
2861 RExC_precomp = Nullch;
2864 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2866 /* Small enough for pointer-storage convention?
2867 If extralen==0, this means that we will not need long jumps. */
2868 if (RExC_size >= 0x10000L && RExC_extralen)
2869 RExC_size += RExC_extralen;
2872 if (RExC_whilem_seen > 15)
2873 RExC_whilem_seen = 15;
2875 /* Allocate space and initialize. */
2876 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2879 FAIL("Regexp out of space");
2882 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2883 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2886 r->prelen = xend - exp;
2887 r->precomp = savepvn(RExC_precomp, r->prelen);
2889 #ifdef PERL_OLD_COPY_ON_WRITE
2890 r->saved_copy = Nullsv;
2892 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2893 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2895 r->substrs = 0; /* Useful during FAIL. */
2896 r->startp = 0; /* Useful during FAIL. */
2897 r->endp = 0; /* Useful during FAIL. */
2899 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2901 r->offsets[0] = RExC_size;
2903 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2904 "%s %"UVuf" bytes for offset annotations.\n",
2905 r->offsets ? "Got" : "Couldn't get",
2906 (UV)((2*RExC_size+1) * sizeof(U32))));
2910 /* Second pass: emit code. */
2911 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2916 RExC_emit_start = r->program;
2917 RExC_emit = r->program;
2918 /* Store the count of eval-groups for security checks: */
2919 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2920 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2922 if (reg(pRExC_state, 0, &flags) == NULL)
2926 /* Dig out information for optimizations. */
2927 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2928 pm->op_pmflags = RExC_flags;
2930 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2931 r->regstclass = NULL;
2932 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2933 r->reganch |= ROPT_NAUGHTY;
2934 scan = r->program + 1; /* First BRANCH. */
2936 /* XXXX To minimize changes to RE engine we always allocate
2937 3-units-long substrs field. */
2938 Newxz(r->substrs, 1, struct reg_substr_data);
2940 StructCopy(&zero_scan_data, &data, scan_data_t);
2941 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2942 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2944 STRLEN longest_float_length, longest_fixed_length;
2945 struct regnode_charclass_class ch_class;
2950 /* Skip introductions and multiplicators >= 1. */
2951 while ((OP(first) == OPEN && (sawopen = 1)) ||
2952 /* An OR of *one* alternative - should not happen now. */
2953 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2954 (OP(first) == PLUS) ||
2955 (OP(first) == MINMOD) ||
2956 /* An {n,m} with n>0 */
2957 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2958 if (OP(first) == PLUS)
2961 first += regarglen[(U8)OP(first)];
2962 first = NEXTOPER(first);
2965 /* Starting-point info. */
2967 if (PL_regkind[(U8)OP(first)] == EXACT) {
2968 if (OP(first) == EXACT)
2969 ; /* Empty, get anchored substr later. */
2970 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2971 r->regstclass = first;
2973 else if (strchr((const char*)PL_simple,OP(first)))
2974 r->regstclass = first;
2975 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2976 PL_regkind[(U8)OP(first)] == NBOUND)
2977 r->regstclass = first;
2978 else if (PL_regkind[(U8)OP(first)] == BOL) {
2979 r->reganch |= (OP(first) == MBOL
2981 : (OP(first) == SBOL
2984 first = NEXTOPER(first);
2987 else if (OP(first) == GPOS) {
2988 r->reganch |= ROPT_ANCH_GPOS;
2989 first = NEXTOPER(first);
2992 else if (!sawopen && (OP(first) == STAR &&
2993 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2994 !(r->reganch & ROPT_ANCH) )
2996 /* turn .* into ^.* with an implied $*=1 */
2998 (OP(NEXTOPER(first)) == REG_ANY)
3001 r->reganch |= type | ROPT_IMPLICIT;
3002 first = NEXTOPER(first);
3005 if (sawplus && (!sawopen || !RExC_sawback)
3006 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3007 /* x+ must match at the 1st pos of run of x's */
3008 r->reganch |= ROPT_SKIP;
3010 /* Scan is after the zeroth branch, first is atomic matcher. */
3011 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3012 (IV)(first - scan + 1)));
3014 * If there's something expensive in the r.e., find the
3015 * longest literal string that must appear and make it the
3016 * regmust. Resolve ties in favor of later strings, since
3017 * the regstart check works with the beginning of the r.e.
3018 * and avoiding duplication strengthens checking. Not a
3019 * strong reason, but sufficient in the absence of others.
3020 * [Now we resolve ties in favor of the earlier string if
3021 * it happens that c_offset_min has been invalidated, since the
3022 * earlier string may buy us something the later one won't.]
3026 data.longest_fixed = newSVpvn("",0);
3027 data.longest_float = newSVpvn("",0);
3028 data.last_found = newSVpvn("",0);
3029 data.longest = &(data.longest_fixed);
3031 if (!r->regstclass) {
3032 cl_init(pRExC_state, &ch_class);
3033 data.start_class = &ch_class;
3034 stclass_flag = SCF_DO_STCLASS_AND;
3035 } else /* XXXX Check for BOUND? */
3037 data.last_closep = &last_close;
3039 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3040 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3041 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3042 && data.last_start_min == 0 && data.last_end > 0
3043 && !RExC_seen_zerolen
3044 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3045 r->reganch |= ROPT_CHECK_ALL;
3046 scan_commit(pRExC_state, &data);
3047 SvREFCNT_dec(data.last_found);
3049 longest_float_length = CHR_SVLEN(data.longest_float);
3050 if (longest_float_length
3051 || (data.flags & SF_FL_BEFORE_EOL
3052 && (!(data.flags & SF_FL_BEFORE_MEOL)
3053 || (RExC_flags & PMf_MULTILINE)))) {
3056 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3057 && data.offset_fixed == data.offset_float_min
3058 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3059 goto remove_float; /* As in (a)+. */
3061 if (SvUTF8(data.longest_float)) {
3062 r->float_utf8 = data.longest_float;
3063 r->float_substr = Nullsv;
3065 r->float_substr = data.longest_float;
3066 r->float_utf8 = Nullsv;
3068 r->float_min_offset = data.offset_float_min;
3069 r->float_max_offset = data.offset_float_max;
3070 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3071 && (!(data.flags & SF_FL_BEFORE_MEOL)
3072 || (RExC_flags & PMf_MULTILINE)));
3073 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3077 r->float_substr = r->float_utf8 = Nullsv;
3078 SvREFCNT_dec(data.longest_float);
3079 longest_float_length = 0;
3082 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3083 if (longest_fixed_length
3084 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3086 || (RExC_flags & PMf_MULTILINE)))) {
3089 if (SvUTF8(data.longest_fixed)) {
3090 r->anchored_utf8 = data.longest_fixed;
3091 r->anchored_substr = Nullsv;
3093 r->anchored_substr = data.longest_fixed;
3094 r->anchored_utf8 = Nullsv;
3096 r->anchored_offset = data.offset_fixed;
3097 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3099 || (RExC_flags & PMf_MULTILINE)));
3100 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3103 r->anchored_substr = r->anchored_utf8 = Nullsv;
3104 SvREFCNT_dec(data.longest_fixed);
3105 longest_fixed_length = 0;
3108 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3109 r->regstclass = NULL;
3110 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3112 && !(data.start_class->flags & ANYOF_EOS)
3113 && !cl_is_anything(data.start_class))
3115 const I32 n = add_data(pRExC_state, 1, "f");
3117 Newx(RExC_rx->data->data[n], 1,
3118 struct regnode_charclass_class);
3119 StructCopy(data.start_class,
3120 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3121 struct regnode_charclass_class);
3122 r->regstclass = (regnode*)RExC_rx->data->data[n];
3123 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3124 PL_regdata = r->data; /* for regprop() */
3125 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3126 regprop(sv, (regnode*)data.start_class);
3127 PerlIO_printf(Perl_debug_log,
3128 "synthetic stclass \"%s\".\n",
3129 SvPVX_const(sv));});
3132 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3133 if (longest_fixed_length > longest_float_length) {
3134 r->check_substr = r->anchored_substr;
3135 r->check_utf8 = r->anchored_utf8;
3136 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3137 if (r->reganch & ROPT_ANCH_SINGLE)
3138 r->reganch |= ROPT_NOSCAN;
3141 r->check_substr = r->float_substr;
3142 r->check_utf8 = r->float_utf8;
3143 r->check_offset_min = data.offset_float_min;
3144 r->check_offset_max = data.offset_float_max;
3146 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3147 This should be changed ASAP! */
3148 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3149 r->reganch |= RE_USE_INTUIT;
3150 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3151 r->reganch |= RE_INTUIT_TAIL;
3155 /* Several toplevels. Best we can is to set minlen. */
3157 struct regnode_charclass_class ch_class;
3160 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3161 scan = r->program + 1;
3162 cl_init(pRExC_state, &ch_class);
3163 data.start_class = &ch_class;
3164 data.last_closep = &last_close;
3165 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3166 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3167 = r->float_substr = r->float_utf8 = Nullsv;
3168 if (!(data.start_class->flags & ANYOF_EOS)
3169 && !cl_is_anything(data.start_class))
3171 const I32 n = add_data(pRExC_state, 1, "f");
3173 Newx(RExC_rx->data->data[n], 1,
3174 struct regnode_charclass_class);
3175 StructCopy(data.start_class,
3176 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3177 struct regnode_charclass_class);
3178 r->regstclass = (regnode*)RExC_rx->data->data[n];
3179 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3180 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3181 regprop(sv, (regnode*)data.start_class);
3182 PerlIO_printf(Perl_debug_log,
3183 "synthetic stclass \"%s\".\n",
3184 SvPVX_const(sv));});
3189 if (RExC_seen & REG_SEEN_GPOS)
3190 r->reganch |= ROPT_GPOS_SEEN;
3191 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3192 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3193 if (RExC_seen & REG_SEEN_EVAL)
3194 r->reganch |= ROPT_EVAL_SEEN;
3195 if (RExC_seen & REG_SEEN_CANY)
3196 r->reganch |= ROPT_CANY_SEEN;
3197 Newxz(r->startp, RExC_npar, I32);
3198 Newxz(r->endp, RExC_npar, I32);
3199 PL_regdata = r->data; /* for regprop() */
3200 DEBUG_COMPILE_r(regdump(r));
3205 - reg - regular expression, i.e. main body or parenthesized thing
3207 * Caller must absorb opening parenthesis.
3209 * Combining parenthesis handling with the base level of regular expression
3210 * is a trifle forced, but the need to tie the tails of the branches to what
3211 * follows makes it hard to avoid.
3214 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3215 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3218 register regnode *ret; /* Will be the head of the group. */
3219 register regnode *br;
3220 register regnode *lastbr;
3221 register regnode *ender = 0;
3222 register I32 parno = 0;
3223 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3225 /* for (?g), (?gc), and (?o) warnings; warning
3226 about (?c) will warn about (?g) -- japhy */
3228 I32 wastedflags = 0x00,
3231 wasted_gc = 0x02 | 0x04,
3234 char * parse_start = RExC_parse; /* MJD */
3235 char * const oregcomp_parse = RExC_parse;
3238 *flagp = 0; /* Tentatively. */
3241 /* Make an OPEN node, if parenthesized. */
3243 if (*RExC_parse == '?') { /* (?...) */
3244 U32 posflags = 0, negflags = 0;
3245 U32 *flagsp = &posflags;
3247 const char * const seqstart = RExC_parse;
3250 paren = *RExC_parse++;
3251 ret = NULL; /* For look-ahead/behind. */
3253 case '<': /* (?<...) */
3254 RExC_seen |= REG_SEEN_LOOKBEHIND;
3255 if (*RExC_parse == '!')
3257 if (*RExC_parse != '=' && *RExC_parse != '!')
3260 case '=': /* (?=...) */
3261 case '!': /* (?!...) */
3262 RExC_seen_zerolen++;
3263 case ':': /* (?:...) */
3264 case '>': /* (?>...) */
3266 case '$': /* (?$...) */
3267 case '@': /* (?@...) */
3268 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3270 case '#': /* (?#...) */
3271 while (*RExC_parse && *RExC_parse != ')')
3273 if (*RExC_parse != ')')
3274 FAIL("Sequence (?#... not terminated");
3275 nextchar(pRExC_state);
3278 case 'p': /* (?p...) */
3279 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3280 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3282 case '?': /* (??...) */
3284 if (*RExC_parse != '{')
3286 paren = *RExC_parse++;
3288 case '{': /* (?{...}) */
3290 I32 count = 1, n = 0;
3292 char *s = RExC_parse;
3294 OP_4tree *sop, *rop;
3296 RExC_seen_zerolen++;
3297 RExC_seen |= REG_SEEN_EVAL;
3298 while (count && (c = *RExC_parse)) {
3299 if (c == '\\' && RExC_parse[1])
3307 if (*RExC_parse != ')')
3310 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3315 if (RExC_parse - 1 - s)
3316 sv = newSVpvn(s, RExC_parse - 1 - s);
3318 sv = newSVpvn("", 0);
3321 Perl_save_re_context(aTHX);
3322 rop = sv_compile_2op(sv, &sop, "re", &pad);
3323 sop->op_private |= OPpREFCOUNTED;
3324 /* re_dup will OpREFCNT_inc */
3325 OpREFCNT_set(sop, 1);
3328 n = add_data(pRExC_state, 3, "nop");
3329 RExC_rx->data->data[n] = (void*)rop;
3330 RExC_rx->data->data[n+1] = (void*)sop;
3331 RExC_rx->data->data[n+2] = (void*)pad;
3334 else { /* First pass */
3335 if (PL_reginterp_cnt < ++RExC_seen_evals
3337 /* No compiled RE interpolated, has runtime
3338 components ===> unsafe. */
3339 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3340 if (PL_tainting && PL_tainted)
3341 FAIL("Eval-group in insecure regular expression");
3342 if (IN_PERL_COMPILETIME)
3346 nextchar(pRExC_state);
3348 ret = reg_node(pRExC_state, LOGICAL);
3351 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3352 /* deal with the length of this later - MJD */
3355 ret = reganode(pRExC_state, EVAL, n);
3356 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3357 Set_Node_Offset(ret, parse_start);
3360 case '(': /* (?(?{...})...) and (?(?=...)...) */
3362 if (RExC_parse[0] == '?') { /* (?(?...)) */
3363 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3364 || RExC_parse[1] == '<'
3365 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3368 ret = reg_node(pRExC_state, LOGICAL);
3371 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3375 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3377 parno = atoi(RExC_parse++);
3379 while (isDIGIT(*RExC_parse))
3381 ret = reganode(pRExC_state, GROUPP, parno);
3383 if ((c = *nextchar(pRExC_state)) != ')')
3384 vFAIL("Switch condition not recognized");
3386 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3387 br = regbranch(pRExC_state, &flags, 1);
3389 br = reganode(pRExC_state, LONGJMP, 0);
3391 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3392 c = *nextchar(pRExC_state);
3396 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3397 regbranch(pRExC_state, &flags, 1);
3398 regtail(pRExC_state, ret, lastbr);
3401 c = *nextchar(pRExC_state);
3406 vFAIL("Switch (?(condition)... contains too many branches");
3407 ender = reg_node(pRExC_state, TAIL);
3408 regtail(pRExC_state, br, ender);
3410 regtail(pRExC_state, lastbr, ender);
3411 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3414 regtail(pRExC_state, ret, ender);
3418 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3422 RExC_parse--; /* for vFAIL to print correctly */
3423 vFAIL("Sequence (? incomplete");
3427 parse_flags: /* (?i) */
3428 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3429 /* (?g), (?gc) and (?o) are useless here
3430 and must be globally applied -- japhy */
3432 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3434 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3435 if (! (wastedflags & wflagbit) ) {
3436 wastedflags |= wflagbit;
3439 "Useless (%s%c) - %suse /%c modifier",
3440 flagsp == &negflags ? "?-" : "?",
3442 flagsp == &negflags ? "don't " : "",
3448 else if (*RExC_parse == 'c') {
3449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3450 if (! (wastedflags & wasted_c) ) {
3451 wastedflags |= wasted_gc;
3454 "Useless (%sc) - %suse /gc modifier",
3455 flagsp == &negflags ? "?-" : "?",
3456 flagsp == &negflags ? "don't " : ""
3461 else { pmflag(flagsp, *RExC_parse); }
3465 if (*RExC_parse == '-') {
3467 wastedflags = 0; /* reset so (?g-c) warns twice */
3471 RExC_flags |= posflags;
3472 RExC_flags &= ~negflags;
3473 if (*RExC_parse == ':') {
3479 if (*RExC_parse != ')') {
3481 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3483 nextchar(pRExC_state);
3491 ret = reganode(pRExC_state, OPEN, parno);
3492 Set_Node_Length(ret, 1); /* MJD */
3493 Set_Node_Offset(ret, RExC_parse); /* MJD */
3500 /* Pick up the branches, linking them together. */
3501 parse_start = RExC_parse; /* MJD */
3502 br = regbranch(pRExC_state, &flags, 1);
3503 /* branch_len = (paren != 0); */
3507 if (*RExC_parse == '|') {
3508 if (!SIZE_ONLY && RExC_extralen) {
3509 reginsert(pRExC_state, BRANCHJ, br);
3512 reginsert(pRExC_state, BRANCH, br);
3513 Set_Node_Length(br, paren != 0);
3514 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3518 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3520 else if (paren == ':') {
3521 *flagp |= flags&SIMPLE;
3523 if (open) { /* Starts with OPEN. */
3524 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3526 else if (paren != '?') /* Not Conditional */
3528 *flagp |= flags & (SPSTART | HASWIDTH);
3530 while (*RExC_parse == '|') {
3531 if (!SIZE_ONLY && RExC_extralen) {
3532 ender = reganode(pRExC_state, LONGJMP,0);
3533 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3536 RExC_extralen += 2; /* Account for LONGJMP. */
3537 nextchar(pRExC_state);
3538 br = regbranch(pRExC_state, &flags, 0);
3542 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3546 *flagp |= flags&SPSTART;
3549 if (have_branch || paren != ':') {
3550 /* Make a closing node, and hook it on the end. */
3553 ender = reg_node(pRExC_state, TAIL);
3556 ender = reganode(pRExC_state, CLOSE, parno);
3557 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3558 Set_Node_Length(ender,1); /* MJD */
3564 *flagp &= ~HASWIDTH;
3567 ender = reg_node(pRExC_state, SUCCEED);
3570 ender = reg_node(pRExC_state, END);
3573 regtail(pRExC_state, lastbr, ender);
3576 /* Hook the tails of the branches to the closing node. */
3577 for (br = ret; br != NULL; br = regnext(br)) {
3578 regoptail(pRExC_state, br, ender);
3585 static const char parens[] = "=!<,>";
3587 if (paren && (p = strchr(parens, paren))) {
3588 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3589 int flag = (p - parens) > 1;
3592 node = SUSPEND, flag = 0;
3593 reginsert(pRExC_state, node,ret);
3594 Set_Node_Cur_Length(ret);
3595 Set_Node_Offset(ret, parse_start + 1);
3597 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3601 /* Check for proper termination. */
3603 RExC_flags = oregflags;
3604 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3605 RExC_parse = oregcomp_parse;
3606 vFAIL("Unmatched (");
3609 else if (!paren && RExC_parse < RExC_end) {
3610 if (*RExC_parse == ')') {
3612 vFAIL("Unmatched )");
3615 FAIL("Junk on end of regexp"); /* "Can't happen". */
3623 - regbranch - one alternative of an | operator
3625 * Implements the concatenation operator.
3628 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3630 register regnode *ret;
3631 register regnode *chain = NULL;
3632 register regnode *latest;
3633 I32 flags = 0, c = 0;
3638 if (!SIZE_ONLY && RExC_extralen)
3639 ret = reganode(pRExC_state, BRANCHJ,0);
3641 ret = reg_node(pRExC_state, BRANCH);
3642 Set_Node_Length(ret, 1);
3646 if (!first && SIZE_ONLY)
3647 RExC_extralen += 1; /* BRANCHJ */
3649 *flagp = WORST; /* Tentatively. */
3652 nextchar(pRExC_state);
3653 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3655 latest = regpiece(pRExC_state, &flags);
3656 if (latest == NULL) {
3657 if (flags & TRYAGAIN)
3661 else if (ret == NULL)
3663 *flagp |= flags&HASWIDTH;
3664 if (chain == NULL) /* First piece. */
3665 *flagp |= flags&SPSTART;
3668 regtail(pRExC_state, chain, latest);
3673 if (chain == NULL) { /* Loop ran zero times. */
3674 chain = reg_node(pRExC_state, NOTHING);
3679 *flagp |= flags&SIMPLE;
3686 - regpiece - something followed by possible [*+?]
3688 * Note that the branching code sequences used for ? and the general cases
3689 * of * and + are somewhat optimized: they use the same NOTHING node as
3690 * both the endmarker for their branch list and the body of the last branch.
3691 * It might seem that this node could be dispensed with entirely, but the
3692 * endmarker role is not redundant.
3695 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3697 register regnode *ret;
3699 register char *next;
3701 const char * const origparse = RExC_parse;
3704 I32 max = REG_INFTY;
3707 ret = regatom(pRExC_state, &flags);
3709 if (flags & TRYAGAIN)
3716 if (op == '{' && regcurly(RExC_parse)) {
3717 parse_start = RExC_parse; /* MJD */
3718 next = RExC_parse + 1;
3720 while (isDIGIT(*next) || *next == ',') {
3729 if (*next == '}') { /* got one */
3733 min = atoi(RExC_parse);
3737 maxpos = RExC_parse;
3739 if (!max && *maxpos != '0')
3740 max = REG_INFTY; /* meaning "infinity" */
3741 else if (max >= REG_INFTY)
3742 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3744 nextchar(pRExC_state);
3747 if ((flags&SIMPLE)) {
3748 RExC_naughty += 2 + RExC_naughty / 2;
3749 reginsert(pRExC_state, CURLY, ret);
3750 Set_Node_Offset(ret, parse_start+1); /* MJD */
3751 Set_Node_Cur_Length(ret);
3754 regnode *w = reg_node(pRExC_state, WHILEM);
3757 regtail(pRExC_state, ret, w);
3758 if (!SIZE_ONLY && RExC_extralen) {
3759 reginsert(pRExC_state, LONGJMP,ret);
3760 reginsert(pRExC_state, NOTHING,ret);
3761 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3763 reginsert(pRExC_state, CURLYX,ret);
3765 Set_Node_Offset(ret, parse_start+1);
3766 Set_Node_Length(ret,
3767 op == '{' ? (RExC_parse - parse_start) : 1);
3769 if (!SIZE_ONLY && RExC_extralen)
3770 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3771 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3773 RExC_whilem_seen++, RExC_extralen += 3;
3774 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3782 if (max && max < min)
3783 vFAIL("Can't do {n,m} with n > m");
3785 ARG1_SET(ret, (U16)min);
3786 ARG2_SET(ret, (U16)max);
3798 #if 0 /* Now runtime fix should be reliable. */
3800 /* if this is reinstated, don't forget to put this back into perldiag:
3802 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3804 (F) The part of the regexp subject to either the * or + quantifier
3805 could match an empty string. The {#} shows in the regular
3806 expression about where the problem was discovered.
3810 if (!(flags&HASWIDTH) && op != '?')
3811 vFAIL("Regexp *+ operand could be empty");
3814 parse_start = RExC_parse;
3815 nextchar(pRExC_state);
3817 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3819 if (op == '*' && (flags&SIMPLE)) {
3820 reginsert(pRExC_state, STAR, ret);
3824 else if (op == '*') {
3828 else if (op == '+' && (flags&SIMPLE)) {
3829 reginsert(pRExC_state, PLUS, ret);
3833 else if (op == '+') {
3837 else if (op == '?') {
3842 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
3844 "%.*s matches null string many times",
3845 RExC_parse - origparse,
3849 if (*RExC_parse == '?') {
3850 nextchar(pRExC_state);
3851 reginsert(pRExC_state, MINMOD, ret);
3852 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3854 if (ISMULT2(RExC_parse)) {
3856 vFAIL("Nested quantifiers");
3863 - regatom - the lowest level
3865 * Optimization: gobbles an entire sequence of ordinary characters so that
3866 * it can turn them into a single node, which is smaller to store and
3867 * faster to run. Backslashed characters are exceptions, each becoming a
3868 * separate node; the code is simpler that way and it's not worth fixing.
3870 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3872 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3874 register regnode *ret = 0;
3876 char *parse_start = RExC_parse;
3878 *flagp = WORST; /* Tentatively. */
3881 switch (*RExC_parse) {
3883 RExC_seen_zerolen++;
3884 nextchar(pRExC_state);
3885 if (RExC_flags & PMf_MULTILINE)
3886 ret = reg_node(pRExC_state, MBOL);
3887 else if (RExC_flags & PMf_SINGLELINE)
3888 ret = reg_node(pRExC_state, SBOL);
3890 ret = reg_node(pRExC_state, BOL);
3891 Set_Node_Length(ret, 1); /* MJD */
3894 nextchar(pRExC_state);
3896 RExC_seen_zerolen++;
3897 if (RExC_flags & PMf_MULTILINE)
3898 ret = reg_node(pRExC_state, MEOL);
3899 else if (RExC_flags & PMf_SINGLELINE)
3900 ret = reg_node(pRExC_state, SEOL);
3902 ret = reg_node(pRExC_state, EOL);
3903 Set_Node_Length(ret, 1); /* MJD */
3906 nextchar(pRExC_state);
3907 if (RExC_flags & PMf_SINGLELINE)
3908 ret = reg_node(pRExC_state, SANY);
3910 ret = reg_node(pRExC_state, REG_ANY);
3911 *flagp |= HASWIDTH|SIMPLE;
3913 Set_Node_Length(ret, 1); /* MJD */
3917 char *oregcomp_parse = ++RExC_parse;
3918 ret = regclass(pRExC_state);
3919 if (*RExC_parse != ']') {
3920 RExC_parse = oregcomp_parse;
3921 vFAIL("Unmatched [");
3923 nextchar(pRExC_state);
3924 *flagp |= HASWIDTH|SIMPLE;
3925 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3929 nextchar(pRExC_state);
3930 ret = reg(pRExC_state, 1, &flags);
3932 if (flags & TRYAGAIN) {
3933 if (RExC_parse == RExC_end) {
3934 /* Make parent create an empty node if needed. */
3942 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3946 if (flags & TRYAGAIN) {
3950 vFAIL("Internal urp");
3951 /* Supposed to be caught earlier. */
3954 if (!regcurly(RExC_parse)) {
3963 vFAIL("Quantifier follows nothing");
3966 switch (*++RExC_parse) {
3968 RExC_seen_zerolen++;
3969 ret = reg_node(pRExC_state, SBOL);
3971 nextchar(pRExC_state);
3972 Set_Node_Length(ret, 2); /* MJD */
3975 ret = reg_node(pRExC_state, GPOS);
3976 RExC_seen |= REG_SEEN_GPOS;
3978 nextchar(pRExC_state);
3979 Set_Node_Length(ret, 2); /* MJD */
3982 ret = reg_node(pRExC_state, SEOL);
3984 RExC_seen_zerolen++; /* Do not optimize RE away */
3985 nextchar(pRExC_state);
3988 ret = reg_node(pRExC_state, EOS);
3990 RExC_seen_zerolen++; /* Do not optimize RE away */
3991 nextchar(pRExC_state);
3992 Set_Node_Length(ret, 2); /* MJD */
3995 ret = reg_node(pRExC_state, CANY);
3996 RExC_seen |= REG_SEEN_CANY;
3997 *flagp |= HASWIDTH|SIMPLE;
3998 nextchar(pRExC_state);
3999 Set_Node_Length(ret, 2); /* MJD */
4002 ret = reg_node(pRExC_state, CLUMP);
4004 nextchar(pRExC_state);
4005 Set_Node_Length(ret, 2); /* MJD */
4008 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4009 *flagp |= HASWIDTH|SIMPLE;
4010 nextchar(pRExC_state);
4011 Set_Node_Length(ret, 2); /* MJD */
4014 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4015 *flagp |= HASWIDTH|SIMPLE;
4016 nextchar(pRExC_state);
4017 Set_Node_Length(ret, 2); /* MJD */
4020 RExC_seen_zerolen++;
4021 RExC_seen |= REG_SEEN_LOOKBEHIND;
4022 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4024 nextchar(pRExC_state);
4025 Set_Node_Length(ret, 2); /* MJD */
4028 RExC_seen_zerolen++;
4029 RExC_seen |= REG_SEEN_LOOKBEHIND;
4030 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4032 nextchar(pRExC_state);
4033 Set_Node_Length(ret, 2); /* MJD */
4036 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4037 *flagp |= HASWIDTH|SIMPLE;
4038 nextchar(pRExC_state);
4039 Set_Node_Length(ret, 2); /* MJD */
4042 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4043 *flagp |= HASWIDTH|SIMPLE;
4044 nextchar(pRExC_state);
4045 Set_Node_Length(ret, 2); /* MJD */
4048 ret = reg_node(pRExC_state, DIGIT);
4049 *flagp |= HASWIDTH|SIMPLE;
4050 nextchar(pRExC_state);
4051 Set_Node_Length(ret, 2); /* MJD */
4054 ret = reg_node(pRExC_state, NDIGIT);
4055 *flagp |= HASWIDTH|SIMPLE;
4056 nextchar(pRExC_state);
4057 Set_Node_Length(ret, 2); /* MJD */
4062 char* oldregxend = RExC_end;
4063 char* parse_start = RExC_parse - 2;
4065 if (RExC_parse[1] == '{') {
4066 /* a lovely hack--pretend we saw [\pX] instead */
4067 RExC_end = strchr(RExC_parse, '}');
4069 U8 c = (U8)*RExC_parse;
4071 RExC_end = oldregxend;
4072 vFAIL2("Missing right brace on \\%c{}", c);
4077 RExC_end = RExC_parse + 2;
4078 if (RExC_end > oldregxend)
4079 RExC_end = oldregxend;
4083 ret = regclass(pRExC_state);
4085 RExC_end = oldregxend;
4088 Set_Node_Offset(ret, parse_start + 2);
4089 Set_Node_Cur_Length(ret);
4090 nextchar(pRExC_state);
4091 *flagp |= HASWIDTH|SIMPLE;
4104 case '1': case '2': case '3': case '4':
4105 case '5': case '6': case '7': case '8': case '9':
4107 const I32 num = atoi(RExC_parse);
4109 if (num > 9 && num >= RExC_npar)
4112 char * parse_start = RExC_parse - 1; /* MJD */
4113 while (isDIGIT(*RExC_parse))
4116 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4117 vFAIL("Reference to nonexistent group");
4119 ret = reganode(pRExC_state,
4120 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4124 /* override incorrect value set in reganode MJD */
4125 Set_Node_Offset(ret, parse_start+1);
4126 Set_Node_Cur_Length(ret); /* MJD */
4128 nextchar(pRExC_state);
4133 if (RExC_parse >= RExC_end)
4134 FAIL("Trailing \\");
4137 /* Do not generate "unrecognized" warnings here, we fall
4138 back into the quick-grab loop below */
4145 if (RExC_flags & PMf_EXTENDED) {
4146 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4147 if (RExC_parse < RExC_end)
4153 register STRLEN len;
4158 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4160 parse_start = RExC_parse - 1;
4166 ret = reg_node(pRExC_state,
4167 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4169 for (len = 0, p = RExC_parse - 1;
4170 len < 127 && p < RExC_end;
4175 if (RExC_flags & PMf_EXTENDED)
4176 p = regwhite(p, RExC_end);
4223 ender = ASCII_TO_NATIVE('\033');
4227 ender = ASCII_TO_NATIVE('\007');
4232 char* const e = strchr(p, '}');
4236 vFAIL("Missing right brace on \\x{}");
4239 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4240 | PERL_SCAN_DISALLOW_PREFIX;
4241 STRLEN numlen = e - p - 1;
4242 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4249 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4251 ender = grok_hex(p, &numlen, &flags, NULL);
4257 ender = UCHARAT(p++);
4258 ender = toCTRL(ender);
4260 case '0': case '1': case '2': case '3':case '4':
4261 case '5': case '6': case '7': case '8':case '9':
4263 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4266 ender = grok_oct(p, &numlen, &flags, NULL);
4276 FAIL("Trailing \\");
4279 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4280 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4281 goto normal_default;
4286 if (UTF8_IS_START(*p) && UTF) {
4288 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4296 if (RExC_flags & PMf_EXTENDED)
4297 p = regwhite(p, RExC_end);
4299 /* Prime the casefolded buffer. */
4300 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4302 if (ISMULT2(p)) { /* Back off on ?+*. */
4309 /* Emit all the Unicode characters. */
4311 for (foldbuf = tmpbuf;
4313 foldlen -= numlen) {
4314 ender = utf8_to_uvchr(foldbuf, &numlen);
4316 reguni(pRExC_state, ender, s, &unilen);
4319 /* In EBCDIC the numlen
4320 * and unilen can differ. */
4322 if (numlen >= foldlen)
4326 break; /* "Can't happen." */
4330 reguni(pRExC_state, ender, s, &unilen);
4339 REGC((char)ender, s++);
4347 /* Emit all the Unicode characters. */
4349 for (foldbuf = tmpbuf;
4351 foldlen -= numlen) {
4352 ender = utf8_to_uvchr(foldbuf, &numlen);
4354 reguni(pRExC_state, ender, s, &unilen);
4357 /* In EBCDIC the numlen
4358 * and unilen can differ. */
4360 if (numlen >= foldlen)
4368 reguni(pRExC_state, ender, s, &unilen);
4377 REGC((char)ender, s++);
4381 Set_Node_Cur_Length(ret); /* MJD */
4382 nextchar(pRExC_state);
4384 /* len is STRLEN which is unsigned, need to copy to signed */
4387 vFAIL("Internal disaster");
4391 if (len == 1 && UNI_IS_INVARIANT(ender))
4396 RExC_size += STR_SZ(len);
4398 RExC_emit += STR_SZ(len);
4403 /* If the encoding pragma is in effect recode the text of
4404 * any EXACT-kind nodes. */
4405 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4406 STRLEN oldlen = STR_LEN(ret);
4407 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4411 if (sv_utf8_downgrade(sv, TRUE)) {
4412 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4413 const STRLEN newlen = SvCUR(sv);
4418 GET_RE_DEBUG_FLAGS_DECL;
4419 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4420 (int)oldlen, STRING(ret),
4422 Copy(s, STRING(ret), newlen, char);
4423 STR_LEN(ret) += newlen - oldlen;
4424 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4426 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4434 S_regwhite(pTHX_ char *p, const char *e)
4439 else if (*p == '#') {
4442 } while (p < e && *p != '\n');
4450 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4451 Character classes ([:foo:]) can also be negated ([:^foo:]).
4452 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4453 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4454 but trigger failures because they are currently unimplemented. */
4456 #define POSIXCC_DONE(c) ((c) == ':')
4457 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4458 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4461 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4463 I32 namedclass = OOB_NAMEDCLASS;
4465 if (value == '[' && RExC_parse + 1 < RExC_end &&
4466 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4467 POSIXCC(UCHARAT(RExC_parse))) {
4468 const char c = UCHARAT(RExC_parse);
4469 char* s = RExC_parse++;
4471 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4473 if (RExC_parse == RExC_end)
4474 /* Grandfather lone [:, [=, [. */
4477 const char* t = RExC_parse++; /* skip over the c */
4478 const char *posixcc;
4482 if (UCHARAT(RExC_parse) == ']') {
4483 RExC_parse++; /* skip over the ending ] */
4486 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4487 const I32 skip = t - posixcc;
4489 /* Initially switch on the length of the name. */
4492 if (memEQ(posixcc, "word", 4)) {
4493 /* this is not POSIX, this is the Perl \w */;
4495 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4499 /* Names all of length 5. */
4500 /* alnum alpha ascii blank cntrl digit graph lower
4501 print punct space upper */
4502 /* Offset 4 gives the best switch position. */
4503 switch (posixcc[4]) {
4505 if (memEQ(posixcc, "alph", 4)) {
4508 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4512 if (memEQ(posixcc, "spac", 4)) {
4515 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4519 if (memEQ(posixcc, "grap", 4)) {
4522 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4526 if (memEQ(posixcc, "asci", 4)) {
4529 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4533 if (memEQ(posixcc, "blan", 4)) {
4536 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4540 if (memEQ(posixcc, "cntr", 4)) {
4543 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4547 if (memEQ(posixcc, "alnu", 4)) {
4550 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4554 if (memEQ(posixcc, "lowe", 4)) {
4557 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4559 if (memEQ(posixcc, "uppe", 4)) {
4562 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4566 if (memEQ(posixcc, "digi", 4)) {
4569 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4571 if (memEQ(posixcc, "prin", 4)) {
4574 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4576 if (memEQ(posixcc, "punc", 4)) {
4579 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4585 if (memEQ(posixcc, "xdigit", 6)) {
4587 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4592 if (namedclass == OOB_NAMEDCLASS)
4594 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4597 assert (posixcc[skip] == ':');
4598 assert (posixcc[skip+1] == ']');
4599 } else if (!SIZE_ONLY) {
4600 /* [[=foo=]] and [[.foo.]] are still future. */
4602 /* adjust RExC_parse so the warning shows after
4604 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4606 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4609 /* Maternal grandfather:
4610 * "[:" ending in ":" but not in ":]" */
4620 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4622 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4623 const char *s = RExC_parse;
4624 const char c = *s++;
4626 while(*s && isALNUM(*s))
4628 if (*s && c == *s && s[1] == ']') {
4629 if (ckWARN(WARN_REGEXP))
4631 "POSIX syntax [%c %c] belongs inside character classes",
4634 /* [[=foo=]] and [[.foo.]] are still future. */
4635 if (POSIXCC_NOTYET(c)) {
4636 /* adjust RExC_parse so the error shows after
4638 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4640 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4647 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4650 register UV nextvalue;
4651 register IV prevvalue = OOB_UNICODE;
4652 register IV range = 0;
4653 register regnode *ret;
4656 char *rangebegin = 0;
4657 bool need_class = 0;
4658 SV *listsv = Nullsv;
4661 bool optimize_invert = TRUE;
4662 AV* unicode_alternate = 0;
4664 UV literal_endpoint = 0;
4667 ret = reganode(pRExC_state, ANYOF, 0);
4670 ANYOF_FLAGS(ret) = 0;
4672 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4676 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4680 RExC_size += ANYOF_SKIP;
4682 RExC_emit += ANYOF_SKIP;
4684 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4686 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4687 ANYOF_BITMAP_ZERO(ret);
4688 listsv = newSVpvn("# comment\n", 10);
4691 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4693 if (!SIZE_ONLY && POSIXCC(nextvalue))
4694 checkposixcc(pRExC_state);
4696 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4697 if (UCHARAT(RExC_parse) == ']')
4700 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4704 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4707 rangebegin = RExC_parse;
4709 value = utf8n_to_uvchr((U8*)RExC_parse,
4710 RExC_end - RExC_parse,
4712 RExC_parse += numlen;
4715 value = UCHARAT(RExC_parse++);
4716 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4717 if (value == '[' && POSIXCC(nextvalue))
4718 namedclass = regpposixcc(pRExC_state, value);
4719 else if (value == '\\') {
4721 value = utf8n_to_uvchr((U8*)RExC_parse,
4722 RExC_end - RExC_parse,
4724 RExC_parse += numlen;
4727 value = UCHARAT(RExC_parse++);
4728 /* Some compilers cannot handle switching on 64-bit integer
4729 * values, therefore value cannot be an UV. Yes, this will
4730 * be a problem later if we want switch on Unicode.
4731 * A similar issue a little bit later when switching on
4732 * namedclass. --jhi */
4733 switch ((I32)value) {
4734 case 'w': namedclass = ANYOF_ALNUM; break;
4735 case 'W': namedclass = ANYOF_NALNUM; break;
4736 case 's': namedclass = ANYOF_SPACE; break;
4737 case 'S': namedclass = ANYOF_NSPACE; break;
4738 case 'd': namedclass = ANYOF_DIGIT; break;
4739 case 'D': namedclass = ANYOF_NDIGIT; break;
4742 if (RExC_parse >= RExC_end)
4743 vFAIL2("Empty \\%c{}", (U8)value);
4744 if (*RExC_parse == '{') {
4745 const U8 c = (U8)value;
4746 e = strchr(RExC_parse++, '}');
4748 vFAIL2("Missing right brace on \\%c{}", c);
4749 while (isSPACE(UCHARAT(RExC_parse)))
4751 if (e == RExC_parse)
4752 vFAIL2("Empty \\%c{}", c);
4754 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4762 if (UCHARAT(RExC_parse) == '^') {
4765 value = value == 'p' ? 'P' : 'p'; /* toggle */
4766 while (isSPACE(UCHARAT(RExC_parse))) {
4772 Perl_sv_catpvf(aTHX_ listsv,
4773 "+utf8::%.*s\n", (int)n, RExC_parse);
4775 Perl_sv_catpvf(aTHX_ listsv,
4776 "!utf8::%.*s\n", (int)n, RExC_parse);
4779 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4780 namedclass = ANYOF_MAX; /* no official name, but it's named */
4782 case 'n': value = '\n'; break;
4783 case 'r': value = '\r'; break;
4784 case 't': value = '\t'; break;
4785 case 'f': value = '\f'; break;
4786 case 'b': value = '\b'; break;
4787 case 'e': value = ASCII_TO_NATIVE('\033');break;
4788 case 'a': value = ASCII_TO_NATIVE('\007');break;
4790 if (*RExC_parse == '{') {
4791 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4792 | PERL_SCAN_DISALLOW_PREFIX;
4793 e = strchr(RExC_parse++, '}');
4795 vFAIL("Missing right brace on \\x{}");
4797 numlen = e - RExC_parse;
4798 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4802 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4804 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4805 RExC_parse += numlen;
4809 value = UCHARAT(RExC_parse++);
4810 value = toCTRL(value);
4812 case '0': case '1': case '2': case '3': case '4':
4813 case '5': case '6': case '7': case '8': case '9':
4817 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4818 RExC_parse += numlen;
4822 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
4824 "Unrecognized escape \\%c in character class passed through",
4828 } /* end of \blah */
4834 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4836 if (!SIZE_ONLY && !need_class)
4837 ANYOF_CLASS_ZERO(ret);
4841 /* a bad range like a-\d, a-[:digit:] ? */
4844 if (ckWARN(WARN_REGEXP))
4846 "False [] range \"%*.*s\"",
4847 RExC_parse - rangebegin,
4848 RExC_parse - rangebegin,
4850 if (prevvalue < 256) {
4851 ANYOF_BITMAP_SET(ret, prevvalue);
4852 ANYOF_BITMAP_SET(ret, '-');
4855 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4856 Perl_sv_catpvf(aTHX_ listsv,
4857 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4861 range = 0; /* this was not a true range */
4865 const char *what = NULL;
4868 if (namedclass > OOB_NAMEDCLASS)
4869 optimize_invert = FALSE;
4870 /* Possible truncation here but in some 64-bit environments
4871 * the compiler gets heartburn about switch on 64-bit values.
4872 * A similar issue a little earlier when switching on value.
4874 switch ((I32)namedclass) {
4877 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4879 for (value = 0; value < 256; value++)
4881 ANYOF_BITMAP_SET(ret, value);
4888 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4890 for (value = 0; value < 256; value++)
4891 if (!isALNUM(value))
4892 ANYOF_BITMAP_SET(ret, value);
4899 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4901 for (value = 0; value < 256; value++)
4902 if (isALNUMC(value))
4903 ANYOF_BITMAP_SET(ret, value);
4910 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4912 for (value = 0; value < 256; value++)
4913 if (!isALNUMC(value))
4914 ANYOF_BITMAP_SET(ret, value);
4921 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4923 for (value = 0; value < 256; value++)
4925 ANYOF_BITMAP_SET(ret, value);
4932 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4934 for (value = 0; value < 256; value++)
4935 if (!isALPHA(value))
4936 ANYOF_BITMAP_SET(ret, value);
4943 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4946 for (value = 0; value < 128; value++)
4947 ANYOF_BITMAP_SET(ret, value);
4949 for (value = 0; value < 256; value++) {
4951 ANYOF_BITMAP_SET(ret, value);
4960 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4963 for (value = 128; value < 256; value++)
4964 ANYOF_BITMAP_SET(ret, value);
4966 for (value = 0; value < 256; value++) {
4967 if (!isASCII(value))
4968 ANYOF_BITMAP_SET(ret, value);
4977 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4979 for (value = 0; value < 256; value++)
4981 ANYOF_BITMAP_SET(ret, value);
4988 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4990 for (value = 0; value < 256; value++)
4991 if (!isBLANK(value))
4992 ANYOF_BITMAP_SET(ret, value);
4999 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5001 for (value = 0; value < 256; value++)
5003 ANYOF_BITMAP_SET(ret, value);
5010 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5012 for (value = 0; value < 256; value++)
5013 if (!isCNTRL(value))
5014 ANYOF_BITMAP_SET(ret, value);
5021 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5023 /* consecutive digits assumed */
5024 for (value = '0'; value <= '9'; value++)
5025 ANYOF_BITMAP_SET(ret, value);
5032 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5034 /* consecutive digits assumed */
5035 for (value = 0; value < '0'; value++)
5036 ANYOF_BITMAP_SET(ret, value);
5037 for (value = '9' + 1; value < 256; value++)
5038 ANYOF_BITMAP_SET(ret, value);
5045 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5047 for (value = 0; value < 256; value++)
5049 ANYOF_BITMAP_SET(ret, value);
5056 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5058 for (value = 0; value < 256; value++)
5059 if (!isGRAPH(value))
5060 ANYOF_BITMAP_SET(ret, value);
5067 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5069 for (value = 0; value < 256; value++)
5071 ANYOF_BITMAP_SET(ret, value);
5078 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5080 for (value = 0; value < 256; value++)
5081 if (!isLOWER(value))
5082 ANYOF_BITMAP_SET(ret, value);
5089 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5091 for (value = 0; value < 256; value++)
5093 ANYOF_BITMAP_SET(ret, value);
5100 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5102 for (value = 0; value < 256; value++)
5103 if (!isPRINT(value))
5104 ANYOF_BITMAP_SET(ret, value);
5111 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5113 for (value = 0; value < 256; value++)
5114 if (isPSXSPC(value))
5115 ANYOF_BITMAP_SET(ret, value);
5122 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5124 for (value = 0; value < 256; value++)
5125 if (!isPSXSPC(value))
5126 ANYOF_BITMAP_SET(ret, value);
5133 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5135 for (value = 0; value < 256; value++)
5137 ANYOF_BITMAP_SET(ret, value);
5144 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5146 for (value = 0; value < 256; value++)
5147 if (!isPUNCT(value))
5148 ANYOF_BITMAP_SET(ret, value);
5155 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5157 for (value = 0; value < 256; value++)
5159 ANYOF_BITMAP_SET(ret, value);
5166 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5168 for (value = 0; value < 256; value++)
5169 if (!isSPACE(value))
5170 ANYOF_BITMAP_SET(ret, value);
5177 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5179 for (value = 0; value < 256; value++)
5181 ANYOF_BITMAP_SET(ret, value);
5188 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5190 for (value = 0; value < 256; value++)
5191 if (!isUPPER(value))
5192 ANYOF_BITMAP_SET(ret, value);
5199 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5201 for (value = 0; value < 256; value++)
5202 if (isXDIGIT(value))
5203 ANYOF_BITMAP_SET(ret, value);
5210 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5212 for (value = 0; value < 256; value++)
5213 if (!isXDIGIT(value))
5214 ANYOF_BITMAP_SET(ret, value);
5220 /* this is to handle \p and \P */
5223 vFAIL("Invalid [::] class");
5227 /* Strings such as "+utf8::isWord\n" */
5228 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5231 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5234 } /* end of namedclass \blah */
5237 if (prevvalue > (IV)value) /* b-a */ {
5238 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5239 RExC_parse - rangebegin,
5240 RExC_parse - rangebegin,
5242 range = 0; /* not a valid range */
5246 prevvalue = value; /* save the beginning of the range */
5247 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5248 RExC_parse[1] != ']') {
5251 /* a bad range like \w-, [:word:]- ? */
5252 if (namedclass > OOB_NAMEDCLASS) {
5253 if (ckWARN(WARN_REGEXP))
5255 "False [] range \"%*.*s\"",
5256 RExC_parse - rangebegin,
5257 RExC_parse - rangebegin,
5260 ANYOF_BITMAP_SET(ret, '-');
5262 range = 1; /* yeah, it's a range! */
5263 continue; /* but do it the next time */
5267 /* now is the next time */
5271 if (prevvalue < 256) {
5272 const IV ceilvalue = value < 256 ? value : 255;
5275 /* In EBCDIC [\x89-\x91] should include
5276 * the \x8e but [i-j] should not. */
5277 if (literal_endpoint == 2 &&
5278 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5279 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5281 if (isLOWER(prevvalue)) {
5282 for (i = prevvalue; i <= ceilvalue; i++)
5284 ANYOF_BITMAP_SET(ret, i);
5286 for (i = prevvalue; i <= ceilvalue; i++)
5288 ANYOF_BITMAP_SET(ret, i);
5293 for (i = prevvalue; i <= ceilvalue; i++)
5294 ANYOF_BITMAP_SET(ret, i);
5296 if (value > 255 || UTF) {
5297 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5298 const UV natvalue = NATIVE_TO_UNI(value);
5300 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5301 if (prevnatvalue < natvalue) { /* what about > ? */
5302 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5303 prevnatvalue, natvalue);
5305 else if (prevnatvalue == natvalue) {
5306 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5308 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5310 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5312 /* If folding and foldable and a single
5313 * character, insert also the folded version
5314 * to the charclass. */
5316 if (foldlen == (STRLEN)UNISKIP(f))
5317 Perl_sv_catpvf(aTHX_ listsv,
5320 /* Any multicharacter foldings
5321 * require the following transform:
5322 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5323 * where E folds into "pq" and F folds
5324 * into "rst", all other characters
5325 * fold to single characters. We save
5326 * away these multicharacter foldings,
5327 * to be later saved as part of the
5328 * additional "s" data. */
5331 if (!unicode_alternate)
5332 unicode_alternate = newAV();
5333 sv = newSVpvn((char*)foldbuf, foldlen);
5335 av_push(unicode_alternate, sv);
5339 /* If folding and the value is one of the Greek
5340 * sigmas insert a few more sigmas to make the
5341 * folding rules of the sigmas to work right.
5342 * Note that not all the possible combinations
5343 * are handled here: some of them are handled
5344 * by the standard folding rules, and some of
5345 * them (literal or EXACTF cases) are handled
5346 * during runtime in regexec.c:S_find_byclass(). */
5347 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5348 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5349 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5350 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5351 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5353 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5354 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5355 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5360 literal_endpoint = 0;
5364 range = 0; /* this range (if it was one) is done now */
5368 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5370 RExC_size += ANYOF_CLASS_ADD_SKIP;
5372 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5375 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5377 /* If the only flag is folding (plus possibly inversion). */
5378 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5380 for (value = 0; value < 256; ++value) {
5381 if (ANYOF_BITMAP_TEST(ret, value)) {
5382 UV fold = PL_fold[value];
5385 ANYOF_BITMAP_SET(ret, fold);
5388 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5391 /* optimize inverted simple patterns (e.g. [^a-z]) */
5392 if (!SIZE_ONLY && optimize_invert &&
5393 /* If the only flag is inversion. */
5394 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5395 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5396 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5397 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5404 /* The 0th element stores the character class description
5405 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5406 * to initialize the appropriate swash (which gets stored in
5407 * the 1st element), and also useful for dumping the regnode.
5408 * The 2nd element stores the multicharacter foldings,
5409 * used later (regexec.c:S_reginclass()). */
5410 av_store(av, 0, listsv);
5411 av_store(av, 1, NULL);
5412 av_store(av, 2, (SV*)unicode_alternate);
5413 rv = newRV_noinc((SV*)av);
5414 n = add_data(pRExC_state, 1, "s");
5415 RExC_rx->data->data[n] = (void*)rv;
5423 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5425 char* retval = RExC_parse++;
5428 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5429 RExC_parse[2] == '#') {
5430 while (*RExC_parse != ')') {
5431 if (RExC_parse == RExC_end)
5432 FAIL("Sequence (?#... not terminated");
5438 if (RExC_flags & PMf_EXTENDED) {
5439 if (isSPACE(*RExC_parse)) {
5443 else if (*RExC_parse == '#') {
5444 while (RExC_parse < RExC_end)
5445 if (*RExC_parse++ == '\n') break;
5454 - reg_node - emit a node
5456 STATIC regnode * /* Location. */
5457 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5459 register regnode *ptr;
5460 regnode * const ret = RExC_emit;
5463 SIZE_ALIGN(RExC_size);
5468 NODE_ALIGN_FILL(ret);
5470 FILL_ADVANCE_NODE(ptr, op);
5471 if (RExC_offsets) { /* MJD */
5472 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5473 "reg_node", __LINE__,
5475 RExC_emit - RExC_emit_start > RExC_offsets[0]
5476 ? "Overwriting end of array!\n" : "OK",
5477 RExC_emit - RExC_emit_start,
5478 RExC_parse - RExC_start,
5480 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5489 - reganode - emit a node with an argument
5491 STATIC regnode * /* Location. */
5492 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5494 register regnode *ptr;
5495 regnode * const ret = RExC_emit;
5498 SIZE_ALIGN(RExC_size);
5503 NODE_ALIGN_FILL(ret);
5505 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5506 if (RExC_offsets) { /* MJD */
5507 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5511 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5512 "Overwriting end of array!\n" : "OK",
5513 RExC_emit - RExC_emit_start,
5514 RExC_parse - RExC_start,
5516 Set_Cur_Node_Offset;
5525 - reguni - emit (if appropriate) a Unicode character
5528 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5530 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5534 - reginsert - insert an operator in front of already-emitted operand
5536 * Means relocating the operand.
5539 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5541 register regnode *src;
5542 register regnode *dst;
5543 register regnode *place;
5544 const int offset = regarglen[(U8)op];
5546 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5549 RExC_size += NODE_STEP_REGNODE + offset;
5554 RExC_emit += NODE_STEP_REGNODE + offset;
5556 while (src > opnd) {
5557 StructCopy(--src, --dst, regnode);
5558 if (RExC_offsets) { /* MJD 20010112 */
5559 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5563 dst - RExC_emit_start > RExC_offsets[0]
5564 ? "Overwriting end of array!\n" : "OK",
5565 src - RExC_emit_start,
5566 dst - RExC_emit_start,
5568 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5569 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5574 place = opnd; /* Op node, where operand used to be. */
5575 if (RExC_offsets) { /* MJD */
5576 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5580 place - RExC_emit_start > RExC_offsets[0]
5581 ? "Overwriting end of array!\n" : "OK",
5582 place - RExC_emit_start,
5583 RExC_parse - RExC_start,
5585 Set_Node_Offset(place, RExC_parse);
5586 Set_Node_Length(place, 1);
5588 src = NEXTOPER(place);
5589 FILL_ADVANCE_NODE(place, op);
5590 Zero(src, offset, regnode);
5594 - regtail - set the next-pointer at the end of a node chain of p to val.
5597 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5599 register regnode *scan;
5604 /* Find last node. */
5607 regnode * const temp = regnext(scan);
5613 if (reg_off_by_arg[OP(scan)]) {
5614 ARG_SET(scan, val - scan);
5617 NEXT_OFF(scan) = val - scan;
5622 - regoptail - regtail on operand of first argument; nop if operandless
5625 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5627 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5628 if (p == NULL || SIZE_ONLY)
5630 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5631 regtail(pRExC_state, NEXTOPER(p), val);
5633 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5634 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5641 - regcurly - a little FSA that accepts {\d+,?\d*}
5644 S_regcurly(pTHX_ register const char *s)
5663 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5666 Perl_regdump(pTHX_ regexp *r)
5669 SV *sv = sv_newmortal();
5671 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5673 /* Header fields of interest. */
5674 if (r->anchored_substr)
5675 PerlIO_printf(Perl_debug_log,
5676 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5678 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5679 SvPVX_const(r->anchored_substr),
5681 SvTAIL(r->anchored_substr) ? "$" : "",
5682 (IV)r->anchored_offset);
5683 else if (r->anchored_utf8)
5684 PerlIO_printf(Perl_debug_log,
5685 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5687 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5688 SvPVX_const(r->anchored_utf8),
5690 SvTAIL(r->anchored_utf8) ? "$" : "",
5691 (IV)r->anchored_offset);
5692 if (r->float_substr)
5693 PerlIO_printf(Perl_debug_log,
5694 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5696 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5697 SvPVX_const(r->float_substr),
5699 SvTAIL(r->float_substr) ? "$" : "",
5700 (IV)r->float_min_offset, (UV)r->float_max_offset);
5701 else if (r->float_utf8)
5702 PerlIO_printf(Perl_debug_log,
5703 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5705 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5706 SvPVX_const(r->float_utf8),
5708 SvTAIL(r->float_utf8) ? "$" : "",
5709 (IV)r->float_min_offset, (UV)r->float_max_offset);
5710 if (r->check_substr || r->check_utf8)
5711 PerlIO_printf(Perl_debug_log,
5712 r->check_substr == r->float_substr
5713 && r->check_utf8 == r->float_utf8
5714 ? "(checking floating" : "(checking anchored");
5715 if (r->reganch & ROPT_NOSCAN)
5716 PerlIO_printf(Perl_debug_log, " noscan");
5717 if (r->reganch & ROPT_CHECK_ALL)
5718 PerlIO_printf(Perl_debug_log, " isall");
5719 if (r->check_substr || r->check_utf8)
5720 PerlIO_printf(Perl_debug_log, ") ");
5722 if (r->regstclass) {
5723 regprop(sv, r->regstclass);
5724 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5726 if (r->reganch & ROPT_ANCH) {
5727 PerlIO_printf(Perl_debug_log, "anchored");
5728 if (r->reganch & ROPT_ANCH_BOL)
5729 PerlIO_printf(Perl_debug_log, "(BOL)");
5730 if (r->reganch & ROPT_ANCH_MBOL)
5731 PerlIO_printf(Perl_debug_log, "(MBOL)");
5732 if (r->reganch & ROPT_ANCH_SBOL)
5733 PerlIO_printf(Perl_debug_log, "(SBOL)");
5734 if (r->reganch & ROPT_ANCH_GPOS)
5735 PerlIO_printf(Perl_debug_log, "(GPOS)");
5736 PerlIO_putc(Perl_debug_log, ' ');
5738 if (r->reganch & ROPT_GPOS_SEEN)
5739 PerlIO_printf(Perl_debug_log, "GPOS ");
5740 if (r->reganch & ROPT_SKIP)
5741 PerlIO_printf(Perl_debug_log, "plus ");
5742 if (r->reganch & ROPT_IMPLICIT)
5743 PerlIO_printf(Perl_debug_log, "implicit ");
5744 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5745 if (r->reganch & ROPT_EVAL_SEEN)
5746 PerlIO_printf(Perl_debug_log, "with eval ");
5747 PerlIO_printf(Perl_debug_log, "\n");
5749 const U32 len = r->offsets[0];
5750 GET_RE_DEBUG_FLAGS_DECL;
5753 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5754 for (i = 1; i <= len; i++)
5755 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5756 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5757 PerlIO_printf(Perl_debug_log, "\n");
5760 #endif /* DEBUGGING */
5764 - regprop - printable representation of opcode
5767 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5772 sv_setpvn(sv, "", 0);
5773 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5774 /* It would be nice to FAIL() here, but this may be called from
5775 regexec.c, and it would be hard to supply pRExC_state. */
5776 Perl_croak(aTHX_ "Corrupted regexp opcode");
5777 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5779 k = PL_regkind[(U8)OP(o)];
5782 SV * const dsv = sv_2mortal(newSVpvn("", 0));
5783 /* Using is_utf8_string() is a crude hack but it may
5784 * be the best for now since we have no flag "this EXACTish
5785 * node was UTF-8" --jhi */
5786 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5787 const char * const s = do_utf8 ?
5788 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5789 UNI_DISPLAY_REGEX) :
5791 const int len = do_utf8 ?
5794 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5798 } else if (k == TRIE) {/*
5799 this isn't always safe, as Pl_regdata may not be for this regex yet
5800 (depending on where its called from) so its being moved to dumpuntil
5802 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5803 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5806 trie->uniquecharcount,
5809 } else if (k == CURLY) {
5810 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5811 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5812 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5814 else if (k == WHILEM && o->flags) /* Ordinal/of */
5815 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5816 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5817 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5818 else if (k == LOGICAL)
5819 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5820 else if (k == ANYOF) {
5821 int i, rangestart = -1;
5822 const U8 flags = ANYOF_FLAGS(o);
5823 const char * const anyofs[] = { /* Should be synchronized with
5824 * ANYOF_ #xdefines in regcomp.h */
5857 if (flags & ANYOF_LOCALE)
5858 sv_catpv(sv, "{loc}");
5859 if (flags & ANYOF_FOLD)
5860 sv_catpv(sv, "{i}");
5861 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5862 if (flags & ANYOF_INVERT)
5864 for (i = 0; i <= 256; i++) {
5865 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5866 if (rangestart == -1)
5868 } else if (rangestart != -1) {
5869 if (i <= rangestart + 3)
5870 for (; rangestart < i; rangestart++)
5871 put_byte(sv, rangestart);
5873 put_byte(sv, rangestart);
5875 put_byte(sv, i - 1);
5881 if (o->flags & ANYOF_CLASS)
5882 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5883 if (ANYOF_CLASS_TEST(o,i))
5884 sv_catpv(sv, anyofs[i]);
5886 if (flags & ANYOF_UNICODE)
5887 sv_catpv(sv, "{unicode}");
5888 else if (flags & ANYOF_UNICODE_ALL)
5889 sv_catpv(sv, "{unicode_all}");
5893 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
5897 U8 s[UTF8_MAXBYTES_CASE+1];
5899 for (i = 0; i <= 256; i++) { /* just the first 256 */
5900 uvchr_to_utf8(s, i);
5902 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5903 if (rangestart == -1)
5905 } else if (rangestart != -1) {
5906 if (i <= rangestart + 3)
5907 for (; rangestart < i; rangestart++) {
5908 const U8 * const e = uvchr_to_utf8(s,rangestart);
5910 for(p = s; p < e; p++)
5914 const U8 *e = uvchr_to_utf8(s,rangestart);
5916 for (p = s; p < e; p++)
5918 sv_catpvn(sv, "-", 1);
5919 e = uvchr_to_utf8(s, i-1);
5920 for (p = s; p < e; p++)
5927 sv_catpv(sv, "..."); /* et cetera */
5931 char *s = savesvpv(lv);
5934 while(*s && *s != '\n') s++;
5937 const char * const t = ++s;
5955 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5957 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5958 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5959 #endif /* DEBUGGING */
5963 Perl_re_intuit_string(pTHX_ regexp *prog)
5964 { /* Assume that RE_INTUIT is set */
5965 GET_RE_DEBUG_FLAGS_DECL;
5968 const char * const s = SvPV_nolen_const(prog->check_substr
5969 ? prog->check_substr : prog->check_utf8);
5971 if (!PL_colorset) reginitcolors();
5972 PerlIO_printf(Perl_debug_log,
5973 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5975 prog->check_substr ? "" : "utf8 ",
5976 PL_colors[5],PL_colors[0],
5979 (strlen(s) > 60 ? "..." : ""));
5982 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5986 Perl_pregfree(pTHX_ struct regexp *r)
5990 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
5991 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
5995 if (!r || (--r->refcnt > 0))
5997 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
5998 const char *s = (r->reganch & ROPT_UTF8)
5999 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6000 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6001 const int len = SvCUR(dsv);
6004 PerlIO_printf(Perl_debug_log,
6005 "%sFreeing REx:%s %s%*.*s%s%s\n",
6006 PL_colors[4],PL_colors[5],PL_colors[0],
6009 len > 60 ? "..." : "");
6012 /* gcov results gave these as non-null 100% of the time, so there's no
6013 optimisation in checking them before calling Safefree */
6014 Safefree(r->precomp);
6015 Safefree(r->offsets); /* 20010421 MJD */
6016 RX_MATCH_COPY_FREE(r);
6017 #ifdef PERL_OLD_COPY_ON_WRITE
6019 SvREFCNT_dec(r->saved_copy);
6022 if (r->anchored_substr)
6023 SvREFCNT_dec(r->anchored_substr);
6024 if (r->anchored_utf8)
6025 SvREFCNT_dec(r->anchored_utf8);
6026 if (r->float_substr)
6027 SvREFCNT_dec(r->float_substr);
6029 SvREFCNT_dec(r->float_utf8);
6030 Safefree(r->substrs);
6033 int n = r->data->count;
6034 PAD* new_comppad = NULL;
6039 /* If you add a ->what type here, update the comment in regcomp.h */
6040 switch (r->data->what[n]) {
6042 SvREFCNT_dec((SV*)r->data->data[n]);
6045 Safefree(r->data->data[n]);
6048 new_comppad = (AV*)r->data->data[n];
6051 if (new_comppad == NULL)
6052 Perl_croak(aTHX_ "panic: pregfree comppad");
6053 PAD_SAVE_LOCAL(old_comppad,
6054 /* Watch out for global destruction's random ordering. */
6055 (SvTYPE(new_comppad) == SVt_PVAV) ?
6056 new_comppad : Null(PAD *)
6059 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6062 op_free((OP_4tree*)r->data->data[n]);
6064 PAD_RESTORE_LOCAL(old_comppad);
6065 SvREFCNT_dec((SV*)new_comppad);
6072 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6075 refcount = trie->refcount--;
6078 Safefree(trie->charmap);
6079 if (trie->widecharmap)
6080 SvREFCNT_dec((SV*)trie->widecharmap);
6081 Safefree(trie->states);
6082 Safefree(trie->trans);
6085 SvREFCNT_dec((SV*)trie->words);
6086 if (trie->revcharmap)
6087 SvREFCNT_dec((SV*)trie->revcharmap);
6089 Safefree(r->data->data[n]); /* do this last!!!! */
6094 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6097 Safefree(r->data->what);
6100 Safefree(r->startp);
6106 - regnext - dig the "next" pointer out of a node
6109 Perl_regnext(pTHX_ register regnode *p)
6111 register I32 offset;
6113 if (p == &PL_regdummy)
6116 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6124 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6127 STRLEN l1 = strlen(pat1);
6128 STRLEN l2 = strlen(pat2);
6131 const char *message;
6137 Copy(pat1, buf, l1 , char);
6138 Copy(pat2, buf + l1, l2 , char);
6139 buf[l1 + l2] = '\n';
6140 buf[l1 + l2 + 1] = '\0';
6142 /* ANSI variant takes additional second argument */
6143 va_start(args, pat2);
6147 msv = vmess(buf, &args);
6149 message = SvPV_const(msv,l1);
6152 Copy(message, buf, l1 , char);
6153 buf[l1-1] = '\0'; /* Overwrite \n */
6154 Perl_croak(aTHX_ "%s", buf);
6157 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6160 Perl_save_re_context(pTHX)
6162 SAVEI32(PL_reg_flags); /* from regexec.c */
6164 SAVEPPTR(PL_reginput); /* String-input pointer. */
6165 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6166 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6167 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6168 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6169 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6170 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6171 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6172 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6173 PL_reg_start_tmp = 0;
6174 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6175 PL_reg_start_tmpl = 0;
6176 SAVEVPTR(PL_regdata);
6177 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6178 SAVEI32(PL_regnarrate); /* from regexec.c */
6179 SAVEVPTR(PL_regprogram); /* from regexec.c */
6180 SAVEINT(PL_regindent); /* from regexec.c */
6181 SAVEVPTR(PL_regcc); /* from regexec.c */
6182 SAVEVPTR(PL_curcop);
6183 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6184 SAVEVPTR(PL_reg_re); /* from regexec.c */
6185 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6186 SAVESPTR(PL_reg_sv); /* from regexec.c */
6187 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6188 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6189 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6190 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6191 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6192 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6193 PL_reg_oldsaved = Nullch;
6194 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6195 PL_reg_oldsavedlen = 0;
6196 #ifdef PERL_OLD_COPY_ON_WRITE
6200 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6202 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6203 PL_reg_leftiter = 0;
6204 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6205 PL_reg_poscache = Nullch;
6206 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6207 PL_reg_poscache_size = 0;
6208 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6209 SAVEI32(PL_regnpar); /* () count. */
6210 SAVEI32(PL_regsize); /* from regexec.c */
6213 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6216 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6218 for (i = 1; i <= rx->nparens; i++) {
6220 char digits[TYPE_CHARS(long)];
6221 sprintf(digits, "%lu", (long)i);
6222 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6229 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6234 clear_re(pTHX_ void *r)
6236 ReREFCNT_dec((regexp *)r);
6242 S_put_byte(pTHX_ SV *sv, int c)
6244 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6245 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6246 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6247 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6249 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6254 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6256 register U8 op = EXACT; /* Arbitrary non-END op. */
6257 register regnode *next;
6259 while (op != END && (!last || node < last)) {
6260 /* While that wasn't END last time... */
6266 next = regnext(node);
6268 if (OP(node) == OPTIMIZED)
6271 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6272 (int)(2*l + 1), "", SvPVX_const(sv));
6273 if (next == NULL) /* Next ptr. */
6274 PerlIO_printf(Perl_debug_log, "(0)");
6276 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6277 (void)PerlIO_putc(Perl_debug_log, '\n');
6279 if (PL_regkind[(U8)op] == BRANCHJ) {
6280 register regnode *nnode = (OP(next) == LONGJMP
6283 if (last && nnode > last)
6285 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6287 else if (PL_regkind[(U8)op] == BRANCH) {
6288 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6290 else if ( PL_regkind[(U8)op] == TRIE ) {
6291 const I32 n = ARG(node);
6292 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6293 const I32 arry_len = av_len(trie->words)+1;
6295 PerlIO_printf(Perl_debug_log,
6296 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6300 (int)trie->charcount,
6301 trie->uniquecharcount,
6302 (IV)trie->laststate-1,
6303 node->flags ? " EVAL mode" : "");
6305 for (word_idx=0; word_idx < arry_len; word_idx++) {
6306 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6308 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6311 SvPV_nolen_const(*elem_ptr),
6316 PerlIO_printf(Perl_debug_log, "(0)\n");
6318 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6324 node = NEXTOPER(node);
6325 node += regarglen[(U8)op];
6328 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6329 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6330 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6332 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6333 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6336 else if ( op == PLUS || op == STAR) {
6337 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6339 else if (op == ANYOF) {
6340 /* arglen 1 + class block */
6341 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6342 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6343 node = NEXTOPER(node);
6345 else if (PL_regkind[(U8)op] == EXACT) {
6346 /* Literal string, where present. */
6347 node += NODE_SZ_STR(node) - 1;
6348 node = NEXTOPER(node);
6351 node = NEXTOPER(node);
6352 node += regarglen[(U8)op];
6354 if (op == CURLYX || op == OPEN)
6356 else if (op == WHILEM)
6362 #endif /* DEBUGGING */
6366 * c-indentation-style: bsd
6368 * indent-tabs-mode: t
6371 * ex: set ts=8 sts=4 sw=4 noet: