5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
289 #define FAIL2(pat,msg) STMT_START { \
290 const char *ellipses = ""; \
291 IV len = RExC_end - RExC_precomp; \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 #define Simple_vFAIL(m) STMT_START { \
309 const IV offset = RExC_parse - RExC_precomp; \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 #define vFAIL(m) STMT_START { \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
324 * Like Simple_vFAIL(), but accepts two arguments.
326 #define Simple_vFAIL2(m,a1) STMT_START { \
327 const IV offset = RExC_parse - RExC_precomp; \
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 #define vFAIL2(m,a1) STMT_START { \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
346 const IV offset = RExC_parse - RExC_precomp; \
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 #define vFAIL3(m,a1,a2) STMT_START { \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
361 * Like Simple_vFAIL(), but accepts four arguments.
363 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
364 const IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 const IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 const IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 const IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 const IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 const IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 const IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 SvSetMagicSV(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 SV * const sv = data->last_found;
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
506 data->flags &= ~SF_BEFORE_EOL;
509 /* Can match anything (initialization) */
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 cl->flags |= ANYOF_LOCALE;
520 /* Can match anything (initialization) */
522 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
536 /* Can match anything (initialization) */
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 Zero(cl, 1, struct regnode_charclass_class);
542 cl_anything(pRExC_state, cl);
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 Zero(cl, 1, struct regnode_charclass_class);
550 cl_anything(pRExC_state, cl);
552 cl->flags |= ANYOF_LOCALE;
555 /* 'And' a given class with another one. Can create false positives */
556 /* We assume that cl is not inverted */
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 const struct regnode_charclass_class *and_with)
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
589 cl->flags &= ~ANYOF_UNICODE;
592 /* 'OR' a given class with another one. Can create false positives */
593 /* We assume that cl is not inverted */
595 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
597 if (or_with->flags & ANYOF_INVERT) {
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
616 cl_anything(pRExC_state, cl);
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && (!(or_with->flags & ANYOF_FOLD)
622 || (cl->flags & ANYOF_FOLD)) ) {
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
635 cl_anything(pRExC_state, cl);
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
663 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
665 A trie is an N'ary tree where the branches are determined by digital
666 decomposition of the key. IE, at the root node you look up the 1st character and
667 follow that branch repeat until you find the end of the branches. Nodes can be
668 marked as "accepting" meaning they represent a complete word. Eg:
672 would convert into the following structure. Numbers represent states, letters
673 following numbers represent valid transitions on the letter from that state, if
674 the number is in square brackets it represents an accepting state, otherwise it
675 will be in parenthesis.
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
681 (1) +-i->(6)-+-s->[7]
683 +-s->(3)-+-h->(4)-+-e->[5]
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
687 This shows that when matching against the string 'hers' we will begin at state 1
688 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691 single traverse. We store a mapping from accepting to state to which word was
692 matched, and then when we have multiple possibilities we try to complete the
693 rest of the regex in the order in which they occured in the alternation.
695 The only prior NFA like behaviour that would be changed by the TRIE support is
696 the silent ignoring of duplicate alternations which are of the form:
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
700 Thus EVAL blocks follwing a trie may be called a different number of times with
701 and without the optimisation. With the optimisations dupes will be silently
702 ignored. This inconsistant behaviour of EVAL type nodes is well established as
703 the following demonstrates:
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
707 which prints out 'word' three times, but
709 'words'=~/(word|word|word)(?{ print $1 })S/
711 which doesnt print it out at all. This is due to other optimisations kicking in.
713 Example of what happens on a structural level:
715 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
717 1: CURLYM[1] {1,32767}(18)
728 This would be optimizable with startbranch=5, first=5, last=16, tail=16
729 and should turn into:
731 1: CURLYM[1] {1,32767}(18)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
741 Cases where tail != last would be like /(?foo|bar)baz/:
751 which would be optimizable with startbranch=1, first=1, last=7, tail=8
752 and would end up looking like:
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
764 #define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
768 tmp = newSVpv( "", 0 ); \
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
773 av_push( trie->revcharmap, tmp ); \
776 #define TRIE_READ_CHAR STMT_START { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
800 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
805 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
816 #define TRIE_LIST_NEW(state) STMT_START { \
817 Newz( 1023, trie->states[ state ].trans.list, \
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
824 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
827 /* first pass, loop through and scan words */
830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
835 /* we just use folder as a flag in utf8 */
836 const U8 * const folder = ( flags == EXACTF
844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
847 GET_RE_DEBUG_FLAGS_DECL;
849 Newz( 848200, trie, 1, reg_trie_data );
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
852 Newz( 848201, trie->charmap, 256, U16 );
854 trie->words = newAV();
855 trie->revcharmap = newAV();
859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
860 if (!SvIOK(re_trie_maxbuff)) {
861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
864 /* -- First loop and Setup --
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
889 regnode *noper = NEXTOPER( cur );
890 const U8 *uc = (U8*)STRING( noper );
891 const U8 * const e = uc + STR_LEN( noper );
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
894 const U8 *scan = (U8*)NULL;
896 for ( ; uc < e ; uc += len ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
927 (int)trie->charcount, trie->uniquecharcount )
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
936 conservative but potentially much slower representation using an array
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
955 Second Pass -- Array Of Lists Representation
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
969 STRLEN transcount = 1;
971 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
977 regnode *noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
979 const U8 * const e = uc + STR_LEN( noper );
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
987 for ( ; uc < e ; uc += len ) {
992 charid = trie->charmap[ uvc ];
994 SV** svpp=(SV**)NULL;
995 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
999 charid=(U16)SvIV( *svpp );
1008 if ( !trie->states[ state ].trans.list ) {
1009 TRIE_LIST_NEW( state );
1011 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1012 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1013 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1018 newstate = next_alloc++;
1019 TRIE_LIST_PUSH( state, charid, newstate );
1024 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1026 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1029 if ( !trie->states[ state ].wordnum ) {
1030 /* we havent inserted this word into the structure yet. */
1031 trie->states[ state ].wordnum = ++curword;
1034 /* store the word for dumping */
1035 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1036 if ( UTF ) SvUTF8_on( tmp );
1037 av_push( trie->words, tmp );
1041 /* Its a dupe. So ignore it. */
1044 } /* end second pass */
1046 trie->laststate = next_alloc;
1047 Renew( trie->states, next_alloc, reg_trie_state );
1049 DEBUG_TRIE_COMPILE_MORE_r({
1052 /* print out the table precompression. */
1054 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1055 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1057 for( state=1 ; state < next_alloc ; state ++ ) {
1060 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
1061 if ( ! trie->states[ state ].wordnum ) {
1062 PerlIO_printf( Perl_debug_log, "%5s| ","");
1064 PerlIO_printf( Perl_debug_log, "W%04x| ",
1065 trie->states[ state ].wordnum
1068 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1069 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1070 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
1071 SvPV_nolen_const( *tmp ),
1072 TRIE_LIST_ITEM(state,charid).forid,
1073 (UV)TRIE_LIST_ITEM(state,charid).newstate
1078 PerlIO_printf( Perl_debug_log, "\n\n" );
1081 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1088 for( state=1 ; state < next_alloc ; state ++ ) {
1092 DEBUG_TRIE_COMPILE_MORE_r(
1093 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1097 if (trie->states[state].trans.list) {
1098 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1102 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1103 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1104 minid=TRIE_LIST_ITEM( state, idx).forid;
1105 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1106 maxid=TRIE_LIST_ITEM( state, idx).forid;
1109 if ( transcount < tp + maxid - minid + 1) {
1111 Renew( trie->trans, transcount, reg_trie_trans );
1112 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1114 base = trie->uniquecharcount + tp - minid;
1115 if ( maxid == minid ) {
1117 for ( ; zp < tp ; zp++ ) {
1118 if ( ! trie->trans[ zp ].next ) {
1119 base = trie->uniquecharcount + zp - minid;
1120 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1121 trie->trans[ zp ].check = state;
1127 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1128 trie->trans[ tp ].check = state;
1133 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1134 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1135 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1136 trie->trans[ tid ].check = state;
1138 tp += ( maxid - minid + 1 );
1140 Safefree(trie->states[ state ].trans.list);
1143 DEBUG_TRIE_COMPILE_MORE_r(
1144 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1147 trie->states[ state ].trans.base=base;
1149 trie->lasttrans = tp + 1;
1153 Second Pass -- Flat Table Representation.
1155 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1156 We know that we will need Charcount+1 trans at most to store the data
1157 (one row per char at worst case) So we preallocate both structures
1158 assuming worst case.
1160 We then construct the trie using only the .next slots of the entry
1163 We use the .check field of the first entry of the node temporarily to
1164 make compression both faster and easier by keeping track of how many non
1165 zero fields are in the node.
1167 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1170 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1171 number representing the first entry of the node, and state as a
1172 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1173 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1174 are 2 entrys per node. eg:
1182 The table is internally in the right hand, idx form. However as we also
1183 have to deal with the states array which is indexed by nodenum we have to
1184 use TRIE_NODENUM() to convert.
1188 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1190 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1191 next_alloc = trie->uniquecharcount + 1;
1193 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1195 regnode *noper = NEXTOPER( cur );
1196 const U8 *uc = (U8*)STRING( noper );
1197 const U8 * const e = uc + STR_LEN( noper );
1199 U32 state = 1; /* required init */
1201 U16 charid = 0; /* sanity init */
1202 U32 accept_state = 0; /* sanity init */
1203 U8 *scan = (U8*)NULL; /* sanity init */
1205 STRLEN foldlen = 0; /* required init */
1206 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1209 for ( ; uc < e ; uc += len ) {
1214 charid = trie->charmap[ uvc ];
1216 SV** svpp=(SV**)NULL;
1217 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1221 charid=(U16)SvIV( *svpp );
1226 if ( !trie->trans[ state + charid ].next ) {
1227 trie->trans[ state + charid ].next = next_alloc;
1228 trie->trans[ state ].check++;
1229 next_alloc += trie->uniquecharcount;
1231 state = trie->trans[ state + charid ].next;
1233 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1235 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1238 accept_state = TRIE_NODENUM( state );
1239 if ( !trie->states[ accept_state ].wordnum ) {
1240 /* we havent inserted this word into the structure yet. */
1241 trie->states[ accept_state ].wordnum = ++curword;
1244 /* store the word for dumping */
1245 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1246 if ( UTF ) SvUTF8_on( tmp );
1247 av_push( trie->words, tmp );
1251 /* Its a dupe. So ignore it. */
1254 } /* end second pass */
1256 DEBUG_TRIE_COMPILE_MORE_r({
1258 print out the table precompression so that we can do a visual check
1259 that they are identical.
1263 PerlIO_printf( Perl_debug_log, "\nChar : " );
1265 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1266 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1268 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1272 PerlIO_printf( Perl_debug_log, "\nState+-" );
1274 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1275 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1278 PerlIO_printf( Perl_debug_log, "\n" );
1280 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1282 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
1284 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1285 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1286 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1288 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1289 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
1291 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
1292 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1295 PerlIO_printf( Perl_debug_log, "\n\n" );
1299 * Inplace compress the table.*
1301 For sparse data sets the table constructed by the trie algorithm will
1302 be mostly 0/FAIL transitions or to put it another way mostly empty.
1303 (Note that leaf nodes will not contain any transitions.)
1305 This algorithm compresses the tables by eliminating most such
1306 transitions, at the cost of a modest bit of extra work during lookup:
1308 - Each states[] entry contains a .base field which indicates the
1309 index in the state[] array wheres its transition data is stored.
1311 - If .base is 0 there are no valid transitions from that node.
1313 - If .base is nonzero then charid is added to it to find an entry in
1316 -If trans[states[state].base+charid].check!=state then the
1317 transition is taken to be a 0/Fail transition. Thus if there are fail
1318 transitions at the front of the node then the .base offset will point
1319 somewhere inside the previous nodes data (or maybe even into a node
1320 even earlier), but the .check field determines if the transition is
1323 The following process inplace converts the table to the compressed
1324 table: We first do not compress the root node 1,and mark its all its
1325 .check pointers as 1 and set its .base pointer as 1 as well. This
1326 allows to do a DFA construction from the compressed table later, and
1327 ensures that any .base pointers we calculate later are greater than
1330 - We set 'pos' to indicate the first entry of the second node.
1332 - We then iterate over the columns of the node, finding the first and
1333 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1334 and set the .check pointers accordingly, and advance pos
1335 appropriately and repreat for the next node. Note that when we copy
1336 the next pointers we have to convert them from the original
1337 NODEIDX form to NODENUM form as the former is not valid post
1340 - If a node has no transitions used we mark its base as 0 and do not
1341 advance the pos pointer.
1343 - If a node only has one transition we use a second pointer into the
1344 structure to fill in allocated fail transitions from other states.
1345 This pointer is independent of the main pointer and scans forward
1346 looking for null transitions that are allocated to a state. When it
1347 finds one it writes the single transition into the "hole". If the
1348 pointer doesnt find one the single transition is appeneded as normal.
1350 - Once compressed we can Renew/realloc the structures to release the
1353 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1354 specifically Fig 3.47 and the associated pseudocode.
1358 const U32 laststate = TRIE_NODENUM( next_alloc );
1361 trie->laststate = laststate;
1363 for ( state = 1 ; state < laststate ; state++ ) {
1365 const U32 stateidx = TRIE_NODEIDX( state );
1366 const U32 o_used = trie->trans[ stateidx ].check;
1367 U32 used = trie->trans[ stateidx ].check;
1368 trie->trans[ stateidx ].check = 0;
1370 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1371 if ( flag || trie->trans[ stateidx + charid ].next ) {
1372 if ( trie->trans[ stateidx + charid ].next ) {
1374 for ( ; zp < pos ; zp++ ) {
1375 if ( ! trie->trans[ zp ].next ) {
1379 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1380 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1381 trie->trans[ zp ].check = state;
1382 if ( ++zp > pos ) pos = zp;
1389 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1391 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1392 trie->trans[ pos ].check = state;
1397 trie->lasttrans = pos + 1;
1398 Renew( trie->states, laststate + 1, reg_trie_state);
1399 DEBUG_TRIE_COMPILE_MORE_r(
1400 PerlIO_printf( Perl_debug_log,
1401 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1402 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1405 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1408 } /* end table compress */
1410 /* resize the trans array to remove unused space */
1411 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1413 DEBUG_TRIE_COMPILE_r({
1416 Now we print it out again, in a slightly different form as there is additional
1417 info we want to be able to see when its compressed. They are close enough for
1418 visual comparison though.
1420 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1422 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1423 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1425 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
1428 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1430 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1431 PerlIO_printf( Perl_debug_log, "-----");
1432 PerlIO_printf( Perl_debug_log, "\n");
1434 for( state = 1 ; state < trie->laststate ; state++ ) {
1435 const U32 base = trie->states[ state ].trans.base;
1437 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
1439 if ( trie->states[ state ].wordnum ) {
1440 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1442 PerlIO_printf( Perl_debug_log, "%6s", "" );
1445 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
1450 while( ( base + ofs < trie->uniquecharcount ) ||
1451 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1452 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1455 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
1457 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1458 if ( ( base + ofs >= trie->uniquecharcount ) &&
1459 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1460 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1462 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1463 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1465 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1469 PerlIO_printf( Perl_debug_log, "]");
1472 PerlIO_printf( Perl_debug_log, "\n" );
1477 /* now finally we "stitch in" the new TRIE node
1478 This means we convert either the first branch or the first Exact,
1479 depending on whether the thing following (in 'last') is a branch
1480 or not and whther first is the startbranch (ie is it a sub part of
1481 the alternation or is it the whole thing.)
1482 Assuming its a sub part we conver the EXACT otherwise we convert
1483 the whole branch sequence, including the first.
1490 if ( first == startbranch && OP( last ) != BRANCH ) {
1493 convert = NEXTOPER( first );
1494 NEXT_OFF( first ) = (U16)(last - first);
1497 OP( convert ) = TRIE + (U8)( flags - EXACT );
1498 NEXT_OFF( convert ) = (U16)(tail - convert);
1499 ARG_SET( convert, data_slot );
1501 /* tells us if we need to handle accept buffers specially */
1502 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1505 /* needed for dumping*/
1507 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1508 /* We now need to mark all of the space originally used by the
1509 branches as optimized away. This keeps the dumpuntil from
1510 throwing a wobbly as it doesnt use regnext() to traverse the
1513 while( optimize < last ) {
1514 OP( optimize ) = OPTIMIZED;
1518 } /* end node insert */
1525 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1526 * These need to be revisited when a newer toolchain becomes available.
1528 #if defined(__sparc64__) && defined(__GNUC__)
1529 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1530 # undef SPARC64_GCC_WORKAROUND
1531 # define SPARC64_GCC_WORKAROUND 1
1535 /* REx optimizer. Converts nodes into quickier variants "in place".
1536 Finds fixed substrings. */
1538 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1539 to the position after last scanned or to NULL. */
1543 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
1544 /* scanp: Start here (read-write). */
1545 /* deltap: Write maxlen-minlen here. */
1546 /* last: Stop before this one. */
1548 I32 min = 0, pars = 0, code;
1549 regnode *scan = *scanp, *next;
1551 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1552 int is_inf_internal = 0; /* The studied chunk is infinite */
1553 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1554 scan_data_t data_fake;
1555 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1556 SV *re_trie_maxbuff = NULL;
1558 GET_RE_DEBUG_FLAGS_DECL;
1560 while (scan && OP(scan) != END && scan < last) {
1561 /* Peephole optimizer: */
1563 SV *mysv=sv_newmortal();
1564 regprop( mysv, scan);
1565 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1566 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
1569 if (PL_regkind[(U8)OP(scan)] == EXACT) {
1570 /* Merge several consecutive EXACTish nodes into one. */
1571 regnode *n = regnext(scan);
1574 regnode *stop = scan;
1577 next = scan + NODE_SZ_STR(scan);
1578 /* Skip NOTHING, merge EXACT*. */
1580 ( PL_regkind[(U8)OP(n)] == NOTHING ||
1581 (stringok && (OP(n) == OP(scan))))
1583 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1584 if (OP(n) == TAIL || n > next)
1586 if (PL_regkind[(U8)OP(n)] == NOTHING) {
1587 NEXT_OFF(scan) += NEXT_OFF(n);
1588 next = n + NODE_STEP_REGNODE;
1595 else if (stringok) {
1596 const int oldl = STR_LEN(scan);
1597 regnode *nnext = regnext(n);
1599 if (oldl + STR_LEN(n) > U8_MAX)
1601 NEXT_OFF(scan) += NEXT_OFF(n);
1602 STR_LEN(scan) += STR_LEN(n);
1603 next = n + NODE_SZ_STR(n);
1604 /* Now we can overwrite *n : */
1605 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1613 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1615 Two problematic code points in Unicode casefolding of EXACT nodes:
1617 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1618 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1624 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1625 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1627 This means that in case-insensitive matching (or "loose matching",
1628 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1629 length of the above casefolded versions) can match a target string
1630 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1631 This would rather mess up the minimum length computation.
1633 What we'll do is to look for the tail four bytes, and then peek
1634 at the preceding two bytes to see whether we need to decrease
1635 the minimum length by four (six minus two).
1637 Thanks to the design of UTF-8, there cannot be false matches:
1638 A sequence of valid UTF-8 bytes cannot be a subsequence of
1639 another valid sequence of UTF-8 bytes.
1642 char *s0 = STRING(scan), *s, *t;
1643 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1644 const char * const t0 = "\xcc\x88\xcc\x81";
1645 const char * const t1 = t0 + 3;
1648 s < s2 && (t = ninstr(s, s1, t0, t1));
1650 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1651 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1658 n = scan + NODE_SZ_STR(scan);
1660 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
1671 /* Follow the next-chain of the current node and optimize
1672 away all the NOTHINGs from it. */
1673 if (OP(scan) != CURLYX) {
1674 const int max = (reg_off_by_arg[OP(scan)]
1676 /* I32 may be smaller than U16 on CRAYs! */
1677 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1678 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1682 /* Skip NOTHING and LONGJMP. */
1683 while ((n = regnext(n))
1684 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1685 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1686 && off + noff < max)
1688 if (reg_off_by_arg[OP(scan)])
1691 NEXT_OFF(scan) = off;
1694 /* The principal pseudo-switch. Cannot be a switch, since we
1695 look into several different things. */
1696 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1697 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1698 next = regnext(scan);
1700 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1702 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1703 I32 max1 = 0, min1 = I32_MAX, num = 0;
1704 struct regnode_charclass_class accum;
1705 regnode *startbranch=scan;
1707 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1708 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1709 if (flags & SCF_DO_STCLASS)
1710 cl_init_zero(pRExC_state, &accum);
1712 while (OP(scan) == code) {
1713 I32 deltanext, minnext, f = 0, fake;
1714 struct regnode_charclass_class this_class;
1717 data_fake.flags = 0;
1719 data_fake.whilem_c = data->whilem_c;
1720 data_fake.last_closep = data->last_closep;
1723 data_fake.last_closep = &fake;
1724 next = regnext(scan);
1725 scan = NEXTOPER(scan);
1727 scan = NEXTOPER(scan);
1728 if (flags & SCF_DO_STCLASS) {
1729 cl_init(pRExC_state, &this_class);
1730 data_fake.start_class = &this_class;
1731 f = SCF_DO_STCLASS_AND;
1733 if (flags & SCF_WHILEM_VISITED_POS)
1734 f |= SCF_WHILEM_VISITED_POS;
1736 /* we suppose the run is continuous, last=next...*/
1737 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1738 next, &data_fake, f,depth+1);
1741 if (max1 < minnext + deltanext)
1742 max1 = minnext + deltanext;
1743 if (deltanext == I32_MAX)
1744 is_inf = is_inf_internal = 1;
1746 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1748 if (data && (data_fake.flags & SF_HAS_EVAL))
1749 data->flags |= SF_HAS_EVAL;
1751 data->whilem_c = data_fake.whilem_c;
1752 if (flags & SCF_DO_STCLASS)
1753 cl_or(pRExC_state, &accum, &this_class);
1754 if (code == SUSPEND)
1757 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1759 if (flags & SCF_DO_SUBSTR) {
1760 data->pos_min += min1;
1761 data->pos_delta += max1 - min1;
1762 if (max1 != min1 || is_inf)
1763 data->longest = &(data->longest_float);
1766 delta += max1 - min1;
1767 if (flags & SCF_DO_STCLASS_OR) {
1768 cl_or(pRExC_state, data->start_class, &accum);
1770 cl_and(data->start_class, &and_with);
1771 flags &= ~SCF_DO_STCLASS;
1774 else if (flags & SCF_DO_STCLASS_AND) {
1776 cl_and(data->start_class, &accum);
1777 flags &= ~SCF_DO_STCLASS;
1780 /* Switch to OR mode: cache the old value of
1781 * data->start_class */
1782 StructCopy(data->start_class, &and_with,
1783 struct regnode_charclass_class);
1784 flags &= ~SCF_DO_STCLASS_AND;
1785 StructCopy(&accum, data->start_class,
1786 struct regnode_charclass_class);
1787 flags |= SCF_DO_STCLASS_OR;
1788 data->start_class->flags |= ANYOF_EOS;
1794 Assuming this was/is a branch we are dealing with: 'scan' now
1795 points at the item that follows the branch sequence, whatever
1796 it is. We now start at the beginning of the sequence and look
1802 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1804 If we can find such a subseqence we need to turn the first
1805 element into a trie and then add the subsequent branch exact
1806 strings to the trie.
1810 1. patterns where the whole set of branch can be converted to a trie,
1812 2. patterns where only a subset of the alternations can be
1813 converted to a trie.
1815 In case 1 we can replace the whole set with a single regop
1816 for the trie. In case 2 we need to keep the start and end
1819 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1820 becomes BRANCH TRIE; BRANCH X;
1822 Hypthetically when we know the regex isnt anchored we can
1823 turn a case 1 into a DFA and let it rip... Every time it finds a match
1824 it would just call its tail, no WHILEM/CURLY needed.
1828 if (!re_trie_maxbuff) {
1829 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1830 if (!SvIOK(re_trie_maxbuff))
1831 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1833 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1835 regnode *first = (regnode *)NULL;
1836 regnode *last = (regnode *)NULL;
1837 regnode *tail = scan;
1842 SV *mysv = sv_newmortal(); /* for dumping */
1844 /* var tail is used because there may be a TAIL
1845 regop in the way. Ie, the exacts will point to the
1846 thing following the TAIL, but the last branch will
1847 point at the TAIL. So we advance tail. If we
1848 have nested (?:) we may have to move through several
1852 while ( OP( tail ) == TAIL ) {
1853 /* this is the TAIL generated by (?:) */
1854 tail = regnext( tail );
1858 regprop( mysv, tail );
1859 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1860 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
1861 (RExC_seen_evals) ? "[EVAL]" : ""
1866 step through the branches, cur represents each
1867 branch, noper is the first thing to be matched
1868 as part of that branch and noper_next is the
1869 regnext() of that node. if noper is an EXACT
1870 and noper_next is the same as scan (our current
1871 position in the regex) then the EXACT branch is
1872 a possible optimization target. Once we have
1873 two or more consequetive such branches we can
1874 create a trie of the EXACT's contents and stich
1875 it in place. If the sequence represents all of
1876 the branches we eliminate the whole thing and
1877 replace it with a single TRIE. If it is a
1878 subsequence then we need to stitch it in. This
1879 means the first branch has to remain, and needs
1880 to be repointed at the item on the branch chain
1881 following the last branch optimized. This could
1882 be either a BRANCH, in which case the
1883 subsequence is internal, or it could be the
1884 item following the branch sequence in which
1885 case the subsequence is at the end.
1889 /* dont use tail as the end marker for this traverse */
1890 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1891 regnode *noper = NEXTOPER( cur );
1892 regnode *noper_next = regnext( noper );
1895 regprop( mysv, cur);
1896 PerlIO_printf( Perl_debug_log, "%*s%s",
1897 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
1899 regprop( mysv, noper);
1900 PerlIO_printf( Perl_debug_log, " -> %s",
1901 SvPV_nolen_const(mysv));
1904 regprop( mysv, noper_next );
1905 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1906 SvPV_nolen_const(mysv));
1908 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1911 if ( ( first ? OP( noper ) == optype
1912 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1913 && noper_next == tail && count<U16_MAX)
1918 optype = OP( noper );
1922 regprop( mysv, first);
1923 PerlIO_printf( Perl_debug_log, "%*s%s",
1924 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
1925 regprop( mysv, NEXTOPER(first) );
1926 PerlIO_printf( Perl_debug_log, " -> %s\n",
1927 SvPV_nolen_const( mysv ) );
1932 regprop( mysv, cur);
1933 PerlIO_printf( Perl_debug_log, "%*s%s",
1934 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
1935 regprop( mysv, noper );
1936 PerlIO_printf( Perl_debug_log, " -> %s\n",
1937 SvPV_nolen_const( mysv ) );
1943 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1944 (int)depth * 2 + 2, "E:", "**END**" );
1946 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1948 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1949 && noper_next == tail )
1953 optype = OP( noper );
1963 regprop( mysv, cur);
1964 PerlIO_printf( Perl_debug_log,
1965 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
1966 " ", SvPV_nolen_const( mysv ), first, last, cur);
1971 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1972 (int)depth * 2 + 2, "E:", "==END==" );
1974 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1979 else if ( code == BRANCHJ ) { /* single branch is optimized. */
1980 scan = NEXTOPER(NEXTOPER(scan));
1981 } else /* single branch is optimized. */
1982 scan = NEXTOPER(scan);
1985 else if (OP(scan) == EXACT) {
1986 I32 l = STR_LEN(scan);
1987 UV uc = *((U8*)STRING(scan));
1989 const U8 * const s = (U8*)STRING(scan);
1990 l = utf8_length(s, s + l);
1991 uc = utf8_to_uvchr(s, NULL);
1994 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
1995 /* The code below prefers earlier match for fixed
1996 offset, later match for variable offset. */
1997 if (data->last_end == -1) { /* Update the start info. */
1998 data->last_start_min = data->pos_min;
1999 data->last_start_max = is_inf
2000 ? I32_MAX : data->pos_min + data->pos_delta;
2002 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2004 SV * sv = data->last_found;
2005 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2006 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2007 if (mg && mg->mg_len >= 0)
2008 mg->mg_len += utf8_length((U8*)STRING(scan),
2009 (U8*)STRING(scan)+STR_LEN(scan));
2012 SvUTF8_on(data->last_found);
2013 data->last_end = data->pos_min + l;
2014 data->pos_min += l; /* As in the first entry. */
2015 data->flags &= ~SF_BEFORE_EOL;
2017 if (flags & SCF_DO_STCLASS_AND) {
2018 /* Check whether it is compatible with what we know already! */
2022 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2023 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2024 && (!(data->start_class->flags & ANYOF_FOLD)
2025 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2028 ANYOF_CLASS_ZERO(data->start_class);
2029 ANYOF_BITMAP_ZERO(data->start_class);
2031 ANYOF_BITMAP_SET(data->start_class, uc);
2032 data->start_class->flags &= ~ANYOF_EOS;
2034 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2036 else if (flags & SCF_DO_STCLASS_OR) {
2037 /* false positive possible if the class is case-folded */
2039 ANYOF_BITMAP_SET(data->start_class, uc);
2041 data->start_class->flags |= ANYOF_UNICODE_ALL;
2042 data->start_class->flags &= ~ANYOF_EOS;
2043 cl_and(data->start_class, &and_with);
2045 flags &= ~SCF_DO_STCLASS;
2047 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
2048 I32 l = STR_LEN(scan);
2049 UV uc = *((U8*)STRING(scan));
2051 /* Search for fixed substrings supports EXACT only. */
2052 if (flags & SCF_DO_SUBSTR)
2053 scan_commit(pRExC_state, data);
2055 U8 *s = (U8 *)STRING(scan);
2056 l = utf8_length(s, s + l);
2057 uc = utf8_to_uvchr(s, NULL);
2060 if (data && (flags & SCF_DO_SUBSTR))
2062 if (flags & SCF_DO_STCLASS_AND) {
2063 /* Check whether it is compatible with what we know already! */
2067 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2068 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2069 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2071 ANYOF_CLASS_ZERO(data->start_class);
2072 ANYOF_BITMAP_ZERO(data->start_class);
2074 ANYOF_BITMAP_SET(data->start_class, uc);
2075 data->start_class->flags &= ~ANYOF_EOS;
2076 data->start_class->flags |= ANYOF_FOLD;
2077 if (OP(scan) == EXACTFL)
2078 data->start_class->flags |= ANYOF_LOCALE;
2081 else if (flags & SCF_DO_STCLASS_OR) {
2082 if (data->start_class->flags & ANYOF_FOLD) {
2083 /* false positive possible if the class is case-folded.
2084 Assume that the locale settings are the same... */
2086 ANYOF_BITMAP_SET(data->start_class, uc);
2087 data->start_class->flags &= ~ANYOF_EOS;
2089 cl_and(data->start_class, &and_with);
2091 flags &= ~SCF_DO_STCLASS;
2093 else if (strchr((const char*)PL_varies,OP(scan))) {
2094 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2095 I32 f = flags, pos_before = 0;
2096 regnode *oscan = scan;
2097 struct regnode_charclass_class this_class;
2098 struct regnode_charclass_class *oclass = NULL;
2099 I32 next_is_eval = 0;
2101 switch (PL_regkind[(U8)OP(scan)]) {
2102 case WHILEM: /* End of (?:...)* . */
2103 scan = NEXTOPER(scan);
2106 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2107 next = NEXTOPER(scan);
2108 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2110 maxcount = REG_INFTY;
2111 next = regnext(scan);
2112 scan = NEXTOPER(scan);
2116 if (flags & SCF_DO_SUBSTR)
2121 if (flags & SCF_DO_STCLASS) {
2123 maxcount = REG_INFTY;
2124 next = regnext(scan);
2125 scan = NEXTOPER(scan);
2128 is_inf = is_inf_internal = 1;
2129 scan = regnext(scan);
2130 if (flags & SCF_DO_SUBSTR) {
2131 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2132 data->longest = &(data->longest_float);
2134 goto optimize_curly_tail;
2136 mincount = ARG1(scan);
2137 maxcount = ARG2(scan);
2138 next = regnext(scan);
2139 if (OP(scan) == CURLYX) {
2140 I32 lp = (data ? *(data->last_closep) : 0);
2141 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2143 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2144 next_is_eval = (OP(scan) == EVAL);
2146 if (flags & SCF_DO_SUBSTR) {
2147 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2148 pos_before = data->pos_min;
2152 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2154 data->flags |= SF_IS_INF;
2156 if (flags & SCF_DO_STCLASS) {
2157 cl_init(pRExC_state, &this_class);
2158 oclass = data->start_class;
2159 data->start_class = &this_class;
2160 f |= SCF_DO_STCLASS_AND;
2161 f &= ~SCF_DO_STCLASS_OR;
2163 /* These are the cases when once a subexpression
2164 fails at a particular position, it cannot succeed
2165 even after backtracking at the enclosing scope.
2167 XXXX what if minimal match and we are at the
2168 initial run of {n,m}? */
2169 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2170 f &= ~SCF_WHILEM_VISITED_POS;
2172 /* This will finish on WHILEM, setting scan, or on NULL: */
2173 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2175 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2177 if (flags & SCF_DO_STCLASS)
2178 data->start_class = oclass;
2179 if (mincount == 0 || minnext == 0) {
2180 if (flags & SCF_DO_STCLASS_OR) {
2181 cl_or(pRExC_state, data->start_class, &this_class);
2183 else if (flags & SCF_DO_STCLASS_AND) {
2184 /* Switch to OR mode: cache the old value of
2185 * data->start_class */
2186 StructCopy(data->start_class, &and_with,
2187 struct regnode_charclass_class);
2188 flags &= ~SCF_DO_STCLASS_AND;
2189 StructCopy(&this_class, data->start_class,
2190 struct regnode_charclass_class);
2191 flags |= SCF_DO_STCLASS_OR;
2192 data->start_class->flags |= ANYOF_EOS;
2194 } else { /* Non-zero len */
2195 if (flags & SCF_DO_STCLASS_OR) {
2196 cl_or(pRExC_state, data->start_class, &this_class);
2197 cl_and(data->start_class, &and_with);
2199 else if (flags & SCF_DO_STCLASS_AND)
2200 cl_and(data->start_class, &this_class);
2201 flags &= ~SCF_DO_STCLASS;
2203 if (!scan) /* It was not CURLYX, but CURLY. */
2205 if (ckWARN(WARN_REGEXP)
2206 /* ? quantifier ok, except for (?{ ... }) */
2207 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2208 && (minnext == 0) && (deltanext == 0)
2209 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2210 && maxcount <= REG_INFTY/3) /* Complement check for big count */
2213 "Quantifier unexpected on zero-length expression");
2216 min += minnext * mincount;
2217 is_inf_internal |= ((maxcount == REG_INFTY
2218 && (minnext + deltanext) > 0)
2219 || deltanext == I32_MAX);
2220 is_inf |= is_inf_internal;
2221 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2223 /* Try powerful optimization CURLYX => CURLYN. */
2224 if ( OP(oscan) == CURLYX && data
2225 && data->flags & SF_IN_PAR
2226 && !(data->flags & SF_HAS_EVAL)
2227 && !deltanext && minnext == 1 ) {
2228 /* Try to optimize to CURLYN. */
2229 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2230 regnode *nxt1 = nxt;
2237 if (!strchr((const char*)PL_simple,OP(nxt))
2238 && !(PL_regkind[(U8)OP(nxt)] == EXACT
2239 && STR_LEN(nxt) == 1))
2245 if (OP(nxt) != CLOSE)
2247 /* Now we know that nxt2 is the only contents: */
2248 oscan->flags = (U8)ARG(nxt);
2250 OP(nxt1) = NOTHING; /* was OPEN. */
2252 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2253 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2254 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2255 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2256 OP(nxt + 1) = OPTIMIZED; /* was count. */
2257 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2262 /* Try optimization CURLYX => CURLYM. */
2263 if ( OP(oscan) == CURLYX && data
2264 && !(data->flags & SF_HAS_PAR)
2265 && !(data->flags & SF_HAS_EVAL)
2266 && !deltanext /* atom is fixed width */
2267 && minnext != 0 /* CURLYM can't handle zero width */
2269 /* XXXX How to optimize if data == 0? */
2270 /* Optimize to a simpler form. */
2271 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2275 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2276 && (OP(nxt2) != WHILEM))
2278 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2279 /* Need to optimize away parenths. */
2280 if (data->flags & SF_IN_PAR) {
2281 /* Set the parenth number. */
2282 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2284 if (OP(nxt) != CLOSE)
2285 FAIL("Panic opt close");
2286 oscan->flags = (U8)ARG(nxt);
2287 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2288 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2290 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2291 OP(nxt + 1) = OPTIMIZED; /* was count. */
2292 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2293 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2296 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2297 regnode *nnxt = regnext(nxt1);
2300 if (reg_off_by_arg[OP(nxt1)])
2301 ARG_SET(nxt1, nxt2 - nxt1);
2302 else if (nxt2 - nxt1 < U16_MAX)
2303 NEXT_OFF(nxt1) = nxt2 - nxt1;
2305 OP(nxt) = NOTHING; /* Cannot beautify */
2310 /* Optimize again: */
2311 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2317 else if ((OP(oscan) == CURLYX)
2318 && (flags & SCF_WHILEM_VISITED_POS)
2319 /* See the comment on a similar expression above.
2320 However, this time it not a subexpression
2321 we care about, but the expression itself. */
2322 && (maxcount == REG_INFTY)
2323 && data && ++data->whilem_c < 16) {
2324 /* This stays as CURLYX, we can put the count/of pair. */
2325 /* Find WHILEM (as in regexec.c) */
2326 regnode *nxt = oscan + NEXT_OFF(oscan);
2328 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2330 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2331 | (RExC_whilem_seen << 4)); /* On WHILEM */
2333 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2335 if (flags & SCF_DO_SUBSTR) {
2336 SV *last_str = Nullsv;
2337 int counted = mincount != 0;
2339 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2340 #if defined(SPARC64_GCC_WORKAROUND)
2343 const char *s = NULL;
2346 if (pos_before >= data->last_start_min)
2349 b = data->last_start_min;
2352 s = SvPV_const(data->last_found, l);
2353 old = b - data->last_start_min;
2356 I32 b = pos_before >= data->last_start_min
2357 ? pos_before : data->last_start_min;
2359 const char *s = SvPV_const(data->last_found, l);
2360 I32 old = b - data->last_start_min;
2364 old = utf8_hop((U8*)s, old) - (U8*)s;
2367 /* Get the added string: */
2368 last_str = newSVpvn(s + old, l);
2370 SvUTF8_on(last_str);
2371 if (deltanext == 0 && pos_before == b) {
2372 /* What was added is a constant string */
2374 SvGROW(last_str, (mincount * l) + 1);
2375 repeatcpy(SvPVX(last_str) + l,
2376 SvPVX_const(last_str), l, mincount - 1);
2377 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2378 /* Add additional parts. */
2379 SvCUR_set(data->last_found,
2380 SvCUR(data->last_found) - l);
2381 sv_catsv(data->last_found, last_str);
2383 SV * sv = data->last_found;
2385 SvUTF8(sv) && SvMAGICAL(sv) ?
2386 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2387 if (mg && mg->mg_len >= 0)
2388 mg->mg_len += CHR_SVLEN(last_str);
2390 data->last_end += l * (mincount - 1);
2393 /* start offset must point into the last copy */
2394 data->last_start_min += minnext * (mincount - 1);
2395 data->last_start_max += is_inf ? I32_MAX
2396 : (maxcount - 1) * (minnext + data->pos_delta);
2399 /* It is counted once already... */
2400 data->pos_min += minnext * (mincount - counted);
2401 data->pos_delta += - counted * deltanext +
2402 (minnext + deltanext) * maxcount - minnext * mincount;
2403 if (mincount != maxcount) {
2404 /* Cannot extend fixed substrings found inside
2406 scan_commit(pRExC_state,data);
2407 if (mincount && last_str) {
2408 sv_setsv(data->last_found, last_str);
2409 data->last_end = data->pos_min;
2410 data->last_start_min =
2411 data->pos_min - CHR_SVLEN(last_str);
2412 data->last_start_max = is_inf
2414 : data->pos_min + data->pos_delta
2415 - CHR_SVLEN(last_str);
2417 data->longest = &(data->longest_float);
2419 SvREFCNT_dec(last_str);
2421 if (data && (fl & SF_HAS_EVAL))
2422 data->flags |= SF_HAS_EVAL;
2423 optimize_curly_tail:
2424 if (OP(oscan) != CURLYX) {
2425 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
2427 NEXT_OFF(oscan) += NEXT_OFF(next);
2430 default: /* REF and CLUMP only? */
2431 if (flags & SCF_DO_SUBSTR) {
2432 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2433 data->longest = &(data->longest_float);
2435 is_inf = is_inf_internal = 1;
2436 if (flags & SCF_DO_STCLASS_OR)
2437 cl_anything(pRExC_state, data->start_class);
2438 flags &= ~SCF_DO_STCLASS;
2442 else if (strchr((const char*)PL_simple,OP(scan))) {
2445 if (flags & SCF_DO_SUBSTR) {
2446 scan_commit(pRExC_state,data);
2450 if (flags & SCF_DO_STCLASS) {
2451 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2453 /* Some of the logic below assumes that switching
2454 locale on will only add false positives. */
2455 switch (PL_regkind[(U8)OP(scan)]) {
2459 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2460 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2461 cl_anything(pRExC_state, data->start_class);
2464 if (OP(scan) == SANY)
2466 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2467 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2468 || (data->start_class->flags & ANYOF_CLASS));
2469 cl_anything(pRExC_state, data->start_class);
2471 if (flags & SCF_DO_STCLASS_AND || !value)
2472 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2475 if (flags & SCF_DO_STCLASS_AND)
2476 cl_and(data->start_class,
2477 (struct regnode_charclass_class*)scan);
2479 cl_or(pRExC_state, data->start_class,
2480 (struct regnode_charclass_class*)scan);
2483 if (flags & SCF_DO_STCLASS_AND) {
2484 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2485 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2486 for (value = 0; value < 256; value++)
2487 if (!isALNUM(value))
2488 ANYOF_BITMAP_CLEAR(data->start_class, value);
2492 if (data->start_class->flags & ANYOF_LOCALE)
2493 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2495 for (value = 0; value < 256; value++)
2497 ANYOF_BITMAP_SET(data->start_class, value);
2502 if (flags & SCF_DO_STCLASS_AND) {
2503 if (data->start_class->flags & ANYOF_LOCALE)
2504 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2507 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2508 data->start_class->flags |= ANYOF_LOCALE;
2512 if (flags & SCF_DO_STCLASS_AND) {
2513 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2515 for (value = 0; value < 256; value++)
2517 ANYOF_BITMAP_CLEAR(data->start_class, value);
2521 if (data->start_class->flags & ANYOF_LOCALE)
2522 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2524 for (value = 0; value < 256; value++)
2525 if (!isALNUM(value))
2526 ANYOF_BITMAP_SET(data->start_class, value);
2531 if (flags & SCF_DO_STCLASS_AND) {
2532 if (data->start_class->flags & ANYOF_LOCALE)
2533 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2536 data->start_class->flags |= ANYOF_LOCALE;
2537 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2541 if (flags & SCF_DO_STCLASS_AND) {
2542 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2543 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2544 for (value = 0; value < 256; value++)
2545 if (!isSPACE(value))
2546 ANYOF_BITMAP_CLEAR(data->start_class, value);
2550 if (data->start_class->flags & ANYOF_LOCALE)
2551 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2553 for (value = 0; value < 256; value++)
2555 ANYOF_BITMAP_SET(data->start_class, value);
2560 if (flags & SCF_DO_STCLASS_AND) {
2561 if (data->start_class->flags & ANYOF_LOCALE)
2562 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2565 data->start_class->flags |= ANYOF_LOCALE;
2566 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2570 if (flags & SCF_DO_STCLASS_AND) {
2571 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2572 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2573 for (value = 0; value < 256; value++)
2575 ANYOF_BITMAP_CLEAR(data->start_class, value);
2579 if (data->start_class->flags & ANYOF_LOCALE)
2580 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2582 for (value = 0; value < 256; value++)
2583 if (!isSPACE(value))
2584 ANYOF_BITMAP_SET(data->start_class, value);
2589 if (flags & SCF_DO_STCLASS_AND) {
2590 if (data->start_class->flags & ANYOF_LOCALE) {
2591 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2592 for (value = 0; value < 256; value++)
2593 if (!isSPACE(value))
2594 ANYOF_BITMAP_CLEAR(data->start_class, value);
2598 data->start_class->flags |= ANYOF_LOCALE;
2599 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2603 if (flags & SCF_DO_STCLASS_AND) {
2604 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2605 for (value = 0; value < 256; value++)
2606 if (!isDIGIT(value))
2607 ANYOF_BITMAP_CLEAR(data->start_class, value);
2610 if (data->start_class->flags & ANYOF_LOCALE)
2611 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2613 for (value = 0; value < 256; value++)
2615 ANYOF_BITMAP_SET(data->start_class, value);
2620 if (flags & SCF_DO_STCLASS_AND) {
2621 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2622 for (value = 0; value < 256; value++)
2624 ANYOF_BITMAP_CLEAR(data->start_class, value);
2627 if (data->start_class->flags & ANYOF_LOCALE)
2628 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2630 for (value = 0; value < 256; value++)
2631 if (!isDIGIT(value))
2632 ANYOF_BITMAP_SET(data->start_class, value);
2637 if (flags & SCF_DO_STCLASS_OR)
2638 cl_and(data->start_class, &and_with);
2639 flags &= ~SCF_DO_STCLASS;
2642 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2643 data->flags |= (OP(scan) == MEOL
2647 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2648 /* Lookbehind, or need to calculate parens/evals/stclass: */
2649 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2650 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2651 /* Lookahead/lookbehind */
2652 I32 deltanext, minnext, fake = 0;
2654 struct regnode_charclass_class intrnl;
2657 data_fake.flags = 0;
2659 data_fake.whilem_c = data->whilem_c;
2660 data_fake.last_closep = data->last_closep;
2663 data_fake.last_closep = &fake;
2664 if ( flags & SCF_DO_STCLASS && !scan->flags
2665 && OP(scan) == IFMATCH ) { /* Lookahead */
2666 cl_init(pRExC_state, &intrnl);
2667 data_fake.start_class = &intrnl;
2668 f |= SCF_DO_STCLASS_AND;
2670 if (flags & SCF_WHILEM_VISITED_POS)
2671 f |= SCF_WHILEM_VISITED_POS;
2672 next = regnext(scan);
2673 nscan = NEXTOPER(NEXTOPER(scan));
2674 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2677 vFAIL("Variable length lookbehind not implemented");
2679 else if (minnext > U8_MAX) {
2680 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2682 scan->flags = (U8)minnext;
2684 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2686 if (data && (data_fake.flags & SF_HAS_EVAL))
2687 data->flags |= SF_HAS_EVAL;
2689 data->whilem_c = data_fake.whilem_c;
2690 if (f & SCF_DO_STCLASS_AND) {
2691 const int was = (data->start_class->flags & ANYOF_EOS);
2693 cl_and(data->start_class, &intrnl);
2695 data->start_class->flags |= ANYOF_EOS;
2698 else if (OP(scan) == OPEN) {
2701 else if (OP(scan) == CLOSE) {
2702 if ((I32)ARG(scan) == is_par) {
2703 next = regnext(scan);
2705 if ( next && (OP(next) != WHILEM) && next < last)
2706 is_par = 0; /* Disable optimization */
2709 *(data->last_closep) = ARG(scan);
2711 else if (OP(scan) == EVAL) {
2713 data->flags |= SF_HAS_EVAL;
2715 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2716 if (flags & SCF_DO_SUBSTR) {
2717 scan_commit(pRExC_state,data);
2718 data->longest = &(data->longest_float);
2720 is_inf = is_inf_internal = 1;
2721 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2722 cl_anything(pRExC_state, data->start_class);
2723 flags &= ~SCF_DO_STCLASS;
2725 /* Else: zero-length, ignore. */
2726 scan = regnext(scan);
2731 *deltap = is_inf_internal ? I32_MAX : delta;
2732 if (flags & SCF_DO_SUBSTR && is_inf)
2733 data->pos_delta = I32_MAX - data->pos_min;
2734 if (is_par > U8_MAX)
2736 if (is_par && pars==1 && data) {
2737 data->flags |= SF_IN_PAR;
2738 data->flags &= ~SF_HAS_PAR;
2740 else if (pars && data) {
2741 data->flags |= SF_HAS_PAR;
2742 data->flags &= ~SF_IN_PAR;
2744 if (flags & SCF_DO_STCLASS_OR)
2745 cl_and(data->start_class, &and_with);
2750 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
2752 if (RExC_rx->data) {
2753 Renewc(RExC_rx->data,
2754 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2755 char, struct reg_data);
2756 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2757 RExC_rx->data->count += n;
2760 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2761 char, struct reg_data);
2762 New(1208, RExC_rx->data->what, n, U8);
2763 RExC_rx->data->count = n;
2765 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2766 return RExC_rx->data->count - n;
2770 Perl_reginitcolors(pTHX)
2772 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2774 char *t = savepv(s);
2778 t = strchr(t, '\t');
2784 PL_colors[i] = t = (char *)"";
2789 PL_colors[i++] = (char *)"";
2796 - pregcomp - compile a regular expression into internal code
2798 * We can't allocate space until we know how big the compiled form will be,
2799 * but we can't compile it (and thus know how big it is) until we've got a
2800 * place to put the code. So we cheat: we compile it twice, once with code
2801 * generation turned off and size counting turned on, and once "for real".
2802 * This also means that we don't allocate space until we are sure that the
2803 * thing really will compile successfully, and we never have to move the
2804 * code and thus invalidate pointers into it. (Note that it has to be in
2805 * one piece because free() must be able to free it all.) [NB: not true in perl]
2807 * Beware that the optimization-preparation code in here knows about some
2808 * of the structure of the compiled regexp. [I'll say.]
2811 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
2821 RExC_state_t RExC_state;
2822 RExC_state_t *pRExC_state = &RExC_state;
2824 GET_RE_DEBUG_FLAGS_DECL;
2827 FAIL("NULL regexp argument");
2829 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
2832 DEBUG_r(if (!PL_colorset) reginitcolors());
2834 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
2835 PL_colors[4],PL_colors[5],PL_colors[0],
2836 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2838 RExC_flags = pm->op_pmflags;
2842 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2843 RExC_seen_evals = 0;
2846 /* First pass: determine size, legality. */
2853 RExC_emit = &PL_regdummy;
2854 RExC_whilem_seen = 0;
2855 #if 0 /* REGC() is (currently) a NOP at the first pass.
2856 * Clever compilers notice this and complain. --jhi */
2857 REGC((U8)REG_MAGIC, (char*)RExC_emit);
2859 if (reg(pRExC_state, 0, &flags) == NULL) {
2860 RExC_precomp = Nullch;
2863 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
2865 /* Small enough for pointer-storage convention?
2866 If extralen==0, this means that we will not need long jumps. */
2867 if (RExC_size >= 0x10000L && RExC_extralen)
2868 RExC_size += RExC_extralen;
2871 if (RExC_whilem_seen > 15)
2872 RExC_whilem_seen = 15;
2874 /* Allocate space and initialize. */
2875 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
2878 FAIL("Regexp out of space");
2881 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
2882 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
2885 r->prelen = xend - exp;
2886 r->precomp = savepvn(RExC_precomp, r->prelen);
2888 #ifdef PERL_OLD_COPY_ON_WRITE
2889 r->saved_copy = Nullsv;
2891 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
2892 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
2894 r->substrs = 0; /* Useful during FAIL. */
2895 r->startp = 0; /* Useful during FAIL. */
2896 r->endp = 0; /* Useful during FAIL. */
2898 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2900 r->offsets[0] = RExC_size;
2902 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2903 "%s %"UVuf" bytes for offset annotations.\n",
2904 r->offsets ? "Got" : "Couldn't get",
2905 (UV)((2*RExC_size+1) * sizeof(U32))));
2909 /* Second pass: emit code. */
2910 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
2915 RExC_emit_start = r->program;
2916 RExC_emit = r->program;
2917 /* Store the count of eval-groups for security checks: */
2918 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
2919 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
2921 if (reg(pRExC_state, 0, &flags) == NULL)
2925 /* Dig out information for optimizations. */
2926 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
2927 pm->op_pmflags = RExC_flags;
2929 r->reganch |= ROPT_UTF8; /* Unicode in it? */
2930 r->regstclass = NULL;
2931 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
2932 r->reganch |= ROPT_NAUGHTY;
2933 scan = r->program + 1; /* First BRANCH. */
2935 /* XXXX To minimize changes to RE engine we always allocate
2936 3-units-long substrs field. */
2937 Newz(1004, r->substrs, 1, struct reg_substr_data);
2939 StructCopy(&zero_scan_data, &data, scan_data_t);
2940 /* XXXX Should not we check for something else? Usually it is OPEN1... */
2941 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
2943 STRLEN longest_float_length, longest_fixed_length;
2944 struct regnode_charclass_class ch_class;
2949 /* Skip introductions and multiplicators >= 1. */
2950 while ((OP(first) == OPEN && (sawopen = 1)) ||
2951 /* An OR of *one* alternative - should not happen now. */
2952 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2953 (OP(first) == PLUS) ||
2954 (OP(first) == MINMOD) ||
2955 /* An {n,m} with n>0 */
2956 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
2957 if (OP(first) == PLUS)
2960 first += regarglen[(U8)OP(first)];
2961 first = NEXTOPER(first);
2964 /* Starting-point info. */
2966 if (PL_regkind[(U8)OP(first)] == EXACT) {
2967 if (OP(first) == EXACT)
2968 ; /* Empty, get anchored substr later. */
2969 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
2970 r->regstclass = first;
2972 else if (strchr((const char*)PL_simple,OP(first)))
2973 r->regstclass = first;
2974 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2975 PL_regkind[(U8)OP(first)] == NBOUND)
2976 r->regstclass = first;
2977 else if (PL_regkind[(U8)OP(first)] == BOL) {
2978 r->reganch |= (OP(first) == MBOL
2980 : (OP(first) == SBOL
2983 first = NEXTOPER(first);
2986 else if (OP(first) == GPOS) {
2987 r->reganch |= ROPT_ANCH_GPOS;
2988 first = NEXTOPER(first);
2991 else if (!sawopen && (OP(first) == STAR &&
2992 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
2993 !(r->reganch & ROPT_ANCH) )
2995 /* turn .* into ^.* with an implied $*=1 */
2997 (OP(NEXTOPER(first)) == REG_ANY)
3000 r->reganch |= type | ROPT_IMPLICIT;
3001 first = NEXTOPER(first);
3004 if (sawplus && (!sawopen || !RExC_sawback)
3005 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3006 /* x+ must match at the 1st pos of run of x's */
3007 r->reganch |= ROPT_SKIP;
3009 /* Scan is after the zeroth branch, first is atomic matcher. */
3010 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3011 (IV)(first - scan + 1)));
3013 * If there's something expensive in the r.e., find the
3014 * longest literal string that must appear and make it the
3015 * regmust. Resolve ties in favor of later strings, since
3016 * the regstart check works with the beginning of the r.e.
3017 * and avoiding duplication strengthens checking. Not a
3018 * strong reason, but sufficient in the absence of others.
3019 * [Now we resolve ties in favor of the earlier string if
3020 * it happens that c_offset_min has been invalidated, since the
3021 * earlier string may buy us something the later one won't.]
3025 data.longest_fixed = newSVpvn("",0);
3026 data.longest_float = newSVpvn("",0);
3027 data.last_found = newSVpvn("",0);
3028 data.longest = &(data.longest_fixed);
3030 if (!r->regstclass) {
3031 cl_init(pRExC_state, &ch_class);
3032 data.start_class = &ch_class;
3033 stclass_flag = SCF_DO_STCLASS_AND;
3034 } else /* XXXX Check for BOUND? */
3036 data.last_closep = &last_close;
3038 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3039 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3040 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3041 && data.last_start_min == 0 && data.last_end > 0
3042 && !RExC_seen_zerolen
3043 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3044 r->reganch |= ROPT_CHECK_ALL;
3045 scan_commit(pRExC_state, &data);
3046 SvREFCNT_dec(data.last_found);
3048 longest_float_length = CHR_SVLEN(data.longest_float);
3049 if (longest_float_length
3050 || (data.flags & SF_FL_BEFORE_EOL
3051 && (!(data.flags & SF_FL_BEFORE_MEOL)
3052 || (RExC_flags & PMf_MULTILINE)))) {
3055 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3056 && data.offset_fixed == data.offset_float_min
3057 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3058 goto remove_float; /* As in (a)+. */
3060 if (SvUTF8(data.longest_float)) {
3061 r->float_utf8 = data.longest_float;
3062 r->float_substr = Nullsv;
3064 r->float_substr = data.longest_float;
3065 r->float_utf8 = Nullsv;
3067 r->float_min_offset = data.offset_float_min;
3068 r->float_max_offset = data.offset_float_max;
3069 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3070 && (!(data.flags & SF_FL_BEFORE_MEOL)
3071 || (RExC_flags & PMf_MULTILINE)));
3072 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3076 r->float_substr = r->float_utf8 = Nullsv;
3077 SvREFCNT_dec(data.longest_float);
3078 longest_float_length = 0;
3081 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3082 if (longest_fixed_length
3083 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3084 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3085 || (RExC_flags & PMf_MULTILINE)))) {
3088 if (SvUTF8(data.longest_fixed)) {
3089 r->anchored_utf8 = data.longest_fixed;
3090 r->anchored_substr = Nullsv;
3092 r->anchored_substr = data.longest_fixed;
3093 r->anchored_utf8 = Nullsv;
3095 r->anchored_offset = data.offset_fixed;
3096 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3097 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3098 || (RExC_flags & PMf_MULTILINE)));
3099 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3102 r->anchored_substr = r->anchored_utf8 = Nullsv;
3103 SvREFCNT_dec(data.longest_fixed);
3104 longest_fixed_length = 0;
3107 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3108 r->regstclass = NULL;
3109 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3111 && !(data.start_class->flags & ANYOF_EOS)
3112 && !cl_is_anything(data.start_class))
3114 const I32 n = add_data(pRExC_state, 1, "f");
3116 New(1006, RExC_rx->data->data[n], 1,
3117 struct regnode_charclass_class);
3118 StructCopy(data.start_class,
3119 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3120 struct regnode_charclass_class);
3121 r->regstclass = (regnode*)RExC_rx->data->data[n];
3122 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3123 PL_regdata = r->data; /* for regprop() */
3124 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3125 regprop(sv, (regnode*)data.start_class);
3126 PerlIO_printf(Perl_debug_log,
3127 "synthetic stclass \"%s\".\n",
3128 SvPVX_const(sv));});
3131 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3132 if (longest_fixed_length > longest_float_length) {
3133 r->check_substr = r->anchored_substr;
3134 r->check_utf8 = r->anchored_utf8;
3135 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3136 if (r->reganch & ROPT_ANCH_SINGLE)
3137 r->reganch |= ROPT_NOSCAN;
3140 r->check_substr = r->float_substr;
3141 r->check_utf8 = r->float_utf8;
3142 r->check_offset_min = data.offset_float_min;
3143 r->check_offset_max = data.offset_float_max;
3145 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3146 This should be changed ASAP! */
3147 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3148 r->reganch |= RE_USE_INTUIT;
3149 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3150 r->reganch |= RE_INTUIT_TAIL;
3154 /* Several toplevels. Best we can is to set minlen. */
3156 struct regnode_charclass_class ch_class;
3159 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3160 scan = r->program + 1;
3161 cl_init(pRExC_state, &ch_class);
3162 data.start_class = &ch_class;
3163 data.last_closep = &last_close;
3164 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3165 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3166 = r->float_substr = r->float_utf8 = Nullsv;
3167 if (!(data.start_class->flags & ANYOF_EOS)
3168 && !cl_is_anything(data.start_class))
3170 const I32 n = add_data(pRExC_state, 1, "f");
3172 New(1006, RExC_rx->data->data[n], 1,
3173 struct regnode_charclass_class);
3174 StructCopy(data.start_class,
3175 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3176 struct regnode_charclass_class);
3177 r->regstclass = (regnode*)RExC_rx->data->data[n];
3178 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3179 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3180 regprop(sv, (regnode*)data.start_class);
3181 PerlIO_printf(Perl_debug_log,
3182 "synthetic stclass \"%s\".\n",
3183 SvPVX_const(sv));});
3188 if (RExC_seen & REG_SEEN_GPOS)
3189 r->reganch |= ROPT_GPOS_SEEN;
3190 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3191 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3192 if (RExC_seen & REG_SEEN_EVAL)
3193 r->reganch |= ROPT_EVAL_SEEN;
3194 if (RExC_seen & REG_SEEN_CANY)
3195 r->reganch |= ROPT_CANY_SEEN;
3196 Newz(1002, r->startp, RExC_npar, I32);
3197 Newz(1002, r->endp, RExC_npar, I32);
3198 PL_regdata = r->data; /* for regprop() */
3199 DEBUG_COMPILE_r(regdump(r));
3204 - reg - regular expression, i.e. main body or parenthesized thing
3206 * Caller must absorb opening parenthesis.
3208 * Combining parenthesis handling with the base level of regular expression
3209 * is a trifle forced, but the need to tie the tails of the branches to what
3210 * follows makes it hard to avoid.
3213 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
3214 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3217 register regnode *ret; /* Will be the head of the group. */
3218 register regnode *br;
3219 register regnode *lastbr;
3220 register regnode *ender = 0;
3221 register I32 parno = 0;
3222 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
3224 /* for (?g), (?gc), and (?o) warnings; warning
3225 about (?c) will warn about (?g) -- japhy */
3227 I32 wastedflags = 0x00,
3230 wasted_gc = 0x02 | 0x04,
3233 char * parse_start = RExC_parse; /* MJD */
3234 char * const oregcomp_parse = RExC_parse;
3237 *flagp = 0; /* Tentatively. */
3240 /* Make an OPEN node, if parenthesized. */
3242 if (*RExC_parse == '?') { /* (?...) */
3243 U32 posflags = 0, negflags = 0;
3244 U32 *flagsp = &posflags;
3246 const char * const seqstart = RExC_parse;
3249 paren = *RExC_parse++;
3250 ret = NULL; /* For look-ahead/behind. */
3252 case '<': /* (?<...) */
3253 RExC_seen |= REG_SEEN_LOOKBEHIND;
3254 if (*RExC_parse == '!')
3256 if (*RExC_parse != '=' && *RExC_parse != '!')
3259 case '=': /* (?=...) */
3260 case '!': /* (?!...) */
3261 RExC_seen_zerolen++;
3262 case ':': /* (?:...) */
3263 case '>': /* (?>...) */
3265 case '$': /* (?$...) */
3266 case '@': /* (?@...) */
3267 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3269 case '#': /* (?#...) */
3270 while (*RExC_parse && *RExC_parse != ')')
3272 if (*RExC_parse != ')')
3273 FAIL("Sequence (?#... not terminated");
3274 nextchar(pRExC_state);
3277 case 'p': /* (?p...) */
3278 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3279 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3281 case '?': /* (??...) */
3283 if (*RExC_parse != '{')
3285 paren = *RExC_parse++;
3287 case '{': /* (?{...}) */
3289 I32 count = 1, n = 0;
3291 char *s = RExC_parse;
3293 OP_4tree *sop, *rop;
3295 RExC_seen_zerolen++;
3296 RExC_seen |= REG_SEEN_EVAL;
3297 while (count && (c = *RExC_parse)) {
3298 if (c == '\\' && RExC_parse[1])
3306 if (*RExC_parse != ')')
3309 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3314 if (RExC_parse - 1 - s)
3315 sv = newSVpvn(s, RExC_parse - 1 - s);
3317 sv = newSVpvn("", 0);
3320 Perl_save_re_context(aTHX);
3321 rop = sv_compile_2op(sv, &sop, "re", &pad);
3322 sop->op_private |= OPpREFCOUNTED;
3323 /* re_dup will OpREFCNT_inc */
3324 OpREFCNT_set(sop, 1);
3327 n = add_data(pRExC_state, 3, "nop");
3328 RExC_rx->data->data[n] = (void*)rop;
3329 RExC_rx->data->data[n+1] = (void*)sop;
3330 RExC_rx->data->data[n+2] = (void*)pad;
3333 else { /* First pass */
3334 if (PL_reginterp_cnt < ++RExC_seen_evals
3336 /* No compiled RE interpolated, has runtime
3337 components ===> unsafe. */
3338 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3339 if (PL_tainting && PL_tainted)
3340 FAIL("Eval-group in insecure regular expression");
3341 if (IN_PERL_COMPILETIME)
3345 nextchar(pRExC_state);
3347 ret = reg_node(pRExC_state, LOGICAL);
3350 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3351 /* deal with the length of this later - MJD */
3354 ret = reganode(pRExC_state, EVAL, n);
3355 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3356 Set_Node_Offset(ret, parse_start);
3359 case '(': /* (?(?{...})...) and (?(?=...)...) */
3361 if (RExC_parse[0] == '?') { /* (?(?...)) */
3362 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3363 || RExC_parse[1] == '<'
3364 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3367 ret = reg_node(pRExC_state, LOGICAL);
3370 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
3374 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3376 parno = atoi(RExC_parse++);
3378 while (isDIGIT(*RExC_parse))
3380 ret = reganode(pRExC_state, GROUPP, parno);
3382 if ((c = *nextchar(pRExC_state)) != ')')
3383 vFAIL("Switch condition not recognized");
3385 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3386 br = regbranch(pRExC_state, &flags, 1);
3388 br = reganode(pRExC_state, LONGJMP, 0);
3390 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3391 c = *nextchar(pRExC_state);
3395 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3396 regbranch(pRExC_state, &flags, 1);
3397 regtail(pRExC_state, ret, lastbr);
3400 c = *nextchar(pRExC_state);
3405 vFAIL("Switch (?(condition)... contains too many branches");
3406 ender = reg_node(pRExC_state, TAIL);
3407 regtail(pRExC_state, br, ender);
3409 regtail(pRExC_state, lastbr, ender);
3410 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3413 regtail(pRExC_state, ret, ender);
3417 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3421 RExC_parse--; /* for vFAIL to print correctly */
3422 vFAIL("Sequence (? incomplete");
3426 parse_flags: /* (?i) */
3427 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3428 /* (?g), (?gc) and (?o) are useless here
3429 and must be globally applied -- japhy */
3431 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3432 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3433 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3434 if (! (wastedflags & wflagbit) ) {
3435 wastedflags |= wflagbit;
3438 "Useless (%s%c) - %suse /%c modifier",
3439 flagsp == &negflags ? "?-" : "?",
3441 flagsp == &negflags ? "don't " : "",
3447 else if (*RExC_parse == 'c') {
3448 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3449 if (! (wastedflags & wasted_c) ) {
3450 wastedflags |= wasted_gc;
3453 "Useless (%sc) - %suse /gc modifier",
3454 flagsp == &negflags ? "?-" : "?",
3455 flagsp == &negflags ? "don't " : ""
3460 else { pmflag(flagsp, *RExC_parse); }
3464 if (*RExC_parse == '-') {
3466 wastedflags = 0; /* reset so (?g-c) warns twice */
3470 RExC_flags |= posflags;
3471 RExC_flags &= ~negflags;
3472 if (*RExC_parse == ':') {
3478 if (*RExC_parse != ')') {
3480 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3482 nextchar(pRExC_state);
3490 ret = reganode(pRExC_state, OPEN, parno);
3491 Set_Node_Length(ret, 1); /* MJD */
3492 Set_Node_Offset(ret, RExC_parse); /* MJD */
3499 /* Pick up the branches, linking them together. */
3500 parse_start = RExC_parse; /* MJD */
3501 br = regbranch(pRExC_state, &flags, 1);
3502 /* branch_len = (paren != 0); */
3506 if (*RExC_parse == '|') {
3507 if (!SIZE_ONLY && RExC_extralen) {
3508 reginsert(pRExC_state, BRANCHJ, br);
3511 reginsert(pRExC_state, BRANCH, br);
3512 Set_Node_Length(br, paren != 0);
3513 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3517 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3519 else if (paren == ':') {
3520 *flagp |= flags&SIMPLE;
3522 if (open) { /* Starts with OPEN. */
3523 regtail(pRExC_state, ret, br); /* OPEN -> first. */
3525 else if (paren != '?') /* Not Conditional */
3527 *flagp |= flags & (SPSTART | HASWIDTH);
3529 while (*RExC_parse == '|') {
3530 if (!SIZE_ONLY && RExC_extralen) {
3531 ender = reganode(pRExC_state, LONGJMP,0);
3532 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3535 RExC_extralen += 2; /* Account for LONGJMP. */
3536 nextchar(pRExC_state);
3537 br = regbranch(pRExC_state, &flags, 0);
3541 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3545 *flagp |= flags&SPSTART;
3548 if (have_branch || paren != ':') {
3549 /* Make a closing node, and hook it on the end. */
3552 ender = reg_node(pRExC_state, TAIL);
3555 ender = reganode(pRExC_state, CLOSE, parno);
3556 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3557 Set_Node_Length(ender,1); /* MJD */
3563 *flagp &= ~HASWIDTH;
3566 ender = reg_node(pRExC_state, SUCCEED);
3569 ender = reg_node(pRExC_state, END);
3572 regtail(pRExC_state, lastbr, ender);
3575 /* Hook the tails of the branches to the closing node. */
3576 for (br = ret; br != NULL; br = regnext(br)) {
3577 regoptail(pRExC_state, br, ender);
3584 static const char parens[] = "=!<,>";
3586 if (paren && (p = strchr(parens, paren))) {
3587 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3588 int flag = (p - parens) > 1;
3591 node = SUSPEND, flag = 0;
3592 reginsert(pRExC_state, node,ret);
3593 Set_Node_Cur_Length(ret);
3594 Set_Node_Offset(ret, parse_start + 1);
3596 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3600 /* Check for proper termination. */
3602 RExC_flags = oregflags;
3603 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3604 RExC_parse = oregcomp_parse;
3605 vFAIL("Unmatched (");
3608 else if (!paren && RExC_parse < RExC_end) {
3609 if (*RExC_parse == ')') {
3611 vFAIL("Unmatched )");
3614 FAIL("Junk on end of regexp"); /* "Can't happen". */
3622 - regbranch - one alternative of an | operator
3624 * Implements the concatenation operator.
3627 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
3629 register regnode *ret;
3630 register regnode *chain = NULL;
3631 register regnode *latest;
3632 I32 flags = 0, c = 0;
3637 if (!SIZE_ONLY && RExC_extralen)
3638 ret = reganode(pRExC_state, BRANCHJ,0);
3640 ret = reg_node(pRExC_state, BRANCH);
3641 Set_Node_Length(ret, 1);
3645 if (!first && SIZE_ONLY)
3646 RExC_extralen += 1; /* BRANCHJ */
3648 *flagp = WORST; /* Tentatively. */
3651 nextchar(pRExC_state);
3652 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3654 latest = regpiece(pRExC_state, &flags);
3655 if (latest == NULL) {
3656 if (flags & TRYAGAIN)
3660 else if (ret == NULL)
3662 *flagp |= flags&HASWIDTH;
3663 if (chain == NULL) /* First piece. */
3664 *flagp |= flags&SPSTART;
3667 regtail(pRExC_state, chain, latest);
3672 if (chain == NULL) { /* Loop ran zero times. */
3673 chain = reg_node(pRExC_state, NOTHING);
3678 *flagp |= flags&SIMPLE;
3685 - regpiece - something followed by possible [*+?]
3687 * Note that the branching code sequences used for ? and the general cases
3688 * of * and + are somewhat optimized: they use the same NOTHING node as
3689 * both the endmarker for their branch list and the body of the last branch.
3690 * It might seem that this node could be dispensed with entirely, but the
3691 * endmarker role is not redundant.
3694 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3696 register regnode *ret;
3698 register char *next;
3700 const char * const origparse = RExC_parse;
3703 I32 max = REG_INFTY;
3706 ret = regatom(pRExC_state, &flags);
3708 if (flags & TRYAGAIN)
3715 if (op == '{' && regcurly(RExC_parse)) {
3716 parse_start = RExC_parse; /* MJD */
3717 next = RExC_parse + 1;
3719 while (isDIGIT(*next) || *next == ',') {
3728 if (*next == '}') { /* got one */
3732 min = atoi(RExC_parse);
3736 maxpos = RExC_parse;
3738 if (!max && *maxpos != '0')
3739 max = REG_INFTY; /* meaning "infinity" */
3740 else if (max >= REG_INFTY)
3741 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
3743 nextchar(pRExC_state);
3746 if ((flags&SIMPLE)) {
3747 RExC_naughty += 2 + RExC_naughty / 2;
3748 reginsert(pRExC_state, CURLY, ret);
3749 Set_Node_Offset(ret, parse_start+1); /* MJD */
3750 Set_Node_Cur_Length(ret);
3753 regnode *w = reg_node(pRExC_state, WHILEM);
3756 regtail(pRExC_state, ret, w);
3757 if (!SIZE_ONLY && RExC_extralen) {
3758 reginsert(pRExC_state, LONGJMP,ret);
3759 reginsert(pRExC_state, NOTHING,ret);
3760 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3762 reginsert(pRExC_state, CURLYX,ret);
3764 Set_Node_Offset(ret, parse_start+1);
3765 Set_Node_Length(ret,
3766 op == '{' ? (RExC_parse - parse_start) : 1);
3768 if (!SIZE_ONLY && RExC_extralen)
3769 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3770 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
3772 RExC_whilem_seen++, RExC_extralen += 3;
3773 RExC_naughty += 4 + RExC_naughty; /* compound interest */
3781 if (max && max < min)
3782 vFAIL("Can't do {n,m} with n > m");
3784 ARG1_SET(ret, (U16)min);
3785 ARG2_SET(ret, (U16)max);
3797 #if 0 /* Now runtime fix should be reliable. */
3799 /* if this is reinstated, don't forget to put this back into perldiag:
3801 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3803 (F) The part of the regexp subject to either the * or + quantifier
3804 could match an empty string. The {#} shows in the regular
3805 expression about where the problem was discovered.
3809 if (!(flags&HASWIDTH) && op != '?')
3810 vFAIL("Regexp *+ operand could be empty");
3813 parse_start = RExC_parse;
3814 nextchar(pRExC_state);
3816 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
3818 if (op == '*' && (flags&SIMPLE)) {
3819 reginsert(pRExC_state, STAR, ret);
3823 else if (op == '*') {
3827 else if (op == '+' && (flags&SIMPLE)) {
3828 reginsert(pRExC_state, PLUS, ret);
3832 else if (op == '+') {
3836 else if (op == '?') {
3841 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
3843 "%.*s matches null string many times",
3844 RExC_parse - origparse,
3848 if (*RExC_parse == '?') {
3849 nextchar(pRExC_state);
3850 reginsert(pRExC_state, MINMOD, ret);
3851 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
3853 if (ISMULT2(RExC_parse)) {
3855 vFAIL("Nested quantifiers");
3862 - regatom - the lowest level
3864 * Optimization: gobbles an entire sequence of ordinary characters so that
3865 * it can turn them into a single node, which is smaller to store and
3866 * faster to run. Backslashed characters are exceptions, each becoming a
3867 * separate node; the code is simpler that way and it's not worth fixing.
3869 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
3871 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
3873 register regnode *ret = 0;
3875 char *parse_start = RExC_parse;
3877 *flagp = WORST; /* Tentatively. */
3880 switch (*RExC_parse) {
3882 RExC_seen_zerolen++;
3883 nextchar(pRExC_state);
3884 if (RExC_flags & PMf_MULTILINE)
3885 ret = reg_node(pRExC_state, MBOL);
3886 else if (RExC_flags & PMf_SINGLELINE)
3887 ret = reg_node(pRExC_state, SBOL);
3889 ret = reg_node(pRExC_state, BOL);
3890 Set_Node_Length(ret, 1); /* MJD */
3893 nextchar(pRExC_state);
3895 RExC_seen_zerolen++;
3896 if (RExC_flags & PMf_MULTILINE)
3897 ret = reg_node(pRExC_state, MEOL);
3898 else if (RExC_flags & PMf_SINGLELINE)
3899 ret = reg_node(pRExC_state, SEOL);
3901 ret = reg_node(pRExC_state, EOL);
3902 Set_Node_Length(ret, 1); /* MJD */
3905 nextchar(pRExC_state);
3906 if (RExC_flags & PMf_SINGLELINE)
3907 ret = reg_node(pRExC_state, SANY);
3909 ret = reg_node(pRExC_state, REG_ANY);
3910 *flagp |= HASWIDTH|SIMPLE;
3912 Set_Node_Length(ret, 1); /* MJD */
3916 char *oregcomp_parse = ++RExC_parse;
3917 ret = regclass(pRExC_state);
3918 if (*RExC_parse != ']') {
3919 RExC_parse = oregcomp_parse;
3920 vFAIL("Unmatched [");
3922 nextchar(pRExC_state);
3923 *flagp |= HASWIDTH|SIMPLE;
3924 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
3928 nextchar(pRExC_state);
3929 ret = reg(pRExC_state, 1, &flags);
3931 if (flags & TRYAGAIN) {
3932 if (RExC_parse == RExC_end) {
3933 /* Make parent create an empty node if needed. */
3941 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
3945 if (flags & TRYAGAIN) {
3949 vFAIL("Internal urp");
3950 /* Supposed to be caught earlier. */
3953 if (!regcurly(RExC_parse)) {
3962 vFAIL("Quantifier follows nothing");
3965 switch (*++RExC_parse) {
3967 RExC_seen_zerolen++;
3968 ret = reg_node(pRExC_state, SBOL);
3970 nextchar(pRExC_state);
3971 Set_Node_Length(ret, 2); /* MJD */
3974 ret = reg_node(pRExC_state, GPOS);
3975 RExC_seen |= REG_SEEN_GPOS;
3977 nextchar(pRExC_state);
3978 Set_Node_Length(ret, 2); /* MJD */
3981 ret = reg_node(pRExC_state, SEOL);
3983 RExC_seen_zerolen++; /* Do not optimize RE away */
3984 nextchar(pRExC_state);
3987 ret = reg_node(pRExC_state, EOS);
3989 RExC_seen_zerolen++; /* Do not optimize RE away */
3990 nextchar(pRExC_state);
3991 Set_Node_Length(ret, 2); /* MJD */
3994 ret = reg_node(pRExC_state, CANY);
3995 RExC_seen |= REG_SEEN_CANY;
3996 *flagp |= HASWIDTH|SIMPLE;
3997 nextchar(pRExC_state);
3998 Set_Node_Length(ret, 2); /* MJD */
4001 ret = reg_node(pRExC_state, CLUMP);
4003 nextchar(pRExC_state);
4004 Set_Node_Length(ret, 2); /* MJD */
4007 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4008 *flagp |= HASWIDTH|SIMPLE;
4009 nextchar(pRExC_state);
4010 Set_Node_Length(ret, 2); /* MJD */
4013 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4014 *flagp |= HASWIDTH|SIMPLE;
4015 nextchar(pRExC_state);
4016 Set_Node_Length(ret, 2); /* MJD */
4019 RExC_seen_zerolen++;
4020 RExC_seen |= REG_SEEN_LOOKBEHIND;
4021 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4023 nextchar(pRExC_state);
4024 Set_Node_Length(ret, 2); /* MJD */
4027 RExC_seen_zerolen++;
4028 RExC_seen |= REG_SEEN_LOOKBEHIND;
4029 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4031 nextchar(pRExC_state);
4032 Set_Node_Length(ret, 2); /* MJD */
4035 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4036 *flagp |= HASWIDTH|SIMPLE;
4037 nextchar(pRExC_state);
4038 Set_Node_Length(ret, 2); /* MJD */
4041 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4042 *flagp |= HASWIDTH|SIMPLE;
4043 nextchar(pRExC_state);
4044 Set_Node_Length(ret, 2); /* MJD */
4047 ret = reg_node(pRExC_state, DIGIT);
4048 *flagp |= HASWIDTH|SIMPLE;
4049 nextchar(pRExC_state);
4050 Set_Node_Length(ret, 2); /* MJD */
4053 ret = reg_node(pRExC_state, NDIGIT);
4054 *flagp |= HASWIDTH|SIMPLE;
4055 nextchar(pRExC_state);
4056 Set_Node_Length(ret, 2); /* MJD */
4061 char* oldregxend = RExC_end;
4062 char* parse_start = RExC_parse - 2;
4064 if (RExC_parse[1] == '{') {
4065 /* a lovely hack--pretend we saw [\pX] instead */
4066 RExC_end = strchr(RExC_parse, '}');
4068 U8 c = (U8)*RExC_parse;
4070 RExC_end = oldregxend;
4071 vFAIL2("Missing right brace on \\%c{}", c);
4076 RExC_end = RExC_parse + 2;
4077 if (RExC_end > oldregxend)
4078 RExC_end = oldregxend;
4082 ret = regclass(pRExC_state);
4084 RExC_end = oldregxend;
4087 Set_Node_Offset(ret, parse_start + 2);
4088 Set_Node_Cur_Length(ret);
4089 nextchar(pRExC_state);
4090 *flagp |= HASWIDTH|SIMPLE;
4103 case '1': case '2': case '3': case '4':
4104 case '5': case '6': case '7': case '8': case '9':
4106 const I32 num = atoi(RExC_parse);
4108 if (num > 9 && num >= RExC_npar)
4111 char * parse_start = RExC_parse - 1; /* MJD */
4112 while (isDIGIT(*RExC_parse))
4115 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4116 vFAIL("Reference to nonexistent group");
4118 ret = reganode(pRExC_state,
4119 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4123 /* override incorrect value set in reganode MJD */
4124 Set_Node_Offset(ret, parse_start+1);
4125 Set_Node_Cur_Length(ret); /* MJD */
4127 nextchar(pRExC_state);
4132 if (RExC_parse >= RExC_end)
4133 FAIL("Trailing \\");
4136 /* Do not generate "unrecognized" warnings here, we fall
4137 back into the quick-grab loop below */
4144 if (RExC_flags & PMf_EXTENDED) {
4145 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4146 if (RExC_parse < RExC_end)
4152 register STRLEN len;
4157 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4159 parse_start = RExC_parse - 1;
4165 ret = reg_node(pRExC_state,
4166 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4168 for (len = 0, p = RExC_parse - 1;
4169 len < 127 && p < RExC_end;
4174 if (RExC_flags & PMf_EXTENDED)
4175 p = regwhite(p, RExC_end);
4222 ender = ASCII_TO_NATIVE('\033');
4226 ender = ASCII_TO_NATIVE('\007');
4231 char* const e = strchr(p, '}');
4235 vFAIL("Missing right brace on \\x{}");
4238 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4239 | PERL_SCAN_DISALLOW_PREFIX;
4240 STRLEN numlen = e - p - 1;
4241 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4248 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4250 ender = grok_hex(p, &numlen, &flags, NULL);
4256 ender = UCHARAT(p++);
4257 ender = toCTRL(ender);
4259 case '0': case '1': case '2': case '3':case '4':
4260 case '5': case '6': case '7': case '8':case '9':
4262 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4265 ender = grok_oct(p, &numlen, &flags, NULL);
4275 FAIL("Trailing \\");
4278 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4279 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4280 goto normal_default;
4285 if (UTF8_IS_START(*p) && UTF) {
4287 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4295 if (RExC_flags & PMf_EXTENDED)
4296 p = regwhite(p, RExC_end);
4298 /* Prime the casefolded buffer. */
4299 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4301 if (ISMULT2(p)) { /* Back off on ?+*. */
4308 /* Emit all the Unicode characters. */
4310 for (foldbuf = tmpbuf;
4312 foldlen -= numlen) {
4313 ender = utf8_to_uvchr(foldbuf, &numlen);
4315 reguni(pRExC_state, ender, s, &unilen);
4318 /* In EBCDIC the numlen
4319 * and unilen can differ. */
4321 if (numlen >= foldlen)
4325 break; /* "Can't happen." */
4329 reguni(pRExC_state, ender, s, &unilen);
4338 REGC((char)ender, s++);
4346 /* Emit all the Unicode characters. */
4348 for (foldbuf = tmpbuf;
4350 foldlen -= numlen) {
4351 ender = utf8_to_uvchr(foldbuf, &numlen);
4353 reguni(pRExC_state, ender, s, &unilen);
4356 /* In EBCDIC the numlen
4357 * and unilen can differ. */
4359 if (numlen >= foldlen)
4367 reguni(pRExC_state, ender, s, &unilen);
4376 REGC((char)ender, s++);
4380 Set_Node_Cur_Length(ret); /* MJD */
4381 nextchar(pRExC_state);
4383 /* len is STRLEN which is unsigned, need to copy to signed */
4386 vFAIL("Internal disaster");
4390 if (len == 1 && UNI_IS_INVARIANT(ender))
4395 RExC_size += STR_SZ(len);
4397 RExC_emit += STR_SZ(len);
4402 /* If the encoding pragma is in effect recode the text of
4403 * any EXACT-kind nodes. */
4404 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
4405 STRLEN oldlen = STR_LEN(ret);
4406 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4410 if (sv_utf8_downgrade(sv, TRUE)) {
4411 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4412 const STRLEN newlen = SvCUR(sv);
4417 GET_RE_DEBUG_FLAGS_DECL;
4418 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4419 (int)oldlen, STRING(ret),
4421 Copy(s, STRING(ret), newlen, char);
4422 STR_LEN(ret) += newlen - oldlen;
4423 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4425 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4433 S_regwhite(pTHX_ char *p, const char *e)
4438 else if (*p == '#') {
4441 } while (p < e && *p != '\n');
4449 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4450 Character classes ([:foo:]) can also be negated ([:^foo:]).
4451 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4452 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4453 but trigger failures because they are currently unimplemented. */
4455 #define POSIXCC_DONE(c) ((c) == ':')
4456 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4457 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4460 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4462 I32 namedclass = OOB_NAMEDCLASS;
4464 if (value == '[' && RExC_parse + 1 < RExC_end &&
4465 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4466 POSIXCC(UCHARAT(RExC_parse))) {
4467 const char c = UCHARAT(RExC_parse);
4468 char* s = RExC_parse++;
4470 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4472 if (RExC_parse == RExC_end)
4473 /* Grandfather lone [:, [=, [. */
4476 const char* t = RExC_parse++; /* skip over the c */
4477 const char *posixcc;
4481 if (UCHARAT(RExC_parse) == ']') {
4482 RExC_parse++; /* skip over the ending ] */
4485 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4486 const I32 skip = t - posixcc;
4488 /* Initially switch on the length of the name. */
4491 if (memEQ(posixcc, "word", 4)) {
4492 /* this is not POSIX, this is the Perl \w */;
4494 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4498 /* Names all of length 5. */
4499 /* alnum alpha ascii blank cntrl digit graph lower
4500 print punct space upper */
4501 /* Offset 4 gives the best switch position. */
4502 switch (posixcc[4]) {
4504 if (memEQ(posixcc, "alph", 4)) {
4507 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4511 if (memEQ(posixcc, "spac", 4)) {
4514 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4518 if (memEQ(posixcc, "grap", 4)) {
4521 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4525 if (memEQ(posixcc, "asci", 4)) {
4528 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4532 if (memEQ(posixcc, "blan", 4)) {
4535 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4539 if (memEQ(posixcc, "cntr", 4)) {
4542 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4546 if (memEQ(posixcc, "alnu", 4)) {
4549 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4553 if (memEQ(posixcc, "lowe", 4)) {
4556 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4558 if (memEQ(posixcc, "uppe", 4)) {
4561 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4565 if (memEQ(posixcc, "digi", 4)) {
4568 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4570 if (memEQ(posixcc, "prin", 4)) {
4573 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4575 if (memEQ(posixcc, "punc", 4)) {
4578 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4584 if (memEQ(posixcc, "xdigit", 6)) {
4586 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4591 if (namedclass == OOB_NAMEDCLASS)
4593 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4596 assert (posixcc[skip] == ':');
4597 assert (posixcc[skip+1] == ']');
4598 } else if (!SIZE_ONLY) {
4599 /* [[=foo=]] and [[.foo.]] are still future. */
4601 /* adjust RExC_parse so the warning shows after
4603 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4605 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4608 /* Maternal grandfather:
4609 * "[:" ending in ":" but not in ":]" */
4619 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4621 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
4622 const char *s = RExC_parse;
4623 const char c = *s++;
4625 while(*s && isALNUM(*s))
4627 if (*s && c == *s && s[1] == ']') {
4628 if (ckWARN(WARN_REGEXP))
4630 "POSIX syntax [%c %c] belongs inside character classes",
4633 /* [[=foo=]] and [[.foo.]] are still future. */
4634 if (POSIXCC_NOTYET(c)) {
4635 /* adjust RExC_parse so the error shows after
4637 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4639 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4646 S_regclass(pTHX_ RExC_state_t *pRExC_state)
4649 register UV nextvalue;
4650 register IV prevvalue = OOB_UNICODE;
4651 register IV range = 0;
4652 register regnode *ret;
4655 char *rangebegin = 0;
4656 bool need_class = 0;
4657 SV *listsv = Nullsv;
4660 bool optimize_invert = TRUE;
4661 AV* unicode_alternate = 0;
4663 UV literal_endpoint = 0;
4666 ret = reganode(pRExC_state, ANYOF, 0);
4669 ANYOF_FLAGS(ret) = 0;
4671 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4675 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4679 RExC_size += ANYOF_SKIP;
4681 RExC_emit += ANYOF_SKIP;
4683 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4685 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4686 ANYOF_BITMAP_ZERO(ret);
4687 listsv = newSVpvn("# comment\n", 10);
4690 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4692 if (!SIZE_ONLY && POSIXCC(nextvalue))
4693 checkposixcc(pRExC_state);
4695 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4696 if (UCHARAT(RExC_parse) == ']')
4699 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4703 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4706 rangebegin = RExC_parse;
4708 value = utf8n_to_uvchr((U8*)RExC_parse,
4709 RExC_end - RExC_parse,
4711 RExC_parse += numlen;
4714 value = UCHARAT(RExC_parse++);
4715 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4716 if (value == '[' && POSIXCC(nextvalue))
4717 namedclass = regpposixcc(pRExC_state, value);
4718 else if (value == '\\') {
4720 value = utf8n_to_uvchr((U8*)RExC_parse,
4721 RExC_end - RExC_parse,
4723 RExC_parse += numlen;
4726 value = UCHARAT(RExC_parse++);
4727 /* Some compilers cannot handle switching on 64-bit integer
4728 * values, therefore value cannot be an UV. Yes, this will
4729 * be a problem later if we want switch on Unicode.
4730 * A similar issue a little bit later when switching on
4731 * namedclass. --jhi */
4732 switch ((I32)value) {
4733 case 'w': namedclass = ANYOF_ALNUM; break;
4734 case 'W': namedclass = ANYOF_NALNUM; break;
4735 case 's': namedclass = ANYOF_SPACE; break;
4736 case 'S': namedclass = ANYOF_NSPACE; break;
4737 case 'd': namedclass = ANYOF_DIGIT; break;
4738 case 'D': namedclass = ANYOF_NDIGIT; break;
4741 if (RExC_parse >= RExC_end)
4742 vFAIL2("Empty \\%c{}", (U8)value);
4743 if (*RExC_parse == '{') {
4744 const U8 c = (U8)value;
4745 e = strchr(RExC_parse++, '}');
4747 vFAIL2("Missing right brace on \\%c{}", c);
4748 while (isSPACE(UCHARAT(RExC_parse)))
4750 if (e == RExC_parse)
4751 vFAIL2("Empty \\%c{}", c);
4753 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4761 if (UCHARAT(RExC_parse) == '^') {
4764 value = value == 'p' ? 'P' : 'p'; /* toggle */
4765 while (isSPACE(UCHARAT(RExC_parse))) {
4771 Perl_sv_catpvf(aTHX_ listsv,
4772 "+utf8::%.*s\n", (int)n, RExC_parse);
4774 Perl_sv_catpvf(aTHX_ listsv,
4775 "!utf8::%.*s\n", (int)n, RExC_parse);
4778 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4779 namedclass = ANYOF_MAX; /* no official name, but it's named */
4781 case 'n': value = '\n'; break;
4782 case 'r': value = '\r'; break;
4783 case 't': value = '\t'; break;
4784 case 'f': value = '\f'; break;
4785 case 'b': value = '\b'; break;
4786 case 'e': value = ASCII_TO_NATIVE('\033');break;
4787 case 'a': value = ASCII_TO_NATIVE('\007');break;
4789 if (*RExC_parse == '{') {
4790 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4791 | PERL_SCAN_DISALLOW_PREFIX;
4792 e = strchr(RExC_parse++, '}');
4794 vFAIL("Missing right brace on \\x{}");
4796 numlen = e - RExC_parse;
4797 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4801 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4803 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
4804 RExC_parse += numlen;
4808 value = UCHARAT(RExC_parse++);
4809 value = toCTRL(value);
4811 case '0': case '1': case '2': case '3': case '4':
4812 case '5': case '6': case '7': case '8': case '9':
4816 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
4817 RExC_parse += numlen;
4821 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
4823 "Unrecognized escape \\%c in character class passed through",
4827 } /* end of \blah */
4833 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4835 if (!SIZE_ONLY && !need_class)
4836 ANYOF_CLASS_ZERO(ret);
4840 /* a bad range like a-\d, a-[:digit:] ? */
4843 if (ckWARN(WARN_REGEXP))
4845 "False [] range \"%*.*s\"",
4846 RExC_parse - rangebegin,
4847 RExC_parse - rangebegin,
4849 if (prevvalue < 256) {
4850 ANYOF_BITMAP_SET(ret, prevvalue);
4851 ANYOF_BITMAP_SET(ret, '-');
4854 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4855 Perl_sv_catpvf(aTHX_ listsv,
4856 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
4860 range = 0; /* this was not a true range */
4864 const char *what = NULL;
4867 if (namedclass > OOB_NAMEDCLASS)
4868 optimize_invert = FALSE;
4869 /* Possible truncation here but in some 64-bit environments
4870 * the compiler gets heartburn about switch on 64-bit values.
4871 * A similar issue a little earlier when switching on value.
4873 switch ((I32)namedclass) {
4876 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
4878 for (value = 0; value < 256; value++)
4880 ANYOF_BITMAP_SET(ret, value);
4887 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
4889 for (value = 0; value < 256; value++)
4890 if (!isALNUM(value))
4891 ANYOF_BITMAP_SET(ret, value);
4898 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
4900 for (value = 0; value < 256; value++)
4901 if (isALNUMC(value))
4902 ANYOF_BITMAP_SET(ret, value);
4909 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
4911 for (value = 0; value < 256; value++)
4912 if (!isALNUMC(value))
4913 ANYOF_BITMAP_SET(ret, value);
4920 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
4922 for (value = 0; value < 256; value++)
4924 ANYOF_BITMAP_SET(ret, value);
4931 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
4933 for (value = 0; value < 256; value++)
4934 if (!isALPHA(value))
4935 ANYOF_BITMAP_SET(ret, value);
4942 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
4945 for (value = 0; value < 128; value++)
4946 ANYOF_BITMAP_SET(ret, value);
4948 for (value = 0; value < 256; value++) {
4950 ANYOF_BITMAP_SET(ret, value);
4959 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
4962 for (value = 128; value < 256; value++)
4963 ANYOF_BITMAP_SET(ret, value);
4965 for (value = 0; value < 256; value++) {
4966 if (!isASCII(value))
4967 ANYOF_BITMAP_SET(ret, value);
4976 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4978 for (value = 0; value < 256; value++)
4980 ANYOF_BITMAP_SET(ret, value);
4987 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4989 for (value = 0; value < 256; value++)
4990 if (!isBLANK(value))
4991 ANYOF_BITMAP_SET(ret, value);
4998 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5000 for (value = 0; value < 256; value++)
5002 ANYOF_BITMAP_SET(ret, value);
5009 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5011 for (value = 0; value < 256; value++)
5012 if (!isCNTRL(value))
5013 ANYOF_BITMAP_SET(ret, value);
5020 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5022 /* consecutive digits assumed */
5023 for (value = '0'; value <= '9'; value++)
5024 ANYOF_BITMAP_SET(ret, value);
5031 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5033 /* consecutive digits assumed */
5034 for (value = 0; value < '0'; value++)
5035 ANYOF_BITMAP_SET(ret, value);
5036 for (value = '9' + 1; value < 256; value++)
5037 ANYOF_BITMAP_SET(ret, value);
5044 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5046 for (value = 0; value < 256; value++)
5048 ANYOF_BITMAP_SET(ret, value);
5055 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5057 for (value = 0; value < 256; value++)
5058 if (!isGRAPH(value))
5059 ANYOF_BITMAP_SET(ret, value);
5066 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5068 for (value = 0; value < 256; value++)
5070 ANYOF_BITMAP_SET(ret, value);
5077 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5079 for (value = 0; value < 256; value++)
5080 if (!isLOWER(value))
5081 ANYOF_BITMAP_SET(ret, value);
5088 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5090 for (value = 0; value < 256; value++)
5092 ANYOF_BITMAP_SET(ret, value);
5099 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5101 for (value = 0; value < 256; value++)
5102 if (!isPRINT(value))
5103 ANYOF_BITMAP_SET(ret, value);
5110 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5112 for (value = 0; value < 256; value++)
5113 if (isPSXSPC(value))
5114 ANYOF_BITMAP_SET(ret, value);
5121 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5123 for (value = 0; value < 256; value++)
5124 if (!isPSXSPC(value))
5125 ANYOF_BITMAP_SET(ret, value);
5132 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5134 for (value = 0; value < 256; value++)
5136 ANYOF_BITMAP_SET(ret, value);
5143 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5145 for (value = 0; value < 256; value++)
5146 if (!isPUNCT(value))
5147 ANYOF_BITMAP_SET(ret, value);
5154 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5156 for (value = 0; value < 256; value++)
5158 ANYOF_BITMAP_SET(ret, value);
5165 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5167 for (value = 0; value < 256; value++)
5168 if (!isSPACE(value))
5169 ANYOF_BITMAP_SET(ret, value);
5176 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5178 for (value = 0; value < 256; value++)
5180 ANYOF_BITMAP_SET(ret, value);
5187 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5189 for (value = 0; value < 256; value++)
5190 if (!isUPPER(value))
5191 ANYOF_BITMAP_SET(ret, value);
5198 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5200 for (value = 0; value < 256; value++)
5201 if (isXDIGIT(value))
5202 ANYOF_BITMAP_SET(ret, value);
5209 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5211 for (value = 0; value < 256; value++)
5212 if (!isXDIGIT(value))
5213 ANYOF_BITMAP_SET(ret, value);
5219 /* this is to handle \p and \P */
5222 vFAIL("Invalid [::] class");
5226 /* Strings such as "+utf8::isWord\n" */
5227 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5230 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5233 } /* end of namedclass \blah */
5236 if (prevvalue > (IV)value) /* b-a */ {
5237 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
5238 RExC_parse - rangebegin,
5239 RExC_parse - rangebegin,
5241 range = 0; /* not a valid range */
5245 prevvalue = value; /* save the beginning of the range */
5246 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5247 RExC_parse[1] != ']') {
5250 /* a bad range like \w-, [:word:]- ? */
5251 if (namedclass > OOB_NAMEDCLASS) {
5252 if (ckWARN(WARN_REGEXP))
5254 "False [] range \"%*.*s\"",
5255 RExC_parse - rangebegin,
5256 RExC_parse - rangebegin,
5259 ANYOF_BITMAP_SET(ret, '-');
5261 range = 1; /* yeah, it's a range! */
5262 continue; /* but do it the next time */
5266 /* now is the next time */
5270 if (prevvalue < 256) {
5271 const IV ceilvalue = value < 256 ? value : 255;
5274 /* In EBCDIC [\x89-\x91] should include
5275 * the \x8e but [i-j] should not. */
5276 if (literal_endpoint == 2 &&
5277 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5278 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5280 if (isLOWER(prevvalue)) {
5281 for (i = prevvalue; i <= ceilvalue; i++)
5283 ANYOF_BITMAP_SET(ret, i);
5285 for (i = prevvalue; i <= ceilvalue; i++)
5287 ANYOF_BITMAP_SET(ret, i);
5292 for (i = prevvalue; i <= ceilvalue; i++)
5293 ANYOF_BITMAP_SET(ret, i);
5295 if (value > 255 || UTF) {
5296 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5297 const UV natvalue = NATIVE_TO_UNI(value);
5299 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5300 if (prevnatvalue < natvalue) { /* what about > ? */
5301 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5302 prevnatvalue, natvalue);
5304 else if (prevnatvalue == natvalue) {
5305 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5307 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5309 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5311 /* If folding and foldable and a single
5312 * character, insert also the folded version
5313 * to the charclass. */
5315 if (foldlen == (STRLEN)UNISKIP(f))
5316 Perl_sv_catpvf(aTHX_ listsv,
5319 /* Any multicharacter foldings
5320 * require the following transform:
5321 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5322 * where E folds into "pq" and F folds
5323 * into "rst", all other characters
5324 * fold to single characters. We save
5325 * away these multicharacter foldings,
5326 * to be later saved as part of the
5327 * additional "s" data. */
5330 if (!unicode_alternate)
5331 unicode_alternate = newAV();
5332 sv = newSVpvn((char*)foldbuf, foldlen);
5334 av_push(unicode_alternate, sv);
5338 /* If folding and the value is one of the Greek
5339 * sigmas insert a few more sigmas to make the
5340 * folding rules of the sigmas to work right.
5341 * Note that not all the possible combinations
5342 * are handled here: some of them are handled
5343 * by the standard folding rules, and some of
5344 * them (literal or EXACTF cases) are handled
5345 * during runtime in regexec.c:S_find_byclass(). */
5346 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5347 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5348 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5349 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5350 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5352 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5353 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5354 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5359 literal_endpoint = 0;
5363 range = 0; /* this range (if it was one) is done now */
5367 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5369 RExC_size += ANYOF_CLASS_ADD_SKIP;
5371 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5374 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5376 /* If the only flag is folding (plus possibly inversion). */
5377 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5379 for (value = 0; value < 256; ++value) {
5380 if (ANYOF_BITMAP_TEST(ret, value)) {
5381 UV fold = PL_fold[value];
5384 ANYOF_BITMAP_SET(ret, fold);
5387 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5390 /* optimize inverted simple patterns (e.g. [^a-z]) */
5391 if (!SIZE_ONLY && optimize_invert &&
5392 /* If the only flag is inversion. */
5393 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5394 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5395 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5396 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5403 /* The 0th element stores the character class description
5404 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5405 * to initialize the appropriate swash (which gets stored in
5406 * the 1st element), and also useful for dumping the regnode.
5407 * The 2nd element stores the multicharacter foldings,
5408 * used later (regexec.c:S_reginclass()). */
5409 av_store(av, 0, listsv);
5410 av_store(av, 1, NULL);
5411 av_store(av, 2, (SV*)unicode_alternate);
5412 rv = newRV_noinc((SV*)av);
5413 n = add_data(pRExC_state, 1, "s");
5414 RExC_rx->data->data[n] = (void*)rv;
5422 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5424 char* retval = RExC_parse++;
5427 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5428 RExC_parse[2] == '#') {
5429 while (*RExC_parse != ')') {
5430 if (RExC_parse == RExC_end)
5431 FAIL("Sequence (?#... not terminated");
5437 if (RExC_flags & PMf_EXTENDED) {
5438 if (isSPACE(*RExC_parse)) {
5442 else if (*RExC_parse == '#') {
5443 while (RExC_parse < RExC_end)
5444 if (*RExC_parse++ == '\n') break;
5453 - reg_node - emit a node
5455 STATIC regnode * /* Location. */
5456 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5458 register regnode *ptr;
5459 regnode * const ret = RExC_emit;
5462 SIZE_ALIGN(RExC_size);
5467 NODE_ALIGN_FILL(ret);
5469 FILL_ADVANCE_NODE(ptr, op);
5470 if (RExC_offsets) { /* MJD */
5471 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5472 "reg_node", __LINE__,
5474 RExC_emit - RExC_emit_start > RExC_offsets[0]
5475 ? "Overwriting end of array!\n" : "OK",
5476 RExC_emit - RExC_emit_start,
5477 RExC_parse - RExC_start,
5479 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5488 - reganode - emit a node with an argument
5490 STATIC regnode * /* Location. */
5491 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5493 register regnode *ptr;
5494 regnode * const ret = RExC_emit;
5497 SIZE_ALIGN(RExC_size);
5502 NODE_ALIGN_FILL(ret);
5504 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5505 if (RExC_offsets) { /* MJD */
5506 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5510 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5511 "Overwriting end of array!\n" : "OK",
5512 RExC_emit - RExC_emit_start,
5513 RExC_parse - RExC_start,
5515 Set_Cur_Node_Offset;
5524 - reguni - emit (if appropriate) a Unicode character
5527 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
5529 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5533 - reginsert - insert an operator in front of already-emitted operand
5535 * Means relocating the operand.
5538 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5540 register regnode *src;
5541 register regnode *dst;
5542 register regnode *place;
5543 const int offset = regarglen[(U8)op];
5545 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5548 RExC_size += NODE_STEP_REGNODE + offset;
5553 RExC_emit += NODE_STEP_REGNODE + offset;
5555 while (src > opnd) {
5556 StructCopy(--src, --dst, regnode);
5557 if (RExC_offsets) { /* MJD 20010112 */
5558 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5562 dst - RExC_emit_start > RExC_offsets[0]
5563 ? "Overwriting end of array!\n" : "OK",
5564 src - RExC_emit_start,
5565 dst - RExC_emit_start,
5567 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5568 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5573 place = opnd; /* Op node, where operand used to be. */
5574 if (RExC_offsets) { /* MJD */
5575 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5579 place - RExC_emit_start > RExC_offsets[0]
5580 ? "Overwriting end of array!\n" : "OK",
5581 place - RExC_emit_start,
5582 RExC_parse - RExC_start,
5584 Set_Node_Offset(place, RExC_parse);
5585 Set_Node_Length(place, 1);
5587 src = NEXTOPER(place);
5588 FILL_ADVANCE_NODE(place, op);
5589 Zero(src, offset, regnode);
5593 - regtail - set the next-pointer at the end of a node chain of p to val.
5596 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5598 register regnode *scan;
5603 /* Find last node. */
5606 regnode * const temp = regnext(scan);
5612 if (reg_off_by_arg[OP(scan)]) {
5613 ARG_SET(scan, val - scan);
5616 NEXT_OFF(scan) = val - scan;
5621 - regoptail - regtail on operand of first argument; nop if operandless
5624 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
5626 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
5627 if (p == NULL || SIZE_ONLY)
5629 if (PL_regkind[(U8)OP(p)] == BRANCH) {
5630 regtail(pRExC_state, NEXTOPER(p), val);
5632 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
5633 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
5640 - regcurly - a little FSA that accepts {\d+,?\d*}
5643 S_regcurly(pTHX_ register const char *s)
5662 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5665 Perl_regdump(pTHX_ regexp *r)
5668 SV *sv = sv_newmortal();
5670 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
5672 /* Header fields of interest. */
5673 if (r->anchored_substr)
5674 PerlIO_printf(Perl_debug_log,
5675 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5677 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5678 SvPVX_const(r->anchored_substr),
5680 SvTAIL(r->anchored_substr) ? "$" : "",
5681 (IV)r->anchored_offset);
5682 else if (r->anchored_utf8)
5683 PerlIO_printf(Perl_debug_log,
5684 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5686 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5687 SvPVX_const(r->anchored_utf8),
5689 SvTAIL(r->anchored_utf8) ? "$" : "",
5690 (IV)r->anchored_offset);
5691 if (r->float_substr)
5692 PerlIO_printf(Perl_debug_log,
5693 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5695 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
5696 SvPVX_const(r->float_substr),
5698 SvTAIL(r->float_substr) ? "$" : "",
5699 (IV)r->float_min_offset, (UV)r->float_max_offset);
5700 else if (r->float_utf8)
5701 PerlIO_printf(Perl_debug_log,
5702 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
5704 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5705 SvPVX_const(r->float_utf8),
5707 SvTAIL(r->float_utf8) ? "$" : "",
5708 (IV)r->float_min_offset, (UV)r->float_max_offset);
5709 if (r->check_substr || r->check_utf8)
5710 PerlIO_printf(Perl_debug_log,
5711 r->check_substr == r->float_substr
5712 && r->check_utf8 == r->float_utf8
5713 ? "(checking floating" : "(checking anchored");
5714 if (r->reganch & ROPT_NOSCAN)
5715 PerlIO_printf(Perl_debug_log, " noscan");
5716 if (r->reganch & ROPT_CHECK_ALL)
5717 PerlIO_printf(Perl_debug_log, " isall");
5718 if (r->check_substr || r->check_utf8)
5719 PerlIO_printf(Perl_debug_log, ") ");
5721 if (r->regstclass) {
5722 regprop(sv, r->regstclass);
5723 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
5725 if (r->reganch & ROPT_ANCH) {
5726 PerlIO_printf(Perl_debug_log, "anchored");
5727 if (r->reganch & ROPT_ANCH_BOL)
5728 PerlIO_printf(Perl_debug_log, "(BOL)");
5729 if (r->reganch & ROPT_ANCH_MBOL)
5730 PerlIO_printf(Perl_debug_log, "(MBOL)");
5731 if (r->reganch & ROPT_ANCH_SBOL)
5732 PerlIO_printf(Perl_debug_log, "(SBOL)");
5733 if (r->reganch & ROPT_ANCH_GPOS)
5734 PerlIO_printf(Perl_debug_log, "(GPOS)");
5735 PerlIO_putc(Perl_debug_log, ' ');
5737 if (r->reganch & ROPT_GPOS_SEEN)
5738 PerlIO_printf(Perl_debug_log, "GPOS ");
5739 if (r->reganch & ROPT_SKIP)
5740 PerlIO_printf(Perl_debug_log, "plus ");
5741 if (r->reganch & ROPT_IMPLICIT)
5742 PerlIO_printf(Perl_debug_log, "implicit ");
5743 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
5744 if (r->reganch & ROPT_EVAL_SEEN)
5745 PerlIO_printf(Perl_debug_log, "with eval ");
5746 PerlIO_printf(Perl_debug_log, "\n");
5748 const U32 len = r->offsets[0];
5749 GET_RE_DEBUG_FLAGS_DECL;
5752 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5753 for (i = 1; i <= len; i++)
5754 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5755 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5756 PerlIO_printf(Perl_debug_log, "\n");
5759 #endif /* DEBUGGING */
5763 - regprop - printable representation of opcode
5766 Perl_regprop(pTHX_ SV *sv, const regnode *o)
5771 sv_setpvn(sv, "", 0);
5772 if (OP(o) >= reg_num) /* regnode.type is unsigned */
5773 /* It would be nice to FAIL() here, but this may be called from
5774 regexec.c, and it would be hard to supply pRExC_state. */
5775 Perl_croak(aTHX_ "Corrupted regexp opcode");
5776 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
5778 k = PL_regkind[(U8)OP(o)];
5781 SV *dsv = sv_2mortal(newSVpvn("", 0));
5782 /* Using is_utf8_string() is a crude hack but it may
5783 * be the best for now since we have no flag "this EXACTish
5784 * node was UTF-8" --jhi */
5785 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5786 const char *s = do_utf8 ?
5787 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5788 UNI_DISPLAY_REGEX) :
5790 const int len = do_utf8 ?
5793 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5797 } else if (k == TRIE) {/*
5798 this isn't always safe, as Pl_regdata may not be for this regex yet
5799 (depending on where its called from) so its being moved to dumpuntil
5801 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5802 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5805 trie->uniquecharcount,
5808 } else if (k == CURLY) {
5809 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
5810 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5811 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
5813 else if (k == WHILEM && o->flags) /* Ordinal/of */
5814 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
5815 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
5816 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5817 else if (k == LOGICAL)
5818 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
5819 else if (k == ANYOF) {
5820 int i, rangestart = -1;
5821 U8 flags = ANYOF_FLAGS(o);
5822 const char * const anyofs[] = { /* Should be synchronized with
5823 * ANYOF_ #xdefines in regcomp.h */
5856 if (flags & ANYOF_LOCALE)
5857 sv_catpv(sv, "{loc}");
5858 if (flags & ANYOF_FOLD)
5859 sv_catpv(sv, "{i}");
5860 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
5861 if (flags & ANYOF_INVERT)
5863 for (i = 0; i <= 256; i++) {
5864 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5865 if (rangestart == -1)
5867 } else if (rangestart != -1) {
5868 if (i <= rangestart + 3)
5869 for (; rangestart < i; rangestart++)
5870 put_byte(sv, rangestart);
5872 put_byte(sv, rangestart);
5874 put_byte(sv, i - 1);
5880 if (o->flags & ANYOF_CLASS)
5881 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5882 if (ANYOF_CLASS_TEST(o,i))
5883 sv_catpv(sv, anyofs[i]);
5885 if (flags & ANYOF_UNICODE)
5886 sv_catpv(sv, "{unicode}");
5887 else if (flags & ANYOF_UNICODE_ALL)
5888 sv_catpv(sv, "{unicode_all}");
5892 SV *sw = regclass_swash(o, FALSE, &lv, 0);
5896 U8 s[UTF8_MAXBYTES_CASE+1];
5898 for (i = 0; i <= 256; i++) { /* just the first 256 */
5899 uvchr_to_utf8(s, i);
5901 if (i < 256 && swash_fetch(sw, s, TRUE)) {
5902 if (rangestart == -1)
5904 } else if (rangestart != -1) {
5907 if (i <= rangestart + 3)
5908 for (; rangestart < i; rangestart++) {
5910 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
5915 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
5918 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
5925 sv_catpv(sv, "..."); /* et cetera */
5929 char *s = savesvpv(lv);
5932 while(*s && *s != '\n') s++;
5935 const char *t = ++s;
5953 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5955 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
5956 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
5957 #endif /* DEBUGGING */
5961 Perl_re_intuit_string(pTHX_ regexp *prog)
5962 { /* Assume that RE_INTUIT is set */
5963 GET_RE_DEBUG_FLAGS_DECL;
5966 const char *s = SvPV_nolen_const(prog->check_substr
5967 ? prog->check_substr : prog->check_utf8);
5969 if (!PL_colorset) reginitcolors();
5970 PerlIO_printf(Perl_debug_log,
5971 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
5973 prog->check_substr ? "" : "utf8 ",
5974 PL_colors[5],PL_colors[0],
5977 (strlen(s) > 60 ? "..." : ""));
5980 return prog->check_substr ? prog->check_substr : prog->check_utf8;
5984 Perl_pregfree(pTHX_ struct regexp *r)
5988 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
5989 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
5993 if (!r || (--r->refcnt > 0))
5995 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
5996 const char *s = (r->reganch & ROPT_UTF8)
5997 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
5998 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
5999 const int len = SvCUR(dsv);
6002 PerlIO_printf(Perl_debug_log,
6003 "%sFreeing REx:%s %s%*.*s%s%s\n",
6004 PL_colors[4],PL_colors[5],PL_colors[0],
6007 len > 60 ? "..." : "");
6011 Safefree(r->precomp);
6012 if (r->offsets) /* 20010421 MJD */
6013 Safefree(r->offsets);
6014 RX_MATCH_COPY_FREE(r);
6015 #ifdef PERL_OLD_COPY_ON_WRITE
6017 SvREFCNT_dec(r->saved_copy);
6020 if (r->anchored_substr)
6021 SvREFCNT_dec(r->anchored_substr);
6022 if (r->anchored_utf8)
6023 SvREFCNT_dec(r->anchored_utf8);
6024 if (r->float_substr)
6025 SvREFCNT_dec(r->float_substr);
6027 SvREFCNT_dec(r->float_utf8);
6028 Safefree(r->substrs);
6031 int n = r->data->count;
6032 PAD* new_comppad = NULL;
6037 /* If you add a ->what type here, update the comment in regcomp.h */
6038 switch (r->data->what[n]) {
6040 SvREFCNT_dec((SV*)r->data->data[n]);
6043 Safefree(r->data->data[n]);
6046 new_comppad = (AV*)r->data->data[n];
6049 if (new_comppad == NULL)
6050 Perl_croak(aTHX_ "panic: pregfree comppad");
6051 PAD_SAVE_LOCAL(old_comppad,
6052 /* Watch out for global destruction's random ordering. */
6053 (SvTYPE(new_comppad) == SVt_PVAV) ?
6054 new_comppad : Null(PAD *)
6057 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6060 op_free((OP_4tree*)r->data->data[n]);
6062 PAD_RESTORE_LOCAL(old_comppad);
6063 SvREFCNT_dec((SV*)new_comppad);
6070 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6073 refcount = trie->refcount--;
6077 Safefree(trie->charmap);
6078 if (trie->widecharmap)
6079 SvREFCNT_dec((SV*)trie->widecharmap);
6081 Safefree(trie->states);
6083 Safefree(trie->trans);
6086 SvREFCNT_dec((SV*)trie->words);
6087 if (trie->revcharmap)
6088 SvREFCNT_dec((SV*)trie->revcharmap);
6090 Safefree(r->data->data[n]); /* do this last!!!! */
6095 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6098 Safefree(r->data->what);
6101 Safefree(r->startp);
6107 - regnext - dig the "next" pointer out of a node
6110 Perl_regnext(pTHX_ register regnode *p)
6112 register I32 offset;
6114 if (p == &PL_regdummy)
6117 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6125 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6128 STRLEN l1 = strlen(pat1);
6129 STRLEN l2 = strlen(pat2);
6132 const char *message;
6138 Copy(pat1, buf, l1 , char);
6139 Copy(pat2, buf + l1, l2 , char);
6140 buf[l1 + l2] = '\n';
6141 buf[l1 + l2 + 1] = '\0';
6143 /* ANSI variant takes additional second argument */
6144 va_start(args, pat2);
6148 msv = vmess(buf, &args);
6150 message = SvPV_const(msv,l1);
6153 Copy(message, buf, l1 , char);
6154 buf[l1-1] = '\0'; /* Overwrite \n */
6155 Perl_croak(aTHX_ "%s", buf);
6158 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6161 Perl_save_re_context(pTHX)
6163 SAVEI32(PL_reg_flags); /* from regexec.c */
6165 SAVEPPTR(PL_reginput); /* String-input pointer. */
6166 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6167 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
6168 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6169 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6170 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
6171 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
6172 SAVEPPTR(PL_regtill); /* How far we are required to go. */
6173 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
6174 PL_reg_start_tmp = 0;
6175 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6176 PL_reg_start_tmpl = 0;
6177 SAVEVPTR(PL_regdata);
6178 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6179 SAVEI32(PL_regnarrate); /* from regexec.c */
6180 SAVEVPTR(PL_regprogram); /* from regexec.c */
6181 SAVEINT(PL_regindent); /* from regexec.c */
6182 SAVEVPTR(PL_regcc); /* from regexec.c */
6183 SAVEVPTR(PL_curcop);
6184 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6185 SAVEVPTR(PL_reg_re); /* from regexec.c */
6186 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6187 SAVESPTR(PL_reg_sv); /* from regexec.c */
6188 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
6189 SAVEVPTR(PL_reg_magic); /* from regexec.c */
6190 SAVEI32(PL_reg_oldpos); /* from regexec.c */
6191 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6192 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
6193 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6194 PL_reg_oldsaved = Nullch;
6195 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6196 PL_reg_oldsavedlen = 0;
6197 #ifdef PERL_OLD_COPY_ON_WRITE
6201 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6203 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6204 PL_reg_leftiter = 0;
6205 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6206 PL_reg_poscache = Nullch;
6207 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6208 PL_reg_poscache_size = 0;
6209 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
6210 SAVEI32(PL_regnpar); /* () count. */
6211 SAVEI32(PL_regsize); /* from regexec.c */
6214 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6217 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6219 for (i = 1; i <= rx->nparens; i++) {
6221 char digits[TYPE_CHARS(long)];
6222 sprintf(digits, "%lu", (long)i);
6223 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6230 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
6235 clear_re(pTHX_ void *r)
6237 ReREFCNT_dec((regexp *)r);
6243 S_put_byte(pTHX_ SV *sv, int c)
6245 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6246 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6247 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6248 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6250 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6255 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6257 register U8 op = EXACT; /* Arbitrary non-END op. */
6258 register regnode *next;
6260 while (op != END && (!last || node < last)) {
6261 /* While that wasn't END last time... */
6267 next = regnext(node);
6269 if (OP(node) == OPTIMIZED)
6272 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6273 (int)(2*l + 1), "", SvPVX_const(sv));
6274 if (next == NULL) /* Next ptr. */
6275 PerlIO_printf(Perl_debug_log, "(0)");
6277 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6278 (void)PerlIO_putc(Perl_debug_log, '\n');
6280 if (PL_regkind[(U8)op] == BRANCHJ) {
6281 register regnode *nnode = (OP(next) == LONGJMP
6284 if (last && nnode > last)
6286 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6288 else if (PL_regkind[(U8)op] == BRANCH) {
6289 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6291 else if ( PL_regkind[(U8)op] == TRIE ) {
6292 const I32 n = ARG(node);
6293 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6294 const I32 arry_len = av_len(trie->words)+1;
6296 PerlIO_printf(Perl_debug_log,
6297 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6301 (int)trie->charcount,
6302 trie->uniquecharcount,
6303 (IV)trie->laststate-1,
6304 node->flags ? " EVAL mode" : "");
6306 for (word_idx=0; word_idx < arry_len; word_idx++) {
6307 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6309 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6312 SvPV_nolen_const(*elem_ptr),
6317 PerlIO_printf(Perl_debug_log, "(0)\n");
6319 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6325 node = NEXTOPER(node);
6326 node += regarglen[(U8)op];
6329 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6330 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6331 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6333 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6334 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6337 else if ( op == PLUS || op == STAR) {
6338 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6340 else if (op == ANYOF) {
6341 /* arglen 1 + class block */
6342 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6343 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6344 node = NEXTOPER(node);
6346 else if (PL_regkind[(U8)op] == EXACT) {
6347 /* Literal string, where present. */
6348 node += NODE_SZ_STR(node) - 1;
6349 node = NEXTOPER(node);
6352 node = NEXTOPER(node);
6353 node += regarglen[(U8)op];
6355 if (op == CURLYX || op == OPEN)
6357 else if (op == WHILEM)
6363 #endif /* DEBUGGING */
6367 * c-indentation-style: bsd
6369 * indent-tabs-mode: t
6372 * ex: set ts=8 sts=4 sw=4 noet: