5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 if (UTF) SvUTF8_on(tmp); \
758 av_push( TRIE_REVCHARMAP(trie), tmp ); \
761 #define TRIE_READ_CHAR STMT_START { \
765 if ( foldlen > 0 ) { \
766 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
772 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
773 foldlen -= UNISKIP( uvc ); \
774 scan = foldbuf + UNISKIP( uvc ); \
777 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
786 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
787 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
788 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
789 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
791 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
792 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
793 TRIE_LIST_LEN( state ) *= 2; \
794 Renew( trie->states[ state ].trans.list, \
795 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
798 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
799 TRIE_LIST_CUR( state )++; \
802 #define TRIE_LIST_NEW(state) STMT_START { \
803 Newxz( trie->states[ state ].trans.list, \
804 4, reg_trie_trans_le ); \
805 TRIE_LIST_CUR( state ) = 1; \
806 TRIE_LIST_LEN( state ) = 4; \
809 #define TRIE_HANDLE_WORD(state) STMT_START { \
810 if ( !trie->states[ state ].wordnum ) { \
811 /* we haven't inserted this word into the structure yet. */ \
813 trie->wordlen[ curword ] = wordlen; \
814 trie->states[ state ].wordnum = ++curword; \
816 /* store the word for dumping */ \
818 if (OP(noper) != NOTHING) \
819 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
821 tmp = newSVpvn( "", 0 ); \
822 if ( UTF ) SvUTF8_on( tmp ); \
823 av_push( trie->words, tmp ); \
826 NOOP; /* It's a dupe. So ignore it. */ \
833 dump_trie_interim_list(trie,next_alloc)
834 dump_trie_interim_table(trie,next_alloc)
836 These routines dump out a trie in a somewhat readable format.
837 The _interim_ variants are used for debugging the interim
838 tables that are used to generate the final compressed
839 representation which is what dump_trie expects.
841 Part of the reason for their existance is to provide a form
842 of documentation as to how the different representations function.
848 Dumps the final compressed table form of the trie to Perl_debug_log.
849 Used for debugging make_trie().
853 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
856 SV *sv=sv_newmortal();
857 int colwidth= trie->widecharmap ? 6 : 4;
858 GET_RE_DEBUG_FLAGS_DECL;
861 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
862 (int)depth * 2 + 2,"",
863 "Match","Base","Ofs" );
865 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
866 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
868 PerlIO_printf( Perl_debug_log, "%*s",
870 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
871 PL_colors[0], PL_colors[1],
872 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
873 PERL_PV_ESCAPE_FIRSTCHAR
878 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
879 (int)depth * 2 + 2,"");
881 for( state = 0 ; state < trie->uniquecharcount ; state++ )
882 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
883 PerlIO_printf( Perl_debug_log, "\n");
885 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
886 const U32 base = trie->states[ state ].trans.base;
888 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
890 if ( trie->states[ state ].wordnum ) {
891 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
893 PerlIO_printf( Perl_debug_log, "%6s", "" );
896 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
901 while( ( base + ofs < trie->uniquecharcount ) ||
902 ( base + ofs - trie->uniquecharcount < trie->lasttrans
903 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
906 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
908 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
909 if ( ( base + ofs >= trie->uniquecharcount ) &&
910 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
911 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
913 PerlIO_printf( Perl_debug_log, "%*"UVXf,
915 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
917 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
921 PerlIO_printf( Perl_debug_log, "]");
924 PerlIO_printf( Perl_debug_log, "\n" );
928 dump_trie_interim_list(trie,next_alloc)
929 Dumps a fully constructed but uncompressed trie in list form.
930 List tries normally only are used for construction when the number of
931 possible chars (trie->uniquecharcount) is very high.
932 Used for debugging make_trie().
935 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
938 SV *sv=sv_newmortal();
939 int colwidth= trie->widecharmap ? 6 : 4;
940 GET_RE_DEBUG_FLAGS_DECL;
941 /* print out the table precompression. */
942 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
943 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
944 "------:-----+-----------------\n" );
946 for( state=1 ; state < next_alloc ; state ++ ) {
949 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
950 (int)depth * 2 + 2,"", (UV)state );
951 if ( ! trie->states[ state ].wordnum ) {
952 PerlIO_printf( Perl_debug_log, "%5s| ","");
954 PerlIO_printf( Perl_debug_log, "W%4x| ",
955 trie->states[ state ].wordnum
958 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
959 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
961 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
963 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
964 PL_colors[0], PL_colors[1],
965 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
966 PERL_PV_ESCAPE_FIRSTCHAR
968 TRIE_LIST_ITEM(state,charid).forid,
969 (UV)TRIE_LIST_ITEM(state,charid).newstate
973 PerlIO_printf( Perl_debug_log, "\n");
978 dump_trie_interim_table(trie,next_alloc)
979 Dumps a fully constructed but uncompressed trie in table form.
980 This is the normal DFA style state transition table, with a few
981 twists to facilitate compression later.
982 Used for debugging make_trie().
985 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
989 SV *sv=sv_newmortal();
990 int colwidth= trie->widecharmap ? 6 : 4;
991 GET_RE_DEBUG_FLAGS_DECL;
994 print out the table precompression so that we can do a visual check
995 that they are identical.
998 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1001 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
1003 PerlIO_printf( Perl_debug_log, "%*s",
1005 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1006 PL_colors[0], PL_colors[1],
1007 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1008 PERL_PV_ESCAPE_FIRSTCHAR
1014 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1016 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1017 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1020 PerlIO_printf( Perl_debug_log, "\n" );
1022 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1024 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1025 (int)depth * 2 + 2,"",
1026 (UV)TRIE_NODENUM( state ) );
1028 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1029 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1031 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1033 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1035 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1036 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1038 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1039 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1046 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1047 ( ( base + charid >= ucharcount \
1048 && base + charid < ubound \
1049 && state == trie->trans[ base - ucharcount + charid ].check \
1050 && trie->trans[ base - ucharcount + charid ].next ) \
1051 ? trie->trans[ base - ucharcount + charid ].next \
1052 : ( state==1 ? special : 0 ) \
1056 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1058 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1060 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1061 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1064 We find the fail state for each state in the trie, this state is the longest proper
1065 suffix of the current states 'word' that is also a proper prefix of another word in our
1066 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1067 the DFA not to have to restart after its tried and failed a word at a given point, it
1068 simply continues as though it had been matching the other word in the first place.
1070 'abcdgu'=~/abcdefg|cdgu/
1071 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1072 fail, which would bring use to the state representing 'd' in the second word where we would
1073 try 'g' and succeed, prodceding to match 'cdgu'.
1075 /* add a fail transition */
1076 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1078 const U32 ucharcount = trie->uniquecharcount;
1079 const U32 numstates = trie->laststate;
1080 const U32 ubound = trie->lasttrans + ucharcount;
1084 U32 base = trie->states[ 1 ].trans.base;
1087 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1088 GET_RE_DEBUG_FLAGS_DECL;
1090 PERL_UNUSED_ARG(depth);
1094 ARG_SET( stclass, data_slot );
1095 Newxz( aho, 1, reg_ac_data );
1096 RExC_rx->data->data[ data_slot ] = (void*)aho;
1098 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1099 (trie->laststate+1)*sizeof(reg_trie_state));
1100 Newxz( q, numstates, U32);
1101 Newxz( aho->fail, numstates, U32 );
1104 fail[ 0 ] = fail[ 1 ] = 1;
1106 for ( charid = 0; charid < ucharcount ; charid++ ) {
1107 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1109 q[ q_write ] = newstate;
1110 /* set to point at the root */
1111 fail[ q[ q_write++ ] ]=1;
1114 while ( q_read < q_write) {
1115 const U32 cur = q[ q_read++ % numstates ];
1116 base = trie->states[ cur ].trans.base;
1118 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1119 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1121 U32 fail_state = cur;
1124 fail_state = fail[ fail_state ];
1125 fail_base = aho->states[ fail_state ].trans.base;
1126 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1128 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1129 fail[ ch_state ] = fail_state;
1130 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1132 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1134 q[ q_write++ % numstates] = ch_state;
1139 DEBUG_TRIE_COMPILE_MORE_r({
1140 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1141 for( q_read=2; q_read<numstates; q_read++ ) {
1142 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1144 PerlIO_printf(Perl_debug_log, "\n");
1147 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1153 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1156 /* first pass, loop through and scan words */
1157 reg_trie_data *trie;
1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1164 /* we just use folder as a flag in utf8 */
1165 const U8 * const folder = ( flags == EXACTF
1167 : ( flags == EXACTFL
1173 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1174 SV *re_trie_maxbuff;
1176 /* these are only used during construction but are useful during
1177 * debugging so we store them in the struct when debugging.
1178 * Wordcount is actually superfluous in debugging as we have
1179 * (AV*)trie->words to use for it, but that's not available when
1180 * not debugging... We could make the macro use the AV during
1181 * debugging though...
1183 U16 trie_wordcount=0;
1184 STRLEN trie_charcount=0;
1185 /*U32 trie_laststate=0;*/
1186 AV *trie_revcharmap;
1188 GET_RE_DEBUG_FLAGS_DECL;
1190 PERL_UNUSED_ARG(depth);
1193 Newxz( trie, 1, reg_trie_data );
1195 trie->startstate = 1;
1196 RExC_rx->data->data[ data_slot ] = (void*)trie;
1197 Newxz( trie->charmap, 256, U16 );
1198 if (!(UTF && folder))
1199 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1201 trie->words = newAV();
1203 TRIE_REVCHARMAP(trie) = newAV();
1205 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1206 if (!SvIOK(re_trie_maxbuff)) {
1207 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1210 PerlIO_printf( Perl_debug_log,
1211 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1212 (int)depth * 2 + 2, "",
1213 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1214 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1216 /* -- First loop and Setup --
1218 We first traverse the branches and scan each word to determine if it
1219 contains widechars, and how many unique chars there are, this is
1220 important as we have to build a table with at least as many columns as we
1223 We use an array of integers to represent the character codes 0..255
1224 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1225 native representation of the character value as the key and IV's for the
1228 *TODO* If we keep track of how many times each character is used we can
1229 remap the columns so that the table compression later on is more
1230 efficient in terms of memory by ensuring most common value is in the
1231 middle and the least common are on the outside. IMO this would be better
1232 than a most to least common mapping as theres a decent chance the most
1233 common letter will share a node with the least common, meaning the node
1234 will not be compressable. With a middle is most common approach the worst
1235 case is when we have the least common nodes twice.
1239 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1240 regnode * const noper = NEXTOPER( cur );
1241 const U8 *uc = (U8*)STRING( noper );
1242 const U8 * const e = uc + STR_LEN( noper );
1244 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1245 const U8 *scan = (U8*)NULL;
1246 U32 wordlen = 0; /* required init */
1249 TRIE_WORDCOUNT(trie)++;
1250 if (OP(noper) == NOTHING) {
1255 TRIE_BITMAP_SET(trie,*uc);
1256 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1258 for ( ; uc < e ; uc += len ) {
1259 TRIE_CHARCOUNT(trie)++;
1263 if ( !trie->charmap[ uvc ] ) {
1264 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1266 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1271 if ( !trie->widecharmap )
1272 trie->widecharmap = newHV();
1274 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1277 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1279 if ( !SvTRUE( *svpp ) ) {
1280 sv_setiv( *svpp, ++trie->uniquecharcount );
1285 if( cur == first ) {
1288 } else if (chars < trie->minlen) {
1290 } else if (chars > trie->maxlen) {
1294 } /* end first pass */
1295 DEBUG_TRIE_COMPILE_r(
1296 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1297 (int)depth * 2 + 2,"",
1298 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1299 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1300 (int)trie->minlen, (int)trie->maxlen )
1302 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1305 We now know what we are dealing with in terms of unique chars and
1306 string sizes so we can calculate how much memory a naive
1307 representation using a flat table will take. If it's over a reasonable
1308 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1309 conservative but potentially much slower representation using an array
1312 At the end we convert both representations into the same compressed
1313 form that will be used in regexec.c for matching with. The latter
1314 is a form that cannot be used to construct with but has memory
1315 properties similar to the list form and access properties similar
1316 to the table form making it both suitable for fast searches and
1317 small enough that its feasable to store for the duration of a program.
1319 See the comment in the code where the compressed table is produced
1320 inplace from the flat tabe representation for an explanation of how
1321 the compression works.
1326 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1328 Second Pass -- Array Of Lists Representation
1330 Each state will be represented by a list of charid:state records
1331 (reg_trie_trans_le) the first such element holds the CUR and LEN
1332 points of the allocated array. (See defines above).
1334 We build the initial structure using the lists, and then convert
1335 it into the compressed table form which allows faster lookups
1336 (but cant be modified once converted).
1339 STRLEN transcount = 1;
1341 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1345 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1347 regnode * const noper = NEXTOPER( cur );
1348 U8 *uc = (U8*)STRING( noper );
1349 const U8 * const e = uc + STR_LEN( noper );
1350 U32 state = 1; /* required init */
1351 U16 charid = 0; /* sanity init */
1352 U8 *scan = (U8*)NULL; /* sanity init */
1353 STRLEN foldlen = 0; /* required init */
1354 U32 wordlen = 0; /* required init */
1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1357 if (OP(noper) != NOTHING) {
1358 for ( ; uc < e ; uc += len ) {
1363 charid = trie->charmap[ uvc ];
1365 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1369 charid=(U16)SvIV( *svpp );
1378 if ( !trie->states[ state ].trans.list ) {
1379 TRIE_LIST_NEW( state );
1381 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1382 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1383 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1388 newstate = next_alloc++;
1389 TRIE_LIST_PUSH( state, charid, newstate );
1394 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1396 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1399 TRIE_HANDLE_WORD(state);
1401 } /* end second pass */
1403 TRIE_LASTSTATE(trie) = next_alloc;
1404 Renew( trie->states, next_alloc, reg_trie_state );
1406 /* and now dump it out before we compress it */
1407 DEBUG_TRIE_COMPILE_MORE_r(
1408 dump_trie_interim_list(trie,next_alloc,depth+1)
1411 Newxz( trie->trans, transcount ,reg_trie_trans );
1418 for( state=1 ; state < next_alloc ; state ++ ) {
1422 DEBUG_TRIE_COMPILE_MORE_r(
1423 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1427 if (trie->states[state].trans.list) {
1428 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1432 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1433 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1434 if ( forid < minid ) {
1436 } else if ( forid > maxid ) {
1440 if ( transcount < tp + maxid - minid + 1) {
1442 Renew( trie->trans, transcount, reg_trie_trans );
1443 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1445 base = trie->uniquecharcount + tp - minid;
1446 if ( maxid == minid ) {
1448 for ( ; zp < tp ; zp++ ) {
1449 if ( ! trie->trans[ zp ].next ) {
1450 base = trie->uniquecharcount + zp - minid;
1451 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1452 trie->trans[ zp ].check = state;
1458 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1459 trie->trans[ tp ].check = state;
1464 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1465 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1466 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1467 trie->trans[ tid ].check = state;
1469 tp += ( maxid - minid + 1 );
1471 Safefree(trie->states[ state ].trans.list);
1474 DEBUG_TRIE_COMPILE_MORE_r(
1475 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1478 trie->states[ state ].trans.base=base;
1480 trie->lasttrans = tp + 1;
1484 Second Pass -- Flat Table Representation.
1486 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1487 We know that we will need Charcount+1 trans at most to store the data
1488 (one row per char at worst case) So we preallocate both structures
1489 assuming worst case.
1491 We then construct the trie using only the .next slots of the entry
1494 We use the .check field of the first entry of the node temporarily to
1495 make compression both faster and easier by keeping track of how many non
1496 zero fields are in the node.
1498 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1501 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1502 number representing the first entry of the node, and state as a
1503 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1504 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1505 are 2 entrys per node. eg:
1513 The table is internally in the right hand, idx form. However as we also
1514 have to deal with the states array which is indexed by nodenum we have to
1515 use TRIE_NODENUM() to convert.
1520 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1522 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1523 next_alloc = trie->uniquecharcount + 1;
1526 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1528 regnode * const noper = NEXTOPER( cur );
1529 const U8 *uc = (U8*)STRING( noper );
1530 const U8 * const e = uc + STR_LEN( noper );
1532 U32 state = 1; /* required init */
1534 U16 charid = 0; /* sanity init */
1535 U32 accept_state = 0; /* sanity init */
1536 U8 *scan = (U8*)NULL; /* sanity init */
1538 STRLEN foldlen = 0; /* required init */
1539 U32 wordlen = 0; /* required init */
1540 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1542 if ( OP(noper) != NOTHING ) {
1543 for ( ; uc < e ; uc += len ) {
1548 charid = trie->charmap[ uvc ];
1550 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1551 charid = svpp ? (U16)SvIV(*svpp) : 0;
1555 if ( !trie->trans[ state + charid ].next ) {
1556 trie->trans[ state + charid ].next = next_alloc;
1557 trie->trans[ state ].check++;
1558 next_alloc += trie->uniquecharcount;
1560 state = trie->trans[ state + charid ].next;
1562 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1564 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1567 accept_state = TRIE_NODENUM( state );
1568 TRIE_HANDLE_WORD(accept_state);
1570 } /* end second pass */
1572 /* and now dump it out before we compress it */
1573 DEBUG_TRIE_COMPILE_MORE_r(
1574 dump_trie_interim_table(trie,next_alloc,depth+1)
1579 * Inplace compress the table.*
1581 For sparse data sets the table constructed by the trie algorithm will
1582 be mostly 0/FAIL transitions or to put it another way mostly empty.
1583 (Note that leaf nodes will not contain any transitions.)
1585 This algorithm compresses the tables by eliminating most such
1586 transitions, at the cost of a modest bit of extra work during lookup:
1588 - Each states[] entry contains a .base field which indicates the
1589 index in the state[] array wheres its transition data is stored.
1591 - If .base is 0 there are no valid transitions from that node.
1593 - If .base is nonzero then charid is added to it to find an entry in
1596 -If trans[states[state].base+charid].check!=state then the
1597 transition is taken to be a 0/Fail transition. Thus if there are fail
1598 transitions at the front of the node then the .base offset will point
1599 somewhere inside the previous nodes data (or maybe even into a node
1600 even earlier), but the .check field determines if the transition is
1603 The following process inplace converts the table to the compressed
1604 table: We first do not compress the root node 1,and mark its all its
1605 .check pointers as 1 and set its .base pointer as 1 as well. This
1606 allows to do a DFA construction from the compressed table later, and
1607 ensures that any .base pointers we calculate later are greater than
1610 - We set 'pos' to indicate the first entry of the second node.
1612 - We then iterate over the columns of the node, finding the first and
1613 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1614 and set the .check pointers accordingly, and advance pos
1615 appropriately and repreat for the next node. Note that when we copy
1616 the next pointers we have to convert them from the original
1617 NODEIDX form to NODENUM form as the former is not valid post
1620 - If a node has no transitions used we mark its base as 0 and do not
1621 advance the pos pointer.
1623 - If a node only has one transition we use a second pointer into the
1624 structure to fill in allocated fail transitions from other states.
1625 This pointer is independent of the main pointer and scans forward
1626 looking for null transitions that are allocated to a state. When it
1627 finds one it writes the single transition into the "hole". If the
1628 pointer doesnt find one the single transition is appeneded as normal.
1630 - Once compressed we can Renew/realloc the structures to release the
1633 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1634 specifically Fig 3.47 and the associated pseudocode.
1638 const U32 laststate = TRIE_NODENUM( next_alloc );
1641 TRIE_LASTSTATE(trie) = laststate;
1643 for ( state = 1 ; state < laststate ; state++ ) {
1645 const U32 stateidx = TRIE_NODEIDX( state );
1646 const U32 o_used = trie->trans[ stateidx ].check;
1647 U32 used = trie->trans[ stateidx ].check;
1648 trie->trans[ stateidx ].check = 0;
1650 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1651 if ( flag || trie->trans[ stateidx + charid ].next ) {
1652 if ( trie->trans[ stateidx + charid ].next ) {
1654 for ( ; zp < pos ; zp++ ) {
1655 if ( ! trie->trans[ zp ].next ) {
1659 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1660 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1661 trie->trans[ zp ].check = state;
1662 if ( ++zp > pos ) pos = zp;
1669 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1671 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1672 trie->trans[ pos ].check = state;
1677 trie->lasttrans = pos + 1;
1678 Renew( trie->states, laststate + 1, reg_trie_state);
1679 DEBUG_TRIE_COMPILE_MORE_r(
1680 PerlIO_printf( Perl_debug_log,
1681 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1682 (int)depth * 2 + 2,"",
1683 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1686 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1689 } /* end table compress */
1691 /* resize the trans array to remove unused space */
1692 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1694 /* and now dump out the compressed format */
1695 DEBUG_TRIE_COMPILE_r(
1696 dump_trie(trie,depth+1)
1699 { /* Modify the program and insert the new TRIE node*/
1701 U8 nodetype =(U8)(flags & 0xFF);
1708 This means we convert either the first branch or the first Exact,
1709 depending on whether the thing following (in 'last') is a branch
1710 or not and whther first is the startbranch (ie is it a sub part of
1711 the alternation or is it the whole thing.)
1712 Assuming its a sub part we conver the EXACT otherwise we convert
1713 the whole branch sequence, including the first.
1715 /* Find the node we are going to overwrite */
1716 if ( first == startbranch && OP( last ) != BRANCH ) {
1717 /* whole branch chain */
1720 const regnode *nop = NEXTOPER( convert );
1721 mjd_offset= Node_Offset((nop));
1722 mjd_nodelen= Node_Length((nop));
1725 /* branch sub-chain */
1726 convert = NEXTOPER( first );
1727 NEXT_OFF( first ) = (U16)(last - first);
1729 mjd_offset= Node_Offset((convert));
1730 mjd_nodelen= Node_Length((convert));
1734 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1735 (int)depth * 2 + 2, "",
1736 mjd_offset,mjd_nodelen)
1739 /* But first we check to see if there is a common prefix we can
1740 split out as an EXACT and put in front of the TRIE node. */
1741 trie->startstate= 1;
1742 if ( trie->bitmap && !trie->widecharmap ) {
1745 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1746 (int)depth * 2 + 2, "",
1747 TRIE_LASTSTATE(trie))
1749 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1753 const U32 base = trie->states[ state ].trans.base;
1755 if ( trie->states[state].wordnum )
1758 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1759 if ( ( base + ofs >= trie->uniquecharcount ) &&
1760 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1761 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1763 if ( ++count > 1 ) {
1764 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1765 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1766 if ( state == 1 ) break;
1768 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1770 PerlIO_printf(Perl_debug_log,
1771 "%*sNew Start State=%"UVuf" Class: [",
1772 (int)depth * 2 + 2, "",
1775 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1776 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1778 TRIE_BITMAP_SET(trie,*ch);
1780 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1782 PerlIO_printf(Perl_debug_log, (char*)ch)
1786 TRIE_BITMAP_SET(trie,*ch);
1788 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1789 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1795 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1796 const char *ch = SvPV_nolen_const( *tmp );
1798 PerlIO_printf( Perl_debug_log,
1799 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1800 (int)depth * 2 + 2, "",
1804 OP( convert ) = nodetype;
1805 str=STRING(convert);
1814 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1820 regnode *n = convert+NODE_SZ_STR(convert);
1821 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1822 trie->startstate = state;
1823 trie->minlen -= (state - 1);
1824 trie->maxlen -= (state - 1);
1826 regnode *fix = convert;
1828 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1829 while( ++fix < n ) {
1830 Set_Node_Offset_Length(fix, 0, 0);
1836 NEXT_OFF(convert) = (U16)(tail - convert);
1840 if ( trie->maxlen ) {
1841 OP( convert ) = TRIE;
1842 NEXT_OFF( convert ) = (U16)(tail - convert);
1843 ARG_SET( convert, data_slot );
1845 /* store the type in the flags */
1846 convert->flags = nodetype;
1847 /* XXX We really should free up the resource in trie now, as we wont use them */
1849 /* needed for dumping*/
1851 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1852 regnode *opt = convert;
1853 while (++opt<optimize) {
1854 Set_Node_Offset_Length(opt,0,0);
1856 /* We now need to mark all of the space originally used by the
1857 branches as optimized away. This keeps the dumpuntil from
1858 throwing a wobbly as it doesnt use regnext() to traverse the
1860 We also "fix" the offsets
1862 while( optimize < last ) {
1863 mjd_nodelen += Node_Length((optimize));
1864 OP( optimize ) = OPTIMIZED;
1865 Set_Node_Offset_Length(optimize,0,0);
1868 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1870 } /* end node insert */
1872 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1878 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1879 * These need to be revisited when a newer toolchain becomes available.
1881 #if defined(__sparc64__) && defined(__GNUC__)
1882 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1883 # undef SPARC64_GCC_WORKAROUND
1884 # define SPARC64_GCC_WORKAROUND 1
1888 #define DEBUG_PEEP(str,scan,depth) \
1889 DEBUG_OPTIMISE_r({ \
1890 SV * const mysv=sv_newmortal(); \
1891 regnode *Next = regnext(scan); \
1892 regprop(RExC_rx, mysv, scan); \
1893 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1894 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1895 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1898 #define JOIN_EXACT(scan,min,flags) \
1899 if (PL_regkind[OP(scan)] == EXACT) \
1900 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1903 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1904 /* Merge several consecutive EXACTish nodes into one. */
1905 regnode *n = regnext(scan);
1907 regnode *next = scan + NODE_SZ_STR(scan);
1911 regnode *stop = scan;
1912 GET_RE_DEBUG_FLAGS_DECL;
1914 PERL_UNUSED_ARG(depth);
1916 #ifndef EXPERIMENTAL_INPLACESCAN
1917 PERL_UNUSED_ARG(flags);
1918 PERL_UNUSED_ARG(val);
1920 DEBUG_PEEP("join",scan,depth);
1922 /* Skip NOTHING, merge EXACT*. */
1924 ( PL_regkind[OP(n)] == NOTHING ||
1925 (stringok && (OP(n) == OP(scan))))
1927 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1929 if (OP(n) == TAIL || n > next)
1931 if (PL_regkind[OP(n)] == NOTHING) {
1932 DEBUG_PEEP("skip:",n,depth);
1933 NEXT_OFF(scan) += NEXT_OFF(n);
1934 next = n + NODE_STEP_REGNODE;
1941 else if (stringok) {
1942 const int oldl = STR_LEN(scan);
1943 regnode * const nnext = regnext(n);
1945 DEBUG_PEEP("merg",n,depth);
1948 if (oldl + STR_LEN(n) > U8_MAX)
1950 NEXT_OFF(scan) += NEXT_OFF(n);
1951 STR_LEN(scan) += STR_LEN(n);
1952 next = n + NODE_SZ_STR(n);
1953 /* Now we can overwrite *n : */
1954 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1962 #ifdef EXPERIMENTAL_INPLACESCAN
1963 if (flags && !NEXT_OFF(n)) {
1964 DEBUG_PEEP("atch", val, depth);
1965 if (reg_off_by_arg[OP(n)]) {
1966 ARG_SET(n, val - n);
1969 NEXT_OFF(n) = val - n;
1976 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1978 Two problematic code points in Unicode casefolding of EXACT nodes:
1980 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1981 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1987 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1988 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1990 This means that in case-insensitive matching (or "loose matching",
1991 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1992 length of the above casefolded versions) can match a target string
1993 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1994 This would rather mess up the minimum length computation.
1996 What we'll do is to look for the tail four bytes, and then peek
1997 at the preceding two bytes to see whether we need to decrease
1998 the minimum length by four (six minus two).
2000 Thanks to the design of UTF-8, there cannot be false matches:
2001 A sequence of valid UTF-8 bytes cannot be a subsequence of
2002 another valid sequence of UTF-8 bytes.
2005 char * const s0 = STRING(scan), *s, *t;
2006 char * const s1 = s0 + STR_LEN(scan) - 1;
2007 char * const s2 = s1 - 4;
2008 const char t0[] = "\xcc\x88\xcc\x81";
2009 const char * const t1 = t0 + 3;
2012 s < s2 && (t = ninstr(s, s1, t0, t1));
2014 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2015 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2022 n = scan + NODE_SZ_STR(scan);
2024 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2031 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2035 /* REx optimizer. Converts nodes into quickier variants "in place".
2036 Finds fixed substrings. */
2038 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2039 to the position after last scanned or to NULL. */
2044 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2045 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2046 /* scanp: Start here (read-write). */
2047 /* deltap: Write maxlen-minlen here. */
2048 /* last: Stop before this one. */
2051 I32 min = 0, pars = 0, code;
2052 regnode *scan = *scanp, *next;
2054 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2055 int is_inf_internal = 0; /* The studied chunk is infinite */
2056 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2057 scan_data_t data_fake;
2058 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2059 SV *re_trie_maxbuff = NULL;
2061 GET_RE_DEBUG_FLAGS_DECL;
2063 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2066 while (scan && OP(scan) != END && scan < last) {
2067 /* Peephole optimizer: */
2068 DEBUG_PEEP("Peep",scan,depth);
2070 JOIN_EXACT(scan,&min,0);
2072 /* Follow the next-chain of the current node and optimize
2073 away all the NOTHINGs from it. */
2074 if (OP(scan) != CURLYX) {
2075 const int max = (reg_off_by_arg[OP(scan)]
2077 /* I32 may be smaller than U16 on CRAYs! */
2078 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2079 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2083 /* Skip NOTHING and LONGJMP. */
2084 while ((n = regnext(n))
2085 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2086 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2087 && off + noff < max)
2089 if (reg_off_by_arg[OP(scan)])
2092 NEXT_OFF(scan) = off;
2097 /* The principal pseudo-switch. Cannot be a switch, since we
2098 look into several different things. */
2099 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2100 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2101 next = regnext(scan);
2103 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2105 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2106 I32 max1 = 0, min1 = I32_MAX, num = 0;
2107 struct regnode_charclass_class accum;
2108 regnode * const startbranch=scan;
2110 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2111 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2112 if (flags & SCF_DO_STCLASS)
2113 cl_init_zero(pRExC_state, &accum);
2115 while (OP(scan) == code) {
2116 I32 deltanext, minnext, f = 0, fake;
2117 struct regnode_charclass_class this_class;
2120 data_fake.flags = 0;
2122 data_fake.whilem_c = data->whilem_c;
2123 data_fake.last_closep = data->last_closep;
2126 data_fake.last_closep = &fake;
2127 next = regnext(scan);
2128 scan = NEXTOPER(scan);
2130 scan = NEXTOPER(scan);
2131 if (flags & SCF_DO_STCLASS) {
2132 cl_init(pRExC_state, &this_class);
2133 data_fake.start_class = &this_class;
2134 f = SCF_DO_STCLASS_AND;
2136 if (flags & SCF_WHILEM_VISITED_POS)
2137 f |= SCF_WHILEM_VISITED_POS;
2139 /* we suppose the run is continuous, last=next...*/
2140 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2141 next, &data_fake, f,depth+1);
2144 if (max1 < minnext + deltanext)
2145 max1 = minnext + deltanext;
2146 if (deltanext == I32_MAX)
2147 is_inf = is_inf_internal = 1;
2149 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2152 if (data_fake.flags & SF_HAS_EVAL)
2153 data->flags |= SF_HAS_EVAL;
2154 data->whilem_c = data_fake.whilem_c;
2156 if (flags & SCF_DO_STCLASS)
2157 cl_or(pRExC_state, &accum, &this_class);
2158 if (code == SUSPEND)
2161 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2163 if (flags & SCF_DO_SUBSTR) {
2164 data->pos_min += min1;
2165 data->pos_delta += max1 - min1;
2166 if (max1 != min1 || is_inf)
2167 data->longest = &(data->longest_float);
2170 delta += max1 - min1;
2171 if (flags & SCF_DO_STCLASS_OR) {
2172 cl_or(pRExC_state, data->start_class, &accum);
2174 cl_and(data->start_class, &and_with);
2175 flags &= ~SCF_DO_STCLASS;
2178 else if (flags & SCF_DO_STCLASS_AND) {
2180 cl_and(data->start_class, &accum);
2181 flags &= ~SCF_DO_STCLASS;
2184 /* Switch to OR mode: cache the old value of
2185 * data->start_class */
2186 StructCopy(data->start_class, &and_with,
2187 struct regnode_charclass_class);
2188 flags &= ~SCF_DO_STCLASS_AND;
2189 StructCopy(&accum, data->start_class,
2190 struct regnode_charclass_class);
2191 flags |= SCF_DO_STCLASS_OR;
2192 data->start_class->flags |= ANYOF_EOS;
2198 Assuming this was/is a branch we are dealing with: 'scan' now
2199 points at the item that follows the branch sequence, whatever
2200 it is. We now start at the beginning of the sequence and look
2206 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2208 If we can find such a subseqence we need to turn the first
2209 element into a trie and then add the subsequent branch exact
2210 strings to the trie.
2214 1. patterns where the whole set of branch can be converted to a trie,
2216 2. patterns where only a subset of the alternations can be
2217 converted to a trie.
2219 In case 1 we can replace the whole set with a single regop
2220 for the trie. In case 2 we need to keep the start and end
2223 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2224 becomes BRANCH TRIE; BRANCH X;
2226 Hypthetically when we know the regex isnt anchored we can
2227 turn a case 1 into a DFA and let it rip... Every time it finds a match
2228 it would just call its tail, no WHILEM/CURLY needed.
2231 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2233 if (!re_trie_maxbuff) {
2234 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2235 if (!SvIOK(re_trie_maxbuff))
2236 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2238 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2240 regnode *first = (regnode *)NULL;
2241 regnode *last = (regnode *)NULL;
2242 regnode *tail = scan;
2247 SV * const mysv = sv_newmortal(); /* for dumping */
2249 /* var tail is used because there may be a TAIL
2250 regop in the way. Ie, the exacts will point to the
2251 thing following the TAIL, but the last branch will
2252 point at the TAIL. So we advance tail. If we
2253 have nested (?:) we may have to move through several
2257 while ( OP( tail ) == TAIL ) {
2258 /* this is the TAIL generated by (?:) */
2259 tail = regnext( tail );
2264 regprop(RExC_rx, mysv, tail );
2265 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2266 (int)depth * 2 + 2, "",
2267 "Looking for TRIE'able sequences. Tail node is: ",
2268 SvPV_nolen_const( mysv )
2274 step through the branches, cur represents each
2275 branch, noper is the first thing to be matched
2276 as part of that branch and noper_next is the
2277 regnext() of that node. if noper is an EXACT
2278 and noper_next is the same as scan (our current
2279 position in the regex) then the EXACT branch is
2280 a possible optimization target. Once we have
2281 two or more consequetive such branches we can
2282 create a trie of the EXACT's contents and stich
2283 it in place. If the sequence represents all of
2284 the branches we eliminate the whole thing and
2285 replace it with a single TRIE. If it is a
2286 subsequence then we need to stitch it in. This
2287 means the first branch has to remain, and needs
2288 to be repointed at the item on the branch chain
2289 following the last branch optimized. This could
2290 be either a BRANCH, in which case the
2291 subsequence is internal, or it could be the
2292 item following the branch sequence in which
2293 case the subsequence is at the end.
2297 /* dont use tail as the end marker for this traverse */
2298 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2299 regnode * const noper = NEXTOPER( cur );
2300 regnode * const noper_next = regnext( noper );
2303 regprop(RExC_rx, mysv, cur);
2304 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2305 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2307 regprop(RExC_rx, mysv, noper);
2308 PerlIO_printf( Perl_debug_log, " -> %s",
2309 SvPV_nolen_const(mysv));
2312 regprop(RExC_rx, mysv, noper_next );
2313 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2314 SvPV_nolen_const(mysv));
2316 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2317 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2319 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2320 : PL_regkind[ OP( noper ) ] == EXACT )
2321 || OP(noper) == NOTHING )
2322 && noper_next == tail && count<U16_MAX)
2325 if ( !first || optype == NOTHING ) {
2326 if (!first) first = cur;
2327 optype = OP( noper );
2333 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2335 if ( PL_regkind[ OP( noper ) ] == EXACT
2336 && noper_next == tail )
2340 optype = OP( noper );
2350 regprop(RExC_rx, mysv, cur);
2351 PerlIO_printf( Perl_debug_log,
2352 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2353 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2357 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2358 #ifdef TRIE_STUDY_OPT
2359 if ( made && startbranch == first ) {
2360 if ( OP(first)!=TRIE )
2361 flags |= SCF_EXACT_TRIE;
2363 regnode *chk=*scanp;
2364 while ( OP( chk ) == OPEN )
2365 chk = regnext( chk );
2367 flags |= SCF_EXACT_TRIE;
2376 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2377 scan = NEXTOPER(NEXTOPER(scan));
2378 } else /* single branch is optimized. */
2379 scan = NEXTOPER(scan);
2382 else if (OP(scan) == EXACT) {
2383 I32 l = STR_LEN(scan);
2386 const U8 * const s = (U8*)STRING(scan);
2387 l = utf8_length(s, s + l);
2388 uc = utf8_to_uvchr(s, NULL);
2390 uc = *((U8*)STRING(scan));
2393 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2394 /* The code below prefers earlier match for fixed
2395 offset, later match for variable offset. */
2396 if (data->last_end == -1) { /* Update the start info. */
2397 data->last_start_min = data->pos_min;
2398 data->last_start_max = is_inf
2399 ? I32_MAX : data->pos_min + data->pos_delta;
2401 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2403 SvUTF8_on(data->last_found);
2405 SV * const sv = data->last_found;
2406 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2407 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2408 if (mg && mg->mg_len >= 0)
2409 mg->mg_len += utf8_length((U8*)STRING(scan),
2410 (U8*)STRING(scan)+STR_LEN(scan));
2412 data->last_end = data->pos_min + l;
2413 data->pos_min += l; /* As in the first entry. */
2414 data->flags &= ~SF_BEFORE_EOL;
2416 if (flags & SCF_DO_STCLASS_AND) {
2417 /* Check whether it is compatible with what we know already! */
2421 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2422 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2423 && (!(data->start_class->flags & ANYOF_FOLD)
2424 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2427 ANYOF_CLASS_ZERO(data->start_class);
2428 ANYOF_BITMAP_ZERO(data->start_class);
2430 ANYOF_BITMAP_SET(data->start_class, uc);
2431 data->start_class->flags &= ~ANYOF_EOS;
2433 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2435 else if (flags & SCF_DO_STCLASS_OR) {
2436 /* false positive possible if the class is case-folded */
2438 ANYOF_BITMAP_SET(data->start_class, uc);
2440 data->start_class->flags |= ANYOF_UNICODE_ALL;
2441 data->start_class->flags &= ~ANYOF_EOS;
2442 cl_and(data->start_class, &and_with);
2444 flags &= ~SCF_DO_STCLASS;
2446 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2447 I32 l = STR_LEN(scan);
2448 UV uc = *((U8*)STRING(scan));
2450 /* Search for fixed substrings supports EXACT only. */
2451 if (flags & SCF_DO_SUBSTR) {
2453 scan_commit(pRExC_state, data);
2456 const U8 * const s = (U8 *)STRING(scan);
2457 l = utf8_length(s, s + l);
2458 uc = utf8_to_uvchr(s, NULL);
2461 if (flags & SCF_DO_SUBSTR)
2463 if (flags & SCF_DO_STCLASS_AND) {
2464 /* Check whether it is compatible with what we know already! */
2468 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2469 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2470 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2472 ANYOF_CLASS_ZERO(data->start_class);
2473 ANYOF_BITMAP_ZERO(data->start_class);
2475 ANYOF_BITMAP_SET(data->start_class, uc);
2476 data->start_class->flags &= ~ANYOF_EOS;
2477 data->start_class->flags |= ANYOF_FOLD;
2478 if (OP(scan) == EXACTFL)
2479 data->start_class->flags |= ANYOF_LOCALE;
2482 else if (flags & SCF_DO_STCLASS_OR) {
2483 if (data->start_class->flags & ANYOF_FOLD) {
2484 /* false positive possible if the class is case-folded.
2485 Assume that the locale settings are the same... */
2487 ANYOF_BITMAP_SET(data->start_class, uc);
2488 data->start_class->flags &= ~ANYOF_EOS;
2490 cl_and(data->start_class, &and_with);
2492 flags &= ~SCF_DO_STCLASS;
2494 #ifdef TRIE_STUDY_OPT
2495 else if (OP(scan) == TRIE) {
2496 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2497 min += trie->minlen;
2498 delta += (trie->maxlen - trie->minlen);
2499 flags &= ~SCF_DO_STCLASS; /* xxx */
2500 if (flags & SCF_DO_SUBSTR) {
2501 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2502 data->pos_min += trie->minlen;
2503 data->pos_delta += (trie->maxlen - trie->minlen);
2504 if (trie->maxlen != trie->minlen)
2505 data->longest = &(data->longest_float);
2509 else if (strchr((const char*)PL_varies,OP(scan))) {
2510 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2511 I32 f = flags, pos_before = 0;
2512 regnode * const oscan = scan;
2513 struct regnode_charclass_class this_class;
2514 struct regnode_charclass_class *oclass = NULL;
2515 I32 next_is_eval = 0;
2517 switch (PL_regkind[OP(scan)]) {
2518 case WHILEM: /* End of (?:...)* . */
2519 scan = NEXTOPER(scan);
2522 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2523 next = NEXTOPER(scan);
2524 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2526 maxcount = REG_INFTY;
2527 next = regnext(scan);
2528 scan = NEXTOPER(scan);
2532 if (flags & SCF_DO_SUBSTR)
2537 if (flags & SCF_DO_STCLASS) {
2539 maxcount = REG_INFTY;
2540 next = regnext(scan);
2541 scan = NEXTOPER(scan);
2544 is_inf = is_inf_internal = 1;
2545 scan = regnext(scan);
2546 if (flags & SCF_DO_SUBSTR) {
2547 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2548 data->longest = &(data->longest_float);
2550 goto optimize_curly_tail;
2552 mincount = ARG1(scan);
2553 maxcount = ARG2(scan);
2554 next = regnext(scan);
2555 if (OP(scan) == CURLYX) {
2556 I32 lp = (data ? *(data->last_closep) : 0);
2557 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2559 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2560 next_is_eval = (OP(scan) == EVAL);
2562 if (flags & SCF_DO_SUBSTR) {
2563 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2564 pos_before = data->pos_min;
2568 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2570 data->flags |= SF_IS_INF;
2572 if (flags & SCF_DO_STCLASS) {
2573 cl_init(pRExC_state, &this_class);
2574 oclass = data->start_class;
2575 data->start_class = &this_class;
2576 f |= SCF_DO_STCLASS_AND;
2577 f &= ~SCF_DO_STCLASS_OR;
2579 /* These are the cases when once a subexpression
2580 fails at a particular position, it cannot succeed
2581 even after backtracking at the enclosing scope.
2583 XXXX what if minimal match and we are at the
2584 initial run of {n,m}? */
2585 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2586 f &= ~SCF_WHILEM_VISITED_POS;
2588 /* This will finish on WHILEM, setting scan, or on NULL: */
2589 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2591 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2593 if (flags & SCF_DO_STCLASS)
2594 data->start_class = oclass;
2595 if (mincount == 0 || minnext == 0) {
2596 if (flags & SCF_DO_STCLASS_OR) {
2597 cl_or(pRExC_state, data->start_class, &this_class);
2599 else if (flags & SCF_DO_STCLASS_AND) {
2600 /* Switch to OR mode: cache the old value of
2601 * data->start_class */
2602 StructCopy(data->start_class, &and_with,
2603 struct regnode_charclass_class);
2604 flags &= ~SCF_DO_STCLASS_AND;
2605 StructCopy(&this_class, data->start_class,
2606 struct regnode_charclass_class);
2607 flags |= SCF_DO_STCLASS_OR;
2608 data->start_class->flags |= ANYOF_EOS;
2610 } else { /* Non-zero len */
2611 if (flags & SCF_DO_STCLASS_OR) {
2612 cl_or(pRExC_state, data->start_class, &this_class);
2613 cl_and(data->start_class, &and_with);
2615 else if (flags & SCF_DO_STCLASS_AND)
2616 cl_and(data->start_class, &this_class);
2617 flags &= ~SCF_DO_STCLASS;
2619 if (!scan) /* It was not CURLYX, but CURLY. */
2621 if ( /* ? quantifier ok, except for (?{ ... }) */
2622 (next_is_eval || !(mincount == 0 && maxcount == 1))
2623 && (minnext == 0) && (deltanext == 0)
2624 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2625 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2626 && ckWARN(WARN_REGEXP))
2629 "Quantifier unexpected on zero-length expression");
2632 min += minnext * mincount;
2633 is_inf_internal |= ((maxcount == REG_INFTY
2634 && (minnext + deltanext) > 0)
2635 || deltanext == I32_MAX);
2636 is_inf |= is_inf_internal;
2637 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2639 /* Try powerful optimization CURLYX => CURLYN. */
2640 if ( OP(oscan) == CURLYX && data
2641 && data->flags & SF_IN_PAR
2642 && !(data->flags & SF_HAS_EVAL)
2643 && !deltanext && minnext == 1 ) {
2644 /* Try to optimize to CURLYN. */
2645 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2646 regnode * const nxt1 = nxt;
2653 if (!strchr((const char*)PL_simple,OP(nxt))
2654 && !(PL_regkind[OP(nxt)] == EXACT
2655 && STR_LEN(nxt) == 1))
2661 if (OP(nxt) != CLOSE)
2663 /* Now we know that nxt2 is the only contents: */
2664 oscan->flags = (U8)ARG(nxt);
2666 OP(nxt1) = NOTHING; /* was OPEN. */
2668 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2669 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2670 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2671 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2672 OP(nxt + 1) = OPTIMIZED; /* was count. */
2673 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2678 /* Try optimization CURLYX => CURLYM. */
2679 if ( OP(oscan) == CURLYX && data
2680 && !(data->flags & SF_HAS_PAR)
2681 && !(data->flags & SF_HAS_EVAL)
2682 && !deltanext /* atom is fixed width */
2683 && minnext != 0 /* CURLYM can't handle zero width */
2685 /* XXXX How to optimize if data == 0? */
2686 /* Optimize to a simpler form. */
2687 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2691 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2692 && (OP(nxt2) != WHILEM))
2694 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2695 /* Need to optimize away parenths. */
2696 if (data->flags & SF_IN_PAR) {
2697 /* Set the parenth number. */
2698 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2700 if (OP(nxt) != CLOSE)
2701 FAIL("Panic opt close");
2702 oscan->flags = (U8)ARG(nxt);
2703 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2704 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2706 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2707 OP(nxt + 1) = OPTIMIZED; /* was count. */
2708 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2709 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2712 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2713 regnode *nnxt = regnext(nxt1);
2716 if (reg_off_by_arg[OP(nxt1)])
2717 ARG_SET(nxt1, nxt2 - nxt1);
2718 else if (nxt2 - nxt1 < U16_MAX)
2719 NEXT_OFF(nxt1) = nxt2 - nxt1;
2721 OP(nxt) = NOTHING; /* Cannot beautify */
2726 /* Optimize again: */
2727 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2733 else if ((OP(oscan) == CURLYX)
2734 && (flags & SCF_WHILEM_VISITED_POS)
2735 /* See the comment on a similar expression above.
2736 However, this time it not a subexpression
2737 we care about, but the expression itself. */
2738 && (maxcount == REG_INFTY)
2739 && data && ++data->whilem_c < 16) {
2740 /* This stays as CURLYX, we can put the count/of pair. */
2741 /* Find WHILEM (as in regexec.c) */
2742 regnode *nxt = oscan + NEXT_OFF(oscan);
2744 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2746 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2747 | (RExC_whilem_seen << 4)); /* On WHILEM */
2749 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2751 if (flags & SCF_DO_SUBSTR) {
2752 SV *last_str = NULL;
2753 int counted = mincount != 0;
2755 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2756 #if defined(SPARC64_GCC_WORKAROUND)
2759 const char *s = NULL;
2762 if (pos_before >= data->last_start_min)
2765 b = data->last_start_min;
2768 s = SvPV_const(data->last_found, l);
2769 old = b - data->last_start_min;
2772 I32 b = pos_before >= data->last_start_min
2773 ? pos_before : data->last_start_min;
2775 const char * const s = SvPV_const(data->last_found, l);
2776 I32 old = b - data->last_start_min;
2780 old = utf8_hop((U8*)s, old) - (U8*)s;
2783 /* Get the added string: */
2784 last_str = newSVpvn(s + old, l);
2786 SvUTF8_on(last_str);
2787 if (deltanext == 0 && pos_before == b) {
2788 /* What was added is a constant string */
2790 SvGROW(last_str, (mincount * l) + 1);
2791 repeatcpy(SvPVX(last_str) + l,
2792 SvPVX_const(last_str), l, mincount - 1);
2793 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2794 /* Add additional parts. */
2795 SvCUR_set(data->last_found,
2796 SvCUR(data->last_found) - l);
2797 sv_catsv(data->last_found, last_str);
2799 SV * sv = data->last_found;
2801 SvUTF8(sv) && SvMAGICAL(sv) ?
2802 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2803 if (mg && mg->mg_len >= 0)
2804 mg->mg_len += CHR_SVLEN(last_str);
2806 data->last_end += l * (mincount - 1);
2809 /* start offset must point into the last copy */
2810 data->last_start_min += minnext * (mincount - 1);
2811 data->last_start_max += is_inf ? I32_MAX
2812 : (maxcount - 1) * (minnext + data->pos_delta);
2815 /* It is counted once already... */
2816 data->pos_min += minnext * (mincount - counted);
2817 data->pos_delta += - counted * deltanext +
2818 (minnext + deltanext) * maxcount - minnext * mincount;
2819 if (mincount != maxcount) {
2820 /* Cannot extend fixed substrings found inside
2822 scan_commit(pRExC_state,data);
2823 if (mincount && last_str) {
2824 SV * const sv = data->last_found;
2825 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2826 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2830 sv_setsv(sv, last_str);
2831 data->last_end = data->pos_min;
2832 data->last_start_min =
2833 data->pos_min - CHR_SVLEN(last_str);
2834 data->last_start_max = is_inf
2836 : data->pos_min + data->pos_delta
2837 - CHR_SVLEN(last_str);
2839 data->longest = &(data->longest_float);
2841 SvREFCNT_dec(last_str);
2843 if (data && (fl & SF_HAS_EVAL))
2844 data->flags |= SF_HAS_EVAL;
2845 optimize_curly_tail:
2846 if (OP(oscan) != CURLYX) {
2847 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2849 NEXT_OFF(oscan) += NEXT_OFF(next);
2852 default: /* REF and CLUMP only? */
2853 if (flags & SCF_DO_SUBSTR) {
2854 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2855 data->longest = &(data->longest_float);
2857 is_inf = is_inf_internal = 1;
2858 if (flags & SCF_DO_STCLASS_OR)
2859 cl_anything(pRExC_state, data->start_class);
2860 flags &= ~SCF_DO_STCLASS;
2864 else if (strchr((const char*)PL_simple,OP(scan))) {
2867 if (flags & SCF_DO_SUBSTR) {
2868 scan_commit(pRExC_state,data);
2872 if (flags & SCF_DO_STCLASS) {
2873 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2875 /* Some of the logic below assumes that switching
2876 locale on will only add false positives. */
2877 switch (PL_regkind[OP(scan)]) {
2881 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2882 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2883 cl_anything(pRExC_state, data->start_class);
2886 if (OP(scan) == SANY)
2888 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2889 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2890 || (data->start_class->flags & ANYOF_CLASS));
2891 cl_anything(pRExC_state, data->start_class);
2893 if (flags & SCF_DO_STCLASS_AND || !value)
2894 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2897 if (flags & SCF_DO_STCLASS_AND)
2898 cl_and(data->start_class,
2899 (struct regnode_charclass_class*)scan);
2901 cl_or(pRExC_state, data->start_class,
2902 (struct regnode_charclass_class*)scan);
2905 if (flags & SCF_DO_STCLASS_AND) {
2906 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2907 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2908 for (value = 0; value < 256; value++)
2909 if (!isALNUM(value))
2910 ANYOF_BITMAP_CLEAR(data->start_class, value);
2914 if (data->start_class->flags & ANYOF_LOCALE)
2915 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2917 for (value = 0; value < 256; value++)
2919 ANYOF_BITMAP_SET(data->start_class, value);
2924 if (flags & SCF_DO_STCLASS_AND) {
2925 if (data->start_class->flags & ANYOF_LOCALE)
2926 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2929 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2930 data->start_class->flags |= ANYOF_LOCALE;
2934 if (flags & SCF_DO_STCLASS_AND) {
2935 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2936 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2937 for (value = 0; value < 256; value++)
2939 ANYOF_BITMAP_CLEAR(data->start_class, value);
2943 if (data->start_class->flags & ANYOF_LOCALE)
2944 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2946 for (value = 0; value < 256; value++)
2947 if (!isALNUM(value))
2948 ANYOF_BITMAP_SET(data->start_class, value);
2953 if (flags & SCF_DO_STCLASS_AND) {
2954 if (data->start_class->flags & ANYOF_LOCALE)
2955 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2958 data->start_class->flags |= ANYOF_LOCALE;
2959 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2963 if (flags & SCF_DO_STCLASS_AND) {
2964 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2965 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2966 for (value = 0; value < 256; value++)
2967 if (!isSPACE(value))
2968 ANYOF_BITMAP_CLEAR(data->start_class, value);
2972 if (data->start_class->flags & ANYOF_LOCALE)
2973 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2975 for (value = 0; value < 256; value++)
2977 ANYOF_BITMAP_SET(data->start_class, value);
2982 if (flags & SCF_DO_STCLASS_AND) {
2983 if (data->start_class->flags & ANYOF_LOCALE)
2984 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2987 data->start_class->flags |= ANYOF_LOCALE;
2988 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2992 if (flags & SCF_DO_STCLASS_AND) {
2993 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2994 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2995 for (value = 0; value < 256; value++)
2997 ANYOF_BITMAP_CLEAR(data->start_class, value);
3001 if (data->start_class->flags & ANYOF_LOCALE)
3002 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3004 for (value = 0; value < 256; value++)
3005 if (!isSPACE(value))
3006 ANYOF_BITMAP_SET(data->start_class, value);
3011 if (flags & SCF_DO_STCLASS_AND) {
3012 if (data->start_class->flags & ANYOF_LOCALE) {
3013 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3014 for (value = 0; value < 256; value++)
3015 if (!isSPACE(value))
3016 ANYOF_BITMAP_CLEAR(data->start_class, value);
3020 data->start_class->flags |= ANYOF_LOCALE;
3021 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3025 if (flags & SCF_DO_STCLASS_AND) {
3026 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3027 for (value = 0; value < 256; value++)
3028 if (!isDIGIT(value))
3029 ANYOF_BITMAP_CLEAR(data->start_class, value);
3032 if (data->start_class->flags & ANYOF_LOCALE)
3033 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3035 for (value = 0; value < 256; value++)
3037 ANYOF_BITMAP_SET(data->start_class, value);
3042 if (flags & SCF_DO_STCLASS_AND) {
3043 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3044 for (value = 0; value < 256; value++)
3046 ANYOF_BITMAP_CLEAR(data->start_class, value);
3049 if (data->start_class->flags & ANYOF_LOCALE)
3050 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3052 for (value = 0; value < 256; value++)
3053 if (!isDIGIT(value))
3054 ANYOF_BITMAP_SET(data->start_class, value);
3059 if (flags & SCF_DO_STCLASS_OR)
3060 cl_and(data->start_class, &and_with);
3061 flags &= ~SCF_DO_STCLASS;
3064 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3065 data->flags |= (OP(scan) == MEOL
3069 else if ( PL_regkind[OP(scan)] == BRANCHJ
3070 /* Lookbehind, or need to calculate parens/evals/stclass: */
3071 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3072 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3073 /* Lookahead/lookbehind */
3074 I32 deltanext, minnext, fake = 0;
3076 struct regnode_charclass_class intrnl;
3079 data_fake.flags = 0;
3081 data_fake.whilem_c = data->whilem_c;
3082 data_fake.last_closep = data->last_closep;
3085 data_fake.last_closep = &fake;
3086 if ( flags & SCF_DO_STCLASS && !scan->flags
3087 && OP(scan) == IFMATCH ) { /* Lookahead */
3088 cl_init(pRExC_state, &intrnl);
3089 data_fake.start_class = &intrnl;
3090 f |= SCF_DO_STCLASS_AND;
3092 if (flags & SCF_WHILEM_VISITED_POS)
3093 f |= SCF_WHILEM_VISITED_POS;
3094 next = regnext(scan);
3095 nscan = NEXTOPER(NEXTOPER(scan));
3096 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3099 vFAIL("Variable length lookbehind not implemented");
3101 else if (minnext > U8_MAX) {
3102 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3104 scan->flags = (U8)minnext;
3107 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3109 if (data_fake.flags & SF_HAS_EVAL)
3110 data->flags |= SF_HAS_EVAL;
3111 data->whilem_c = data_fake.whilem_c;
3113 if (f & SCF_DO_STCLASS_AND) {
3114 const int was = (data->start_class->flags & ANYOF_EOS);
3116 cl_and(data->start_class, &intrnl);
3118 data->start_class->flags |= ANYOF_EOS;
3121 else if (OP(scan) == OPEN) {
3124 else if (OP(scan) == CLOSE) {
3125 if ((I32)ARG(scan) == is_par) {
3126 next = regnext(scan);
3128 if ( next && (OP(next) != WHILEM) && next < last)
3129 is_par = 0; /* Disable optimization */
3132 *(data->last_closep) = ARG(scan);
3134 else if (OP(scan) == EVAL) {
3136 data->flags |= SF_HAS_EVAL;
3138 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3139 if (flags & SCF_DO_SUBSTR) {
3140 scan_commit(pRExC_state,data);
3141 data->longest = &(data->longest_float);
3143 is_inf = is_inf_internal = 1;
3144 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3145 cl_anything(pRExC_state, data->start_class);
3146 flags &= ~SCF_DO_STCLASS;
3148 /* Else: zero-length, ignore. */
3149 scan = regnext(scan);
3154 *deltap = is_inf_internal ? I32_MAX : delta;
3155 if (flags & SCF_DO_SUBSTR && is_inf)
3156 data->pos_delta = I32_MAX - data->pos_min;
3157 if (is_par > U8_MAX)
3159 if (is_par && pars==1 && data) {
3160 data->flags |= SF_IN_PAR;
3161 data->flags &= ~SF_HAS_PAR;
3163 else if (pars && data) {
3164 data->flags |= SF_HAS_PAR;
3165 data->flags &= ~SF_IN_PAR;
3167 if (flags & SCF_DO_STCLASS_OR)
3168 cl_and(data->start_class, &and_with);
3169 if (flags & SCF_EXACT_TRIE)
3170 data->flags |= SCF_EXACT_TRIE;
3175 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3177 if (RExC_rx->data) {
3178 Renewc(RExC_rx->data,
3179 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3180 char, struct reg_data);
3181 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3182 RExC_rx->data->count += n;
3185 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3186 char, struct reg_data);
3187 Newx(RExC_rx->data->what, n, U8);
3188 RExC_rx->data->count = n;
3190 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3191 return RExC_rx->data->count - n;
3194 #ifndef PERL_IN_XSUB_RE
3196 Perl_reginitcolors(pTHX)
3199 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3201 char *t = savepv(s);
3205 t = strchr(t, '\t');
3211 PL_colors[i] = t = (char *)"";
3216 PL_colors[i++] = (char *)"";
3224 - pregcomp - compile a regular expression into internal code
3226 * We can't allocate space until we know how big the compiled form will be,
3227 * but we can't compile it (and thus know how big it is) until we've got a
3228 * place to put the code. So we cheat: we compile it twice, once with code
3229 * generation turned off and size counting turned on, and once "for real".
3230 * This also means that we don't allocate space until we are sure that the
3231 * thing really will compile successfully, and we never have to move the
3232 * code and thus invalidate pointers into it. (Note that it has to be in
3233 * one piece because free() must be able to free it all.) [NB: not true in perl]
3235 * Beware that the optimization-preparation code in here knows about some
3236 * of the structure of the compiled regexp. [I'll say.]
3239 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3250 RExC_state_t RExC_state;
3251 RExC_state_t * const pRExC_state = &RExC_state;
3252 #ifdef TRIE_STUDY_OPT
3254 RExC_state_t copyRExC_state;
3257 GET_RE_DEBUG_FLAGS_DECL;
3260 FAIL("NULL regexp argument");
3262 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3265 DEBUG_r(if (!PL_colorset) reginitcolors());
3267 SV *dsv= sv_newmortal();
3268 RE_PV_QUOTED_DECL(s, RExC_utf8,
3269 dsv, RExC_precomp, (xend - exp), 60);
3270 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3271 PL_colors[4],PL_colors[5],s);
3273 RExC_flags = pm->op_pmflags;
3277 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3278 RExC_seen_evals = 0;
3281 /* First pass: determine size, legality. */
3288 RExC_emit = &PL_regdummy;
3289 RExC_whilem_seen = 0;
3290 #if 0 /* REGC() is (currently) a NOP at the first pass.
3291 * Clever compilers notice this and complain. --jhi */
3292 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3294 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3295 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3296 RExC_precomp = NULL;
3299 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3300 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3301 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3304 RExC_lastparse=NULL;
3308 /* Small enough for pointer-storage convention?
3309 If extralen==0, this means that we will not need long jumps. */
3310 if (RExC_size >= 0x10000L && RExC_extralen)
3311 RExC_size += RExC_extralen;
3314 if (RExC_whilem_seen > 15)
3315 RExC_whilem_seen = 15;
3317 /* Allocate space and initialize. */
3318 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3321 FAIL("Regexp out of space");
3324 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3325 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3328 r->prelen = xend - exp;
3329 r->precomp = savepvn(RExC_precomp, r->prelen);
3331 #ifdef PERL_OLD_COPY_ON_WRITE
3332 r->saved_copy = NULL;
3334 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3335 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3336 r->lastparen = 0; /* mg.c reads this. */
3338 r->substrs = 0; /* Useful during FAIL. */
3339 r->startp = 0; /* Useful during FAIL. */
3340 r->endp = 0; /* Useful during FAIL. */
3342 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3344 r->offsets[0] = RExC_size;
3346 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3347 "%s %"UVuf" bytes for offset annotations.\n",
3348 r->offsets ? "Got" : "Couldn't get",
3349 (UV)((2*RExC_size+1) * sizeof(U32))));
3353 /* Second pass: emit code. */
3354 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3359 RExC_emit_start = r->program;
3360 RExC_emit = r->program;
3361 /* Store the count of eval-groups for security checks: */
3362 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3363 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3365 if (reg(pRExC_state, 0, &flags,1) == NULL)
3367 /* XXXX To minimize changes to RE engine we always allocate
3368 3-units-long substrs field. */
3369 Newx(r->substrs, 1, struct reg_substr_data);
3372 Zero(r->substrs, 1, struct reg_substr_data);
3373 StructCopy(&zero_scan_data, &data, scan_data_t);
3375 #ifdef TRIE_STUDY_OPT
3377 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3378 RExC_state=copyRExC_state;
3379 if (data.longest_fixed)
3380 SvREFCNT_dec(data.longest_fixed);
3381 if (data.longest_float)
3382 SvREFCNT_dec(data.longest_float);
3383 if (data.last_found)
3384 SvREFCNT_dec(data.last_found);
3386 copyRExC_state=RExC_state;
3389 /* Dig out information for optimizations. */
3390 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3391 pm->op_pmflags = RExC_flags;
3393 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3394 r->regstclass = NULL;
3395 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3396 r->reganch |= ROPT_NAUGHTY;
3397 scan = r->program + 1; /* First BRANCH. */
3399 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3400 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3402 STRLEN longest_float_length, longest_fixed_length;
3403 struct regnode_charclass_class ch_class; /* pointed to by data */
3405 I32 last_close = 0; /* pointed to by data */
3408 /* Skip introductions and multiplicators >= 1. */
3409 while ((OP(first) == OPEN && (sawopen = 1)) ||
3410 /* An OR of *one* alternative - should not happen now. */
3411 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3412 /* for now we can't handle lookbehind IFMATCH*/
3413 (OP(first) == IFMATCH && !first->flags) ||
3414 (OP(first) == PLUS) ||
3415 (OP(first) == MINMOD) ||
3416 /* An {n,m} with n>0 */
3417 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3419 DEBUG_PEEP("first:",first,0);
3420 if (OP(first) == PLUS)
3423 first += regarglen[OP(first)];
3424 if (OP(first) == IFMATCH) {
3425 first = NEXTOPER(first);
3426 first += EXTRA_STEP_2ARGS;
3427 } else /* XXX possible optimisation for /(?=)/ */
3428 first = NEXTOPER(first);
3431 /* Starting-point info. */
3433 /* Ignore EXACT as we deal with it later. */
3434 if (PL_regkind[OP(first)] == EXACT) {
3435 if (OP(first) == EXACT)
3436 NOOP; /* Empty, get anchored substr later. */
3437 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3438 r->regstclass = first;
3441 else if (OP(first) == TRIE &&
3442 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3444 /* this can happen only on restudy */
3445 struct regnode_1 *trie_op;
3446 Newxz(trie_op,1,struct regnode_1);
3447 StructCopy(first,trie_op,struct regnode_1);
3448 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3449 r->regstclass = (regnode *)trie_op;
3452 else if (strchr((const char*)PL_simple,OP(first)))
3453 r->regstclass = first;
3454 else if (PL_regkind[OP(first)] == BOUND ||
3455 PL_regkind[OP(first)] == NBOUND)
3456 r->regstclass = first;
3457 else if (PL_regkind[OP(first)] == BOL) {
3458 r->reganch |= (OP(first) == MBOL
3460 : (OP(first) == SBOL
3463 first = NEXTOPER(first);
3466 else if (OP(first) == GPOS) {
3467 r->reganch |= ROPT_ANCH_GPOS;
3468 first = NEXTOPER(first);
3471 else if (!sawopen && (OP(first) == STAR &&
3472 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3473 !(r->reganch & ROPT_ANCH) )
3475 /* turn .* into ^.* with an implied $*=1 */
3477 (OP(NEXTOPER(first)) == REG_ANY)
3480 r->reganch |= type | ROPT_IMPLICIT;
3481 first = NEXTOPER(first);
3484 if (sawplus && (!sawopen || !RExC_sawback)
3485 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3486 /* x+ must match at the 1st pos of run of x's */
3487 r->reganch |= ROPT_SKIP;
3489 /* Scan is after the zeroth branch, first is atomic matcher. */
3490 #ifdef TRIE_STUDY_OPT
3493 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3494 (IV)(first - scan + 1))
3498 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3499 (IV)(first - scan + 1))
3505 * If there's something expensive in the r.e., find the
3506 * longest literal string that must appear and make it the
3507 * regmust. Resolve ties in favor of later strings, since
3508 * the regstart check works with the beginning of the r.e.
3509 * and avoiding duplication strengthens checking. Not a
3510 * strong reason, but sufficient in the absence of others.
3511 * [Now we resolve ties in favor of the earlier string if
3512 * it happens that c_offset_min has been invalidated, since the
3513 * earlier string may buy us something the later one won't.]
3517 data.longest_fixed = newSVpvs("");
3518 data.longest_float = newSVpvs("");
3519 data.last_found = newSVpvs("");
3520 data.longest = &(data.longest_fixed);
3522 if (!r->regstclass) {
3523 cl_init(pRExC_state, &ch_class);
3524 data.start_class = &ch_class;
3525 stclass_flag = SCF_DO_STCLASS_AND;
3526 } else /* XXXX Check for BOUND? */
3528 data.last_closep = &last_close;
3530 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3531 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3533 #ifdef TRIE_STUDY_OPT
3534 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3539 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3540 && data.last_start_min == 0 && data.last_end > 0
3541 && !RExC_seen_zerolen
3542 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3543 r->reganch |= ROPT_CHECK_ALL;
3544 scan_commit(pRExC_state, &data);
3545 SvREFCNT_dec(data.last_found);
3547 longest_float_length = CHR_SVLEN(data.longest_float);
3548 if (longest_float_length
3549 || (data.flags & SF_FL_BEFORE_EOL
3550 && (!(data.flags & SF_FL_BEFORE_MEOL)
3551 || (RExC_flags & PMf_MULTILINE)))) {
3554 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3555 && data.offset_fixed == data.offset_float_min
3556 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3557 goto remove_float; /* As in (a)+. */
3559 if (SvUTF8(data.longest_float)) {
3560 r->float_utf8 = data.longest_float;
3561 r->float_substr = NULL;
3563 r->float_substr = data.longest_float;
3564 r->float_utf8 = NULL;
3566 r->float_min_offset = data.offset_float_min;
3567 r->float_max_offset = data.offset_float_max;
3568 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3569 && (!(data.flags & SF_FL_BEFORE_MEOL)
3570 || (RExC_flags & PMf_MULTILINE)));
3571 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3575 r->float_substr = r->float_utf8 = NULL;
3576 SvREFCNT_dec(data.longest_float);
3577 longest_float_length = 0;
3580 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3581 if (longest_fixed_length
3582 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3583 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3584 || (RExC_flags & PMf_MULTILINE)))) {
3587 if (SvUTF8(data.longest_fixed)) {
3588 r->anchored_utf8 = data.longest_fixed;
3589 r->anchored_substr = NULL;
3591 r->anchored_substr = data.longest_fixed;
3592 r->anchored_utf8 = NULL;
3594 r->anchored_offset = data.offset_fixed;
3595 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3596 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3597 || (RExC_flags & PMf_MULTILINE)));
3598 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3601 r->anchored_substr = r->anchored_utf8 = NULL;
3602 SvREFCNT_dec(data.longest_fixed);
3603 longest_fixed_length = 0;
3606 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3607 r->regstclass = NULL;
3608 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3610 && !(data.start_class->flags & ANYOF_EOS)
3611 && !cl_is_anything(data.start_class))
3613 const I32 n = add_data(pRExC_state, 1, "f");
3615 Newx(RExC_rx->data->data[n], 1,
3616 struct regnode_charclass_class);
3617 StructCopy(data.start_class,
3618 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3619 struct regnode_charclass_class);
3620 r->regstclass = (regnode*)RExC_rx->data->data[n];
3621 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3622 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3623 regprop(r, sv, (regnode*)data.start_class);
3624 PerlIO_printf(Perl_debug_log,
3625 "synthetic stclass \"%s\".\n",
3626 SvPVX_const(sv));});
3629 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3630 if (longest_fixed_length > longest_float_length) {
3631 r->check_substr = r->anchored_substr;
3632 r->check_utf8 = r->anchored_utf8;
3633 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3634 if (r->reganch & ROPT_ANCH_SINGLE)
3635 r->reganch |= ROPT_NOSCAN;
3638 r->check_substr = r->float_substr;
3639 r->check_utf8 = r->float_utf8;
3640 r->check_offset_min = data.offset_float_min;
3641 r->check_offset_max = data.offset_float_max;
3643 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3644 This should be changed ASAP! */
3645 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3646 r->reganch |= RE_USE_INTUIT;
3647 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3648 r->reganch |= RE_INTUIT_TAIL;
3652 /* Several toplevels. Best we can is to set minlen. */
3654 struct regnode_charclass_class ch_class;
3657 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3659 scan = r->program + 1;
3660 cl_init(pRExC_state, &ch_class);
3661 data.start_class = &ch_class;
3662 data.last_closep = &last_close;
3664 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3665 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3667 #ifdef TRIE_STUDY_OPT
3668 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3673 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3674 = r->float_substr = r->float_utf8 = NULL;
3675 if (!(data.start_class->flags & ANYOF_EOS)
3676 && !cl_is_anything(data.start_class))
3678 const I32 n = add_data(pRExC_state, 1, "f");
3680 Newx(RExC_rx->data->data[n], 1,
3681 struct regnode_charclass_class);
3682 StructCopy(data.start_class,
3683 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3684 struct regnode_charclass_class);
3685 r->regstclass = (regnode*)RExC_rx->data->data[n];
3686 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3687 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3688 regprop(r, sv, (regnode*)data.start_class);
3689 PerlIO_printf(Perl_debug_log,
3690 "synthetic stclass \"%s\".\n",
3691 SvPVX_const(sv));});
3696 if (RExC_seen & REG_SEEN_GPOS)
3697 r->reganch |= ROPT_GPOS_SEEN;
3698 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3699 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3700 if (RExC_seen & REG_SEEN_EVAL)
3701 r->reganch |= ROPT_EVAL_SEEN;
3702 if (RExC_seen & REG_SEEN_CANY)
3703 r->reganch |= ROPT_CANY_SEEN;
3704 Newxz(r->startp, RExC_npar, I32);
3705 Newxz(r->endp, RExC_npar, I32);
3707 DEBUG_r( RX_DEBUG_on(r) );
3709 PerlIO_printf(Perl_debug_log,"Final program:\n");
3712 DEBUG_OFFSETS_r(if (r->offsets) {
3713 const U32 len = r->offsets[0];
3715 GET_RE_DEBUG_FLAGS_DECL;
3716 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3717 for (i = 1; i <= len; i++) {
3718 if (r->offsets[i*2-1] || r->offsets[i*2])
3719 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3720 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3722 PerlIO_printf(Perl_debug_log, "\n");
3728 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3729 int rem=(int)(RExC_end - RExC_parse); \
3738 if (RExC_lastparse!=RExC_parse) \
3739 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3742 iscut ? "..." : "<" \
3745 PerlIO_printf(Perl_debug_log,"%16s",""); \
3750 num=REG_NODE_NUM(RExC_emit); \
3751 if (RExC_lastnum!=num) \
3752 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3754 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3755 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3756 (int)((depth*2)), "", \
3760 RExC_lastparse=RExC_parse; \
3765 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3766 DEBUG_PARSE_MSG((funcname)); \
3767 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3770 - reg - regular expression, i.e. main body or parenthesized thing
3772 * Caller must absorb opening parenthesis.
3774 * Combining parenthesis handling with the base level of regular expression
3775 * is a trifle forced, but the need to tie the tails of the branches to what
3776 * follows makes it hard to avoid.
3778 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3780 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3782 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3786 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3787 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3790 register regnode *ret; /* Will be the head of the group. */
3791 register regnode *br;
3792 register regnode *lastbr;
3793 register regnode *ender = NULL;
3794 register I32 parno = 0;
3796 const I32 oregflags = RExC_flags;
3797 bool have_branch = 0;
3800 /* for (?g), (?gc), and (?o) warnings; warning
3801 about (?c) will warn about (?g) -- japhy */
3803 #define WASTED_O 0x01
3804 #define WASTED_G 0x02
3805 #define WASTED_C 0x04
3806 #define WASTED_GC (0x02|0x04)
3807 I32 wastedflags = 0x00;
3809 char * parse_start = RExC_parse; /* MJD */
3810 char * const oregcomp_parse = RExC_parse;
3812 GET_RE_DEBUG_FLAGS_DECL;
3813 DEBUG_PARSE("reg ");
3816 *flagp = 0; /* Tentatively. */
3819 /* Make an OPEN node, if parenthesized. */
3821 if (*RExC_parse == '?') { /* (?...) */
3822 U32 posflags = 0, negflags = 0;
3823 U32 *flagsp = &posflags;
3824 bool is_logical = 0;
3825 const char * const seqstart = RExC_parse;
3828 paren = *RExC_parse++;
3829 ret = NULL; /* For look-ahead/behind. */
3831 case '<': /* (?<...) */
3832 RExC_seen |= REG_SEEN_LOOKBEHIND;
3833 if (*RExC_parse == '!')
3835 if (*RExC_parse != '=' && *RExC_parse != '!')
3838 case '=': /* (?=...) */
3839 case '!': /* (?!...) */
3840 RExC_seen_zerolen++;
3841 case ':': /* (?:...) */
3842 case '>': /* (?>...) */
3844 case '$': /* (?$...) */
3845 case '@': /* (?@...) */
3846 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3848 case '#': /* (?#...) */
3849 while (*RExC_parse && *RExC_parse != ')')
3851 if (*RExC_parse != ')')
3852 FAIL("Sequence (?#... not terminated");
3853 nextchar(pRExC_state);
3856 case 'p': /* (?p...) */
3857 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3858 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3860 case '?': /* (??...) */
3862 if (*RExC_parse != '{')
3864 paren = *RExC_parse++;
3866 case '{': /* (?{...}) */
3868 I32 count = 1, n = 0;
3870 char *s = RExC_parse;
3872 RExC_seen_zerolen++;
3873 RExC_seen |= REG_SEEN_EVAL;
3874 while (count && (c = *RExC_parse)) {
3885 if (*RExC_parse != ')') {
3887 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3891 OP_4tree *sop, *rop;
3892 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3895 Perl_save_re_context(aTHX);
3896 rop = sv_compile_2op(sv, &sop, "re", &pad);
3897 sop->op_private |= OPpREFCOUNTED;
3898 /* re_dup will OpREFCNT_inc */
3899 OpREFCNT_set(sop, 1);
3902 n = add_data(pRExC_state, 3, "nop");
3903 RExC_rx->data->data[n] = (void*)rop;
3904 RExC_rx->data->data[n+1] = (void*)sop;
3905 RExC_rx->data->data[n+2] = (void*)pad;
3908 else { /* First pass */
3909 if (PL_reginterp_cnt < ++RExC_seen_evals
3911 /* No compiled RE interpolated, has runtime
3912 components ===> unsafe. */
3913 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3914 if (PL_tainting && PL_tainted)
3915 FAIL("Eval-group in insecure regular expression");
3916 #if PERL_VERSION > 8
3917 if (IN_PERL_COMPILETIME)
3922 nextchar(pRExC_state);
3924 ret = reg_node(pRExC_state, LOGICAL);
3927 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3928 /* deal with the length of this later - MJD */
3931 ret = reganode(pRExC_state, EVAL, n);
3932 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3933 Set_Node_Offset(ret, parse_start);
3936 case '(': /* (?(?{...})...) and (?(?=...)...) */
3938 if (RExC_parse[0] == '?') { /* (?(?...)) */
3939 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3940 || RExC_parse[1] == '<'
3941 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3944 ret = reg_node(pRExC_state, LOGICAL);
3947 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3951 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3954 parno = atoi(RExC_parse++);
3956 while (isDIGIT(*RExC_parse))
3958 ret = reganode(pRExC_state, GROUPP, parno);
3960 if ((c = *nextchar(pRExC_state)) != ')')
3961 vFAIL("Switch condition not recognized");
3963 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3964 br = regbranch(pRExC_state, &flags, 1,depth+1);
3966 br = reganode(pRExC_state, LONGJMP, 0);
3968 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3969 c = *nextchar(pRExC_state);
3973 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3974 regbranch(pRExC_state, &flags, 1,depth+1);
3975 REGTAIL(pRExC_state, ret, lastbr);
3978 c = *nextchar(pRExC_state);
3983 vFAIL("Switch (?(condition)... contains too many branches");
3984 ender = reg_node(pRExC_state, TAIL);
3985 REGTAIL(pRExC_state, br, ender);
3987 REGTAIL(pRExC_state, lastbr, ender);
3988 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3991 REGTAIL(pRExC_state, ret, ender);
3995 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3999 RExC_parse--; /* for vFAIL to print correctly */
4000 vFAIL("Sequence (? incomplete");
4004 parse_flags: /* (?i) */
4005 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
4006 /* (?g), (?gc) and (?o) are useless here
4007 and must be globally applied -- japhy */
4009 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4010 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4011 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
4012 if (! (wastedflags & wflagbit) ) {
4013 wastedflags |= wflagbit;
4016 "Useless (%s%c) - %suse /%c modifier",
4017 flagsp == &negflags ? "?-" : "?",
4019 flagsp == &negflags ? "don't " : "",
4025 else if (*RExC_parse == 'c') {
4026 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4027 if (! (wastedflags & WASTED_C) ) {
4028 wastedflags |= WASTED_GC;
4031 "Useless (%sc) - %suse /gc modifier",
4032 flagsp == &negflags ? "?-" : "?",
4033 flagsp == &negflags ? "don't " : ""
4038 else { pmflag(flagsp, *RExC_parse); }
4042 if (*RExC_parse == '-') {
4044 wastedflags = 0; /* reset so (?g-c) warns twice */
4048 RExC_flags |= posflags;
4049 RExC_flags &= ~negflags;
4050 if (*RExC_parse == ':') {
4056 if (*RExC_parse != ')') {
4058 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4060 nextchar(pRExC_state);
4068 ret = reganode(pRExC_state, OPEN, parno);
4069 Set_Node_Length(ret, 1); /* MJD */
4070 Set_Node_Offset(ret, RExC_parse); /* MJD */
4077 /* Pick up the branches, linking them together. */
4078 parse_start = RExC_parse; /* MJD */
4079 br = regbranch(pRExC_state, &flags, 1,depth+1);
4080 /* branch_len = (paren != 0); */
4084 if (*RExC_parse == '|') {
4085 if (!SIZE_ONLY && RExC_extralen) {
4086 reginsert(pRExC_state, BRANCHJ, br);
4089 reginsert(pRExC_state, BRANCH, br);
4090 Set_Node_Length(br, paren != 0);
4091 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4095 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4097 else if (paren == ':') {
4098 *flagp |= flags&SIMPLE;
4100 if (is_open) { /* Starts with OPEN. */
4101 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4103 else if (paren != '?') /* Not Conditional */
4105 *flagp |= flags & (SPSTART | HASWIDTH);
4107 while (*RExC_parse == '|') {
4108 if (!SIZE_ONLY && RExC_extralen) {
4109 ender = reganode(pRExC_state, LONGJMP,0);
4110 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4113 RExC_extralen += 2; /* Account for LONGJMP. */
4114 nextchar(pRExC_state);
4115 br = regbranch(pRExC_state, &flags, 0, depth+1);
4119 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4123 *flagp |= flags&SPSTART;
4126 if (have_branch || paren != ':') {
4127 /* Make a closing node, and hook it on the end. */
4130 ender = reg_node(pRExC_state, TAIL);
4133 ender = reganode(pRExC_state, CLOSE, parno);
4134 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4135 Set_Node_Length(ender,1); /* MJD */
4141 *flagp &= ~HASWIDTH;
4144 ender = reg_node(pRExC_state, SUCCEED);
4147 ender = reg_node(pRExC_state, END);
4150 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4152 if (have_branch && !SIZE_ONLY) {
4153 /* Hook the tails of the branches to the closing node. */
4154 for (br = ret; br; br = regnext(br)) {
4155 const U8 op = PL_regkind[OP(br)];
4157 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4159 else if (op == BRANCHJ) {
4160 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4168 static const char parens[] = "=!<,>";
4170 if (paren && (p = strchr(parens, paren))) {
4171 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4172 int flag = (p - parens) > 1;
4175 node = SUSPEND, flag = 0;
4176 reginsert(pRExC_state, node,ret);
4177 Set_Node_Cur_Length(ret);
4178 Set_Node_Offset(ret, parse_start + 1);
4180 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4184 /* Check for proper termination. */
4186 RExC_flags = oregflags;
4187 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4188 RExC_parse = oregcomp_parse;
4189 vFAIL("Unmatched (");
4192 else if (!paren && RExC_parse < RExC_end) {
4193 if (*RExC_parse == ')') {
4195 vFAIL("Unmatched )");
4198 FAIL("Junk on end of regexp"); /* "Can't happen". */
4206 - regbranch - one alternative of an | operator
4208 * Implements the concatenation operator.
4211 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4214 register regnode *ret;
4215 register regnode *chain = NULL;
4216 register regnode *latest;
4217 I32 flags = 0, c = 0;
4218 GET_RE_DEBUG_FLAGS_DECL;
4219 DEBUG_PARSE("brnc");
4223 if (!SIZE_ONLY && RExC_extralen)
4224 ret = reganode(pRExC_state, BRANCHJ,0);
4226 ret = reg_node(pRExC_state, BRANCH);
4227 Set_Node_Length(ret, 1);
4231 if (!first && SIZE_ONLY)
4232 RExC_extralen += 1; /* BRANCHJ */
4234 *flagp = WORST; /* Tentatively. */
4237 nextchar(pRExC_state);
4238 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4240 latest = regpiece(pRExC_state, &flags,depth+1);
4241 if (latest == NULL) {
4242 if (flags & TRYAGAIN)
4246 else if (ret == NULL)
4248 *flagp |= flags&HASWIDTH;
4249 if (chain == NULL) /* First piece. */
4250 *flagp |= flags&SPSTART;
4253 REGTAIL(pRExC_state, chain, latest);
4258 if (chain == NULL) { /* Loop ran zero times. */
4259 chain = reg_node(pRExC_state, NOTHING);
4264 *flagp |= flags&SIMPLE;
4271 - regpiece - something followed by possible [*+?]
4273 * Note that the branching code sequences used for ? and the general cases
4274 * of * and + are somewhat optimized: they use the same NOTHING node as
4275 * both the endmarker for their branch list and the body of the last branch.
4276 * It might seem that this node could be dispensed with entirely, but the
4277 * endmarker role is not redundant.
4280 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4283 register regnode *ret;
4285 register char *next;
4287 const char * const origparse = RExC_parse;
4289 I32 max = REG_INFTY;
4291 GET_RE_DEBUG_FLAGS_DECL;
4292 DEBUG_PARSE("piec");
4294 ret = regatom(pRExC_state, &flags,depth+1);
4296 if (flags & TRYAGAIN)
4303 if (op == '{' && regcurly(RExC_parse)) {
4304 const char *maxpos = NULL;
4305 parse_start = RExC_parse; /* MJD */
4306 next = RExC_parse + 1;
4307 while (isDIGIT(*next) || *next == ',') {
4316 if (*next == '}') { /* got one */
4320 min = atoi(RExC_parse);
4324 maxpos = RExC_parse;
4326 if (!max && *maxpos != '0')
4327 max = REG_INFTY; /* meaning "infinity" */
4328 else if (max >= REG_INFTY)
4329 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4331 nextchar(pRExC_state);
4334 if ((flags&SIMPLE)) {
4335 RExC_naughty += 2 + RExC_naughty / 2;
4336 reginsert(pRExC_state, CURLY, ret);
4337 Set_Node_Offset(ret, parse_start+1); /* MJD */
4338 Set_Node_Cur_Length(ret);
4341 regnode * const w = reg_node(pRExC_state, WHILEM);
4344 REGTAIL(pRExC_state, ret, w);
4345 if (!SIZE_ONLY && RExC_extralen) {
4346 reginsert(pRExC_state, LONGJMP,ret);
4347 reginsert(pRExC_state, NOTHING,ret);
4348 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4350 reginsert(pRExC_state, CURLYX,ret);
4352 Set_Node_Offset(ret, parse_start+1);
4353 Set_Node_Length(ret,
4354 op == '{' ? (RExC_parse - parse_start) : 1);
4356 if (!SIZE_ONLY && RExC_extralen)
4357 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4358 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4360 RExC_whilem_seen++, RExC_extralen += 3;
4361 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4369 if (max && max < min)
4370 vFAIL("Can't do {n,m} with n > m");
4372 ARG1_SET(ret, (U16)min);
4373 ARG2_SET(ret, (U16)max);
4385 #if 0 /* Now runtime fix should be reliable. */
4387 /* if this is reinstated, don't forget to put this back into perldiag:
4389 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4391 (F) The part of the regexp subject to either the * or + quantifier
4392 could match an empty string. The {#} shows in the regular
4393 expression about where the problem was discovered.
4397 if (!(flags&HASWIDTH) && op != '?')
4398 vFAIL("Regexp *+ operand could be empty");
4401 parse_start = RExC_parse;
4402 nextchar(pRExC_state);
4404 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4406 if (op == '*' && (flags&SIMPLE)) {
4407 reginsert(pRExC_state, STAR, ret);
4411 else if (op == '*') {
4415 else if (op == '+' && (flags&SIMPLE)) {
4416 reginsert(pRExC_state, PLUS, ret);
4420 else if (op == '+') {
4424 else if (op == '?') {
4429 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4431 "%.*s matches null string many times",
4432 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4436 if (*RExC_parse == '?') {
4437 nextchar(pRExC_state);
4438 reginsert(pRExC_state, MINMOD, ret);
4439 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4441 if (ISMULT2(RExC_parse)) {
4443 vFAIL("Nested quantifiers");
4450 - regatom - the lowest level
4452 * Optimization: gobbles an entire sequence of ordinary characters so that
4453 * it can turn them into a single node, which is smaller to store and
4454 * faster to run. Backslashed characters are exceptions, each becoming a
4455 * separate node; the code is simpler that way and it's not worth fixing.
4457 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4458 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4461 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4464 register regnode *ret = NULL;
4466 char *parse_start = RExC_parse;
4467 GET_RE_DEBUG_FLAGS_DECL;
4468 DEBUG_PARSE("atom");
4469 *flagp = WORST; /* Tentatively. */
4472 switch (*RExC_parse) {
4474 RExC_seen_zerolen++;
4475 nextchar(pRExC_state);
4476 if (RExC_flags & PMf_MULTILINE)
4477 ret = reg_node(pRExC_state, MBOL);
4478 else if (RExC_flags & PMf_SINGLELINE)
4479 ret = reg_node(pRExC_state, SBOL);
4481 ret = reg_node(pRExC_state, BOL);
4482 Set_Node_Length(ret, 1); /* MJD */
4485 nextchar(pRExC_state);
4487 RExC_seen_zerolen++;
4488 if (RExC_flags & PMf_MULTILINE)
4489 ret = reg_node(pRExC_state, MEOL);
4490 else if (RExC_flags & PMf_SINGLELINE)
4491 ret = reg_node(pRExC_state, SEOL);
4493 ret = reg_node(pRExC_state, EOL);
4494 Set_Node_Length(ret, 1); /* MJD */
4497 nextchar(pRExC_state);
4498 if (RExC_flags & PMf_SINGLELINE)
4499 ret = reg_node(pRExC_state, SANY);
4501 ret = reg_node(pRExC_state, REG_ANY);
4502 *flagp |= HASWIDTH|SIMPLE;
4504 Set_Node_Length(ret, 1); /* MJD */
4508 char * const oregcomp_parse = ++RExC_parse;
4509 ret = regclass(pRExC_state,depth+1);
4510 if (*RExC_parse != ']') {
4511 RExC_parse = oregcomp_parse;
4512 vFAIL("Unmatched [");
4514 nextchar(pRExC_state);
4515 *flagp |= HASWIDTH|SIMPLE;
4516 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4520 nextchar(pRExC_state);
4521 ret = reg(pRExC_state, 1, &flags,depth+1);
4523 if (flags & TRYAGAIN) {
4524 if (RExC_parse == RExC_end) {
4525 /* Make parent create an empty node if needed. */
4533 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4537 if (flags & TRYAGAIN) {
4541 vFAIL("Internal urp");
4542 /* Supposed to be caught earlier. */
4545 if (!regcurly(RExC_parse)) {
4554 vFAIL("Quantifier follows nothing");
4557 switch (*++RExC_parse) {
4559 RExC_seen_zerolen++;
4560 ret = reg_node(pRExC_state, SBOL);
4562 nextchar(pRExC_state);
4563 Set_Node_Length(ret, 2); /* MJD */
4566 ret = reg_node(pRExC_state, GPOS);
4567 RExC_seen |= REG_SEEN_GPOS;
4569 nextchar(pRExC_state);
4570 Set_Node_Length(ret, 2); /* MJD */
4573 ret = reg_node(pRExC_state, SEOL);
4575 RExC_seen_zerolen++; /* Do not optimize RE away */
4576 nextchar(pRExC_state);
4579 ret = reg_node(pRExC_state, EOS);
4581 RExC_seen_zerolen++; /* Do not optimize RE away */
4582 nextchar(pRExC_state);
4583 Set_Node_Length(ret, 2); /* MJD */
4586 ret = reg_node(pRExC_state, CANY);
4587 RExC_seen |= REG_SEEN_CANY;
4588 *flagp |= HASWIDTH|SIMPLE;
4589 nextchar(pRExC_state);
4590 Set_Node_Length(ret, 2); /* MJD */
4593 ret = reg_node(pRExC_state, CLUMP);
4595 nextchar(pRExC_state);
4596 Set_Node_Length(ret, 2); /* MJD */
4599 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4600 *flagp |= HASWIDTH|SIMPLE;
4601 nextchar(pRExC_state);
4602 Set_Node_Length(ret, 2); /* MJD */
4605 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4606 *flagp |= HASWIDTH|SIMPLE;
4607 nextchar(pRExC_state);
4608 Set_Node_Length(ret, 2); /* MJD */
4611 RExC_seen_zerolen++;
4612 RExC_seen |= REG_SEEN_LOOKBEHIND;
4613 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4615 nextchar(pRExC_state);
4616 Set_Node_Length(ret, 2); /* MJD */
4619 RExC_seen_zerolen++;
4620 RExC_seen |= REG_SEEN_LOOKBEHIND;
4621 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4623 nextchar(pRExC_state);
4624 Set_Node_Length(ret, 2); /* MJD */
4627 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4628 *flagp |= HASWIDTH|SIMPLE;
4629 nextchar(pRExC_state);
4630 Set_Node_Length(ret, 2); /* MJD */
4633 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4634 *flagp |= HASWIDTH|SIMPLE;
4635 nextchar(pRExC_state);
4636 Set_Node_Length(ret, 2); /* MJD */
4639 ret = reg_node(pRExC_state, DIGIT);
4640 *flagp |= HASWIDTH|SIMPLE;
4641 nextchar(pRExC_state);
4642 Set_Node_Length(ret, 2); /* MJD */
4645 ret = reg_node(pRExC_state, NDIGIT);
4646 *flagp |= HASWIDTH|SIMPLE;
4647 nextchar(pRExC_state);
4648 Set_Node_Length(ret, 2); /* MJD */
4653 char* const oldregxend = RExC_end;
4654 char* parse_start = RExC_parse - 2;
4656 if (RExC_parse[1] == '{') {
4657 /* a lovely hack--pretend we saw [\pX] instead */
4658 RExC_end = strchr(RExC_parse, '}');
4660 const U8 c = (U8)*RExC_parse;
4662 RExC_end = oldregxend;
4663 vFAIL2("Missing right brace on \\%c{}", c);
4668 RExC_end = RExC_parse + 2;
4669 if (RExC_end > oldregxend)
4670 RExC_end = oldregxend;
4674 ret = regclass(pRExC_state,depth+1);
4676 RExC_end = oldregxend;
4679 Set_Node_Offset(ret, parse_start + 2);
4680 Set_Node_Cur_Length(ret);
4681 nextchar(pRExC_state);
4682 *flagp |= HASWIDTH|SIMPLE;
4695 case '1': case '2': case '3': case '4':
4696 case '5': case '6': case '7': case '8': case '9':
4698 const I32 num = atoi(RExC_parse);
4700 if (num > 9 && num >= RExC_npar)
4703 char * const parse_start = RExC_parse - 1; /* MJD */
4704 while (isDIGIT(*RExC_parse))
4707 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4708 vFAIL("Reference to nonexistent group");
4710 ret = reganode(pRExC_state,
4711 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4715 /* override incorrect value set in reganode MJD */
4716 Set_Node_Offset(ret, parse_start+1);
4717 Set_Node_Cur_Length(ret); /* MJD */
4719 nextchar(pRExC_state);
4724 if (RExC_parse >= RExC_end)
4725 FAIL("Trailing \\");
4728 /* Do not generate "unrecognized" warnings here, we fall
4729 back into the quick-grab loop below */
4736 if (RExC_flags & PMf_EXTENDED) {
4737 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4739 if (RExC_parse < RExC_end)
4745 register STRLEN len;
4750 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4752 parse_start = RExC_parse - 1;
4758 ret = reg_node(pRExC_state,
4759 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4761 for (len = 0, p = RExC_parse - 1;
4762 len < 127 && p < RExC_end;
4765 char * const oldp = p;
4767 if (RExC_flags & PMf_EXTENDED)
4768 p = regwhite(p, RExC_end);
4815 ender = ASCII_TO_NATIVE('\033');
4819 ender = ASCII_TO_NATIVE('\007');
4824 char* const e = strchr(p, '}');
4828 vFAIL("Missing right brace on \\x{}");
4831 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4832 | PERL_SCAN_DISALLOW_PREFIX;
4833 STRLEN numlen = e - p - 1;
4834 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4841 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4843 ender = grok_hex(p, &numlen, &flags, NULL);
4849 ender = UCHARAT(p++);
4850 ender = toCTRL(ender);
4852 case '0': case '1': case '2': case '3':case '4':
4853 case '5': case '6': case '7': case '8':case '9':
4855 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4858 ender = grok_oct(p, &numlen, &flags, NULL);
4868 FAIL("Trailing \\");
4871 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4872 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4873 goto normal_default;
4878 if (UTF8_IS_START(*p) && UTF) {
4880 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4881 &numlen, UTF8_ALLOW_DEFAULT);
4888 if (RExC_flags & PMf_EXTENDED)
4889 p = regwhite(p, RExC_end);
4891 /* Prime the casefolded buffer. */
4892 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4894 if (ISMULT2(p)) { /* Back off on ?+*. */
4899 /* Emit all the Unicode characters. */
4901 for (foldbuf = tmpbuf;
4903 foldlen -= numlen) {
4904 ender = utf8_to_uvchr(foldbuf, &numlen);
4906 const STRLEN unilen = reguni(pRExC_state, ender, s);
4909 /* In EBCDIC the numlen
4910 * and unilen can differ. */
4912 if (numlen >= foldlen)
4916 break; /* "Can't happen." */
4920 const STRLEN unilen = reguni(pRExC_state, ender, s);
4929 REGC((char)ender, s++);
4935 /* Emit all the Unicode characters. */
4937 for (foldbuf = tmpbuf;
4939 foldlen -= numlen) {
4940 ender = utf8_to_uvchr(foldbuf, &numlen);
4942 const STRLEN unilen = reguni(pRExC_state, ender, s);
4945 /* In EBCDIC the numlen
4946 * and unilen can differ. */
4948 if (numlen >= foldlen)
4956 const STRLEN unilen = reguni(pRExC_state, ender, s);
4965 REGC((char)ender, s++);
4969 Set_Node_Cur_Length(ret); /* MJD */
4970 nextchar(pRExC_state);
4972 /* len is STRLEN which is unsigned, need to copy to signed */
4975 vFAIL("Internal disaster");
4979 if (len == 1 && UNI_IS_INVARIANT(ender))
4983 RExC_size += STR_SZ(len);
4986 RExC_emit += STR_SZ(len);
4992 /* If the encoding pragma is in effect recode the text of
4993 * any EXACT-kind nodes. */
4994 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4995 const STRLEN oldlen = STR_LEN(ret);
4996 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
5000 if (sv_utf8_downgrade(sv, TRUE)) {
5001 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5002 const STRLEN newlen = SvCUR(sv);
5007 GET_RE_DEBUG_FLAGS_DECL;
5008 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
5009 (int)oldlen, STRING(ret),
5011 Copy(s, STRING(ret), newlen, char);
5012 STR_LEN(ret) += newlen - oldlen;
5013 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5015 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5023 S_regwhite(char *p, const char *e)
5028 else if (*p == '#') {
5031 } while (p < e && *p != '\n');
5039 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5040 Character classes ([:foo:]) can also be negated ([:^foo:]).
5041 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5042 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
5043 but trigger failures because they are currently unimplemented. */
5045 #define POSIXCC_DONE(c) ((c) == ':')
5046 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5047 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5050 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5053 I32 namedclass = OOB_NAMEDCLASS;
5055 if (value == '[' && RExC_parse + 1 < RExC_end &&
5056 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5057 POSIXCC(UCHARAT(RExC_parse))) {
5058 const char c = UCHARAT(RExC_parse);
5059 char* const s = RExC_parse++;
5061 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5063 if (RExC_parse == RExC_end)
5064 /* Grandfather lone [:, [=, [. */
5067 const char* const t = RExC_parse++; /* skip over the c */
5070 if (UCHARAT(RExC_parse) == ']') {
5071 const char *posixcc = s + 1;
5072 RExC_parse++; /* skip over the ending ] */
5075 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5076 const I32 skip = t - posixcc;
5078 /* Initially switch on the length of the name. */
5081 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5082 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5085 /* Names all of length 5. */
5086 /* alnum alpha ascii blank cntrl digit graph lower
5087 print punct space upper */
5088 /* Offset 4 gives the best switch position. */
5089 switch (posixcc[4]) {
5091 if (memEQ(posixcc, "alph", 4)) /* alpha */
5092 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5095 if (memEQ(posixcc, "spac", 4)) /* space */
5096 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5099 if (memEQ(posixcc, "grap", 4)) /* graph */
5100 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5103 if (memEQ(posixcc, "asci", 4)) /* ascii */
5104 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5107 if (memEQ(posixcc, "blan", 4)) /* blank */
5108 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5111 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5112 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5115 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5116 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5119 if (memEQ(posixcc, "lowe", 4)) /* lower */
5120 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5121 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5122 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5125 if (memEQ(posixcc, "digi", 4)) /* digit */
5126 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5127 else if (memEQ(posixcc, "prin", 4)) /* print */
5128 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5129 else if (memEQ(posixcc, "punc", 4)) /* punct */
5130 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5135 if (memEQ(posixcc, "xdigit", 6))
5136 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5140 if (namedclass == OOB_NAMEDCLASS)
5141 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5143 assert (posixcc[skip] == ':');
5144 assert (posixcc[skip+1] == ']');
5145 } else if (!SIZE_ONLY) {
5146 /* [[=foo=]] and [[.foo.]] are still future. */
5148 /* adjust RExC_parse so the warning shows after
5150 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5152 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5155 /* Maternal grandfather:
5156 * "[:" ending in ":" but not in ":]" */
5166 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5169 if (POSIXCC(UCHARAT(RExC_parse))) {
5170 const char *s = RExC_parse;
5171 const char c = *s++;
5175 if (*s && c == *s && s[1] == ']') {
5176 if (ckWARN(WARN_REGEXP))
5178 "POSIX syntax [%c %c] belongs inside character classes",
5181 /* [[=foo=]] and [[.foo.]] are still future. */
5182 if (POSIXCC_NOTYET(c)) {
5183 /* adjust RExC_parse so the error shows after
5185 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5187 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5195 parse a class specification and produce either an ANYOF node that
5196 matches the pattern. If the pattern matches a single char only and
5197 that char is < 256 then we produce an EXACT node instead.
5200 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5204 register UV nextvalue;
5205 register IV prevvalue = OOB_UNICODE;
5206 register IV range = 0;
5207 register regnode *ret;
5210 char *rangebegin = NULL;
5211 bool need_class = 0;
5214 bool optimize_invert = TRUE;
5215 AV* unicode_alternate = NULL;
5217 UV literal_endpoint = 0;
5219 UV stored = 0; /* number of chars stored in the class */
5221 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5222 case we need to change the emitted regop to an EXACT. */
5223 const char * orig_parse = RExC_parse;
5224 GET_RE_DEBUG_FLAGS_DECL;
5226 PERL_UNUSED_ARG(depth);
5229 DEBUG_PARSE("clas");
5231 /* Assume we are going to generate an ANYOF node. */
5232 ret = reganode(pRExC_state, ANYOF, 0);
5235 ANYOF_FLAGS(ret) = 0;
5237 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5241 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5245 RExC_size += ANYOF_SKIP;
5246 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5249 RExC_emit += ANYOF_SKIP;
5251 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5253 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5254 ANYOF_BITMAP_ZERO(ret);
5255 listsv = newSVpvs("# comment\n");
5258 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5260 if (!SIZE_ONLY && POSIXCC(nextvalue))
5261 checkposixcc(pRExC_state);
5263 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5264 if (UCHARAT(RExC_parse) == ']')
5267 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5271 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5274 rangebegin = RExC_parse;
5276 value = utf8n_to_uvchr((U8*)RExC_parse,
5277 RExC_end - RExC_parse,
5278 &numlen, UTF8_ALLOW_DEFAULT);
5279 RExC_parse += numlen;
5282 value = UCHARAT(RExC_parse++);
5284 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5285 if (value == '[' && POSIXCC(nextvalue))
5286 namedclass = regpposixcc(pRExC_state, value);
5287 else if (value == '\\') {
5289 value = utf8n_to_uvchr((U8*)RExC_parse,
5290 RExC_end - RExC_parse,
5291 &numlen, UTF8_ALLOW_DEFAULT);
5292 RExC_parse += numlen;
5295 value = UCHARAT(RExC_parse++);
5296 /* Some compilers cannot handle switching on 64-bit integer
5297 * values, therefore value cannot be an UV. Yes, this will
5298 * be a problem later if we want switch on Unicode.
5299 * A similar issue a little bit later when switching on
5300 * namedclass. --jhi */
5301 switch ((I32)value) {
5302 case 'w': namedclass = ANYOF_ALNUM; break;
5303 case 'W': namedclass = ANYOF_NALNUM; break;
5304 case 's': namedclass = ANYOF_SPACE; break;
5305 case 'S': namedclass = ANYOF_NSPACE; break;
5306 case 'd': namedclass = ANYOF_DIGIT; break;
5307 case 'D': namedclass = ANYOF_NDIGIT; break;
5312 if (RExC_parse >= RExC_end)
5313 vFAIL2("Empty \\%c{}", (U8)value);
5314 if (*RExC_parse == '{') {
5315 const U8 c = (U8)value;
5316 e = strchr(RExC_parse++, '}');
5318 vFAIL2("Missing right brace on \\%c{}", c);
5319 while (isSPACE(UCHARAT(RExC_parse)))
5321 if (e == RExC_parse)
5322 vFAIL2("Empty \\%c{}", c);
5324 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5332 if (UCHARAT(RExC_parse) == '^') {
5335 value = value == 'p' ? 'P' : 'p'; /* toggle */
5336 while (isSPACE(UCHARAT(RExC_parse))) {
5341 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5342 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5345 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5346 namedclass = ANYOF_MAX; /* no official name, but it's named */
5349 case 'n': value = '\n'; break;
5350 case 'r': value = '\r'; break;
5351 case 't': value = '\t'; break;
5352 case 'f': value = '\f'; break;
5353 case 'b': value = '\b'; break;
5354 case 'e': value = ASCII_TO_NATIVE('\033');break;
5355 case 'a': value = ASCII_TO_NATIVE('\007');break;
5357 if (*RExC_parse == '{') {
5358 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5359 | PERL_SCAN_DISALLOW_PREFIX;
5360 char * const e = strchr(RExC_parse++, '}');
5362 vFAIL("Missing right brace on \\x{}");
5364 numlen = e - RExC_parse;
5365 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5369 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5371 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5372 RExC_parse += numlen;
5376 value = UCHARAT(RExC_parse++);
5377 value = toCTRL(value);
5379 case '0': case '1': case '2': case '3': case '4':
5380 case '5': case '6': case '7': case '8': case '9':
5384 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5385 RExC_parse += numlen;
5389 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5391 "Unrecognized escape \\%c in character class passed through",
5395 } /* end of \blah */
5401 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5403 if (!SIZE_ONLY && !need_class)
5404 ANYOF_CLASS_ZERO(ret);
5408 /* a bad range like a-\d, a-[:digit:] ? */
5411 if (ckWARN(WARN_REGEXP)) {
5413 RExC_parse >= rangebegin ?
5414 RExC_parse - rangebegin : 0;
5416 "False [] range \"%*.*s\"",
5419 if (prevvalue < 256) {
5420 ANYOF_BITMAP_SET(ret, prevvalue);
5421 ANYOF_BITMAP_SET(ret, '-');
5424 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5425 Perl_sv_catpvf(aTHX_ listsv,
5426 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5430 range = 0; /* this was not a true range */
5434 const char *what = NULL;
5437 if (namedclass > OOB_NAMEDCLASS)
5438 optimize_invert = FALSE;
5439 /* Possible truncation here but in some 64-bit environments
5440 * the compiler gets heartburn about switch on 64-bit values.
5441 * A similar issue a little earlier when switching on value.
5443 switch ((I32)namedclass) {
5446 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5448 for (value = 0; value < 256; value++)
5450 ANYOF_BITMAP_SET(ret, value);
5457 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5459 for (value = 0; value < 256; value++)
5460 if (!isALNUM(value))
5461 ANYOF_BITMAP_SET(ret, value);
5468 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5470 for (value = 0; value < 256; value++)
5471 if (isALNUMC(value))
5472 ANYOF_BITMAP_SET(ret, value);
5479 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5481 for (value = 0; value < 256; value++)
5482 if (!isALNUMC(value))
5483 ANYOF_BITMAP_SET(ret, value);
5490 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5492 for (value = 0; value < 256; value++)
5494 ANYOF_BITMAP_SET(ret, value);
5501 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5503 for (value = 0; value < 256; value++)
5504 if (!isALPHA(value))
5505 ANYOF_BITMAP_SET(ret, value);
5512 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5515 for (value = 0; value < 128; value++)
5516 ANYOF_BITMAP_SET(ret, value);
5518 for (value = 0; value < 256; value++) {
5520 ANYOF_BITMAP_SET(ret, value);
5529 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5532 for (value = 128; value < 256; value++)
5533 ANYOF_BITMAP_SET(ret, value);
5535 for (value = 0; value < 256; value++) {
5536 if (!isASCII(value))
5537 ANYOF_BITMAP_SET(ret, value);
5546 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5548 for (value = 0; value < 256; value++)
5550 ANYOF_BITMAP_SET(ret, value);
5557 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5559 for (value = 0; value < 256; value++)
5560 if (!isBLANK(value))
5561 ANYOF_BITMAP_SET(ret, value);
5568 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5570 for (value = 0; value < 256; value++)
5572 ANYOF_BITMAP_SET(ret, value);
5579 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5581 for (value = 0; value < 256; value++)
5582 if (!isCNTRL(value))
5583 ANYOF_BITMAP_SET(ret, value);
5590 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5592 /* consecutive digits assumed */
5593 for (value = '0'; value <= '9'; value++)
5594 ANYOF_BITMAP_SET(ret, value);
5601 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5603 /* consecutive digits assumed */
5604 for (value = 0; value < '0'; value++)
5605 ANYOF_BITMAP_SET(ret, value);
5606 for (value = '9' + 1; value < 256; value++)
5607 ANYOF_BITMAP_SET(ret, value);
5614 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5616 for (value = 0; value < 256; value++)
5618 ANYOF_BITMAP_SET(ret, value);
5625 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5627 for (value = 0; value < 256; value++)
5628 if (!isGRAPH(value))
5629 ANYOF_BITMAP_SET(ret, value);
5636 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5638 for (value = 0; value < 256; value++)
5640 ANYOF_BITMAP_SET(ret, value);
5647 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5649 for (value = 0; value < 256; value++)
5650 if (!isLOWER(value))
5651 ANYOF_BITMAP_SET(ret, value);
5658 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5660 for (value = 0; value < 256; value++)
5662 ANYOF_BITMAP_SET(ret, value);
5669 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5671 for (value = 0; value < 256; value++)
5672 if (!isPRINT(value))
5673 ANYOF_BITMAP_SET(ret, value);
5680 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5682 for (value = 0; value < 256; value++)
5683 if (isPSXSPC(value))
5684 ANYOF_BITMAP_SET(ret, value);
5691 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5693 for (value = 0; value < 256; value++)
5694 if (!isPSXSPC(value))
5695 ANYOF_BITMAP_SET(ret, value);
5702 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5704 for (value = 0; value < 256; value++)
5706 ANYOF_BITMAP_SET(ret, value);
5713 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5715 for (value = 0; value < 256; value++)
5716 if (!isPUNCT(value))
5717 ANYOF_BITMAP_SET(ret, value);
5724 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5726 for (value = 0; value < 256; value++)
5728 ANYOF_BITMAP_SET(ret, value);
5735 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5737 for (value = 0; value < 256; value++)
5738 if (!isSPACE(value))
5739 ANYOF_BITMAP_SET(ret, value);
5746 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5748 for (value = 0; value < 256; value++)
5750 ANYOF_BITMAP_SET(ret, value);
5757 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5759 for (value = 0; value < 256; value++)
5760 if (!isUPPER(value))
5761 ANYOF_BITMAP_SET(ret, value);
5768 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5770 for (value = 0; value < 256; value++)
5771 if (isXDIGIT(value))
5772 ANYOF_BITMAP_SET(ret, value);
5779 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5781 for (value = 0; value < 256; value++)
5782 if (!isXDIGIT(value))
5783 ANYOF_BITMAP_SET(ret, value);
5789 /* this is to handle \p and \P */
5792 vFAIL("Invalid [::] class");
5796 /* Strings such as "+utf8::isWord\n" */
5797 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5800 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5803 } /* end of namedclass \blah */
5806 if (prevvalue > (IV)value) /* b-a */ {
5807 const int w = RExC_parse - rangebegin;
5808 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5809 range = 0; /* not a valid range */
5813 prevvalue = value; /* save the beginning of the range */
5814 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5815 RExC_parse[1] != ']') {
5818 /* a bad range like \w-, [:word:]- ? */
5819 if (namedclass > OOB_NAMEDCLASS) {
5820 if (ckWARN(WARN_REGEXP)) {
5822 RExC_parse >= rangebegin ?
5823 RExC_parse - rangebegin : 0;
5825 "False [] range \"%*.*s\"",
5829 ANYOF_BITMAP_SET(ret, '-');
5831 range = 1; /* yeah, it's a range! */
5832 continue; /* but do it the next time */
5836 /* now is the next time */
5837 /*stored += (value - prevvalue + 1);*/
5839 if (prevvalue < 256) {
5840 const IV ceilvalue = value < 256 ? value : 255;
5843 /* In EBCDIC [\x89-\x91] should include
5844 * the \x8e but [i-j] should not. */
5845 if (literal_endpoint == 2 &&
5846 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5847 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5849 if (isLOWER(prevvalue)) {
5850 for (i = prevvalue; i <= ceilvalue; i++)
5852 ANYOF_BITMAP_SET(ret, i);
5854 for (i = prevvalue; i <= ceilvalue; i++)
5856 ANYOF_BITMAP_SET(ret, i);
5861 for (i = prevvalue; i <= ceilvalue; i++) {
5862 if (!ANYOF_BITMAP_TEST(ret,i)) {
5864 ANYOF_BITMAP_SET(ret, i);
5868 if (value > 255 || UTF) {
5869 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5870 const UV natvalue = NATIVE_TO_UNI(value);
5871 stored+=2; /* can't optimize this class */
5872 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5873 if (prevnatvalue < natvalue) { /* what about > ? */
5874 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5875 prevnatvalue, natvalue);
5877 else if (prevnatvalue == natvalue) {
5878 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5880 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5882 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5884 /* If folding and foldable and a single
5885 * character, insert also the folded version
5886 * to the charclass. */
5888 if (foldlen == (STRLEN)UNISKIP(f))
5889 Perl_sv_catpvf(aTHX_ listsv,
5892 /* Any multicharacter foldings
5893 * require the following transform:
5894 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5895 * where E folds into "pq" and F folds
5896 * into "rst", all other characters
5897 * fold to single characters. We save
5898 * away these multicharacter foldings,
5899 * to be later saved as part of the
5900 * additional "s" data. */
5903 if (!unicode_alternate)
5904 unicode_alternate = newAV();
5905 sv = newSVpvn((char*)foldbuf, foldlen);
5907 av_push(unicode_alternate, sv);
5911 /* If folding and the value is one of the Greek
5912 * sigmas insert a few more sigmas to make the
5913 * folding rules of the sigmas to work right.
5914 * Note that not all the possible combinations
5915 * are handled here: some of them are handled
5916 * by the standard folding rules, and some of
5917 * them (literal or EXACTF cases) are handled
5918 * during runtime in regexec.c:S_find_byclass(). */
5919 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5920 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5921 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5922 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5923 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5925 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5926 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5927 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5932 literal_endpoint = 0;
5936 range = 0; /* this range (if it was one) is done now */
5940 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5942 RExC_size += ANYOF_CLASS_ADD_SKIP;
5944 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5950 /****** !SIZE_ONLY AFTER HERE *********/
5952 if( stored == 1 && value < 256
5953 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5955 /* optimize single char class to an EXACT node
5956 but *only* when its not a UTF/high char */
5957 const char * cur_parse= RExC_parse;
5958 RExC_emit = (regnode *)orig_emit;
5959 RExC_parse = (char *)orig_parse;
5960 ret = reg_node(pRExC_state,
5961 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5962 RExC_parse = (char *)cur_parse;
5963 *STRING(ret)= (char)value;
5965 RExC_emit += STR_SZ(1);
5968 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5969 if ( /* If the only flag is folding (plus possibly inversion). */
5970 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5972 for (value = 0; value < 256; ++value) {
5973 if (ANYOF_BITMAP_TEST(ret, value)) {
5974 UV fold = PL_fold[value];
5977 ANYOF_BITMAP_SET(ret, fold);
5980 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5983 /* optimize inverted simple patterns (e.g. [^a-z]) */
5984 if (optimize_invert &&
5985 /* If the only flag is inversion. */
5986 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5987 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5988 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5989 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5992 AV * const av = newAV();
5994 /* The 0th element stores the character class description
5995 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5996 * to initialize the appropriate swash (which gets stored in
5997 * the 1st element), and also useful for dumping the regnode.
5998 * The 2nd element stores the multicharacter foldings,
5999 * used later (regexec.c:S_reginclass()). */
6000 av_store(av, 0, listsv);
6001 av_store(av, 1, NULL);
6002 av_store(av, 2, (SV*)unicode_alternate);
6003 rv = newRV_noinc((SV*)av);
6004 n = add_data(pRExC_state, 1, "s");
6005 RExC_rx->data->data[n] = (void*)rv;
6012 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
6014 char* const retval = RExC_parse++;
6017 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6018 RExC_parse[2] == '#') {
6019 while (*RExC_parse != ')') {
6020 if (RExC_parse == RExC_end)
6021 FAIL("Sequence (?#... not terminated");
6027 if (RExC_flags & PMf_EXTENDED) {
6028 if (isSPACE(*RExC_parse)) {
6032 else if (*RExC_parse == '#') {
6033 while (RExC_parse < RExC_end)
6034 if (*RExC_parse++ == '\n') break;
6043 - reg_node - emit a node
6045 STATIC regnode * /* Location. */
6046 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
6049 register regnode *ptr;
6050 regnode * const ret = RExC_emit;
6051 GET_RE_DEBUG_FLAGS_DECL;
6054 SIZE_ALIGN(RExC_size);
6058 NODE_ALIGN_FILL(ret);
6060 FILL_ADVANCE_NODE(ptr, op);
6061 if (RExC_offsets) { /* MJD */
6062 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6063 "reg_node", __LINE__,
6065 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6066 ? "Overwriting end of array!\n" : "OK",
6067 (UV)(RExC_emit - RExC_emit_start),
6068 (UV)(RExC_parse - RExC_start),
6069 (UV)RExC_offsets[0]));
6070 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6079 - reganode - emit a node with an argument
6081 STATIC regnode * /* Location. */
6082 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6085 register regnode *ptr;
6086 regnode * const ret = RExC_emit;
6087 GET_RE_DEBUG_FLAGS_DECL;
6090 SIZE_ALIGN(RExC_size);
6095 NODE_ALIGN_FILL(ret);
6097 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6098 if (RExC_offsets) { /* MJD */
6099 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6103 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6104 "Overwriting end of array!\n" : "OK",
6105 (UV)(RExC_emit - RExC_emit_start),
6106 (UV)(RExC_parse - RExC_start),
6107 (UV)RExC_offsets[0]));
6108 Set_Cur_Node_Offset;
6117 - reguni - emit (if appropriate) a Unicode character
6120 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6123 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6127 - reginsert - insert an operator in front of already-emitted operand
6129 * Means relocating the operand.
6132 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6135 register regnode *src;
6136 register regnode *dst;
6137 register regnode *place;
6138 const int offset = regarglen[(U8)op];
6139 GET_RE_DEBUG_FLAGS_DECL;
6140 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6143 RExC_size += NODE_STEP_REGNODE + offset;
6148 RExC_emit += NODE_STEP_REGNODE + offset;
6150 while (src > opnd) {
6151 StructCopy(--src, --dst, regnode);
6152 if (RExC_offsets) { /* MJD 20010112 */
6153 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6157 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6158 ? "Overwriting end of array!\n" : "OK",
6159 (UV)(src - RExC_emit_start),
6160 (UV)(dst - RExC_emit_start),
6161 (UV)RExC_offsets[0]));
6162 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6163 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6168 place = opnd; /* Op node, where operand used to be. */
6169 if (RExC_offsets) { /* MJD */
6170 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6174 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6175 ? "Overwriting end of array!\n" : "OK",
6176 (UV)(place - RExC_emit_start),
6177 (UV)(RExC_parse - RExC_start),
6179 Set_Node_Offset(place, RExC_parse);
6180 Set_Node_Length(place, 1);
6182 src = NEXTOPER(place);
6183 FILL_ADVANCE_NODE(place, op);
6184 Zero(src, offset, regnode);
6188 - regtail - set the next-pointer at the end of a node chain of p to val.
6189 - SEE ALSO: regtail_study
6191 /* TODO: All three parms should be const */
6193 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6196 register regnode *scan;
6197 GET_RE_DEBUG_FLAGS_DECL;
6199 PERL_UNUSED_ARG(depth);
6205 /* Find last node. */
6208 regnode * const temp = regnext(scan);
6210 SV * const mysv=sv_newmortal();
6211 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6212 regprop(RExC_rx, mysv, scan);
6213 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6214 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6221 if (reg_off_by_arg[OP(scan)]) {
6222 ARG_SET(scan, val - scan);
6225 NEXT_OFF(scan) = val - scan;
6231 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6232 - Look for optimizable sequences at the same time.
6233 - currently only looks for EXACT chains.
6235 This is expermental code. The idea is to use this routine to perform
6236 in place optimizations on branches and groups as they are constructed,
6237 with the long term intention of removing optimization from study_chunk so
6238 that it is purely analytical.
6240 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6241 to control which is which.
6244 /* TODO: All four parms should be const */
6247 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6250 register regnode *scan;
6252 #ifdef EXPERIMENTAL_INPLACESCAN
6256 GET_RE_DEBUG_FLAGS_DECL;
6262 /* Find last node. */
6266 regnode * const temp = regnext(scan);
6267 #ifdef EXPERIMENTAL_INPLACESCAN
6268 if (PL_regkind[OP(scan)] == EXACT)
6269 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6277 if( exact == PSEUDO )
6279 else if ( exact != OP(scan) )
6288 SV * const mysv=sv_newmortal();
6289 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6290 regprop(RExC_rx, mysv, scan);
6291 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6292 SvPV_nolen_const(mysv),
6294 REG_NODE_NUM(scan));
6301 SV * const mysv_val=sv_newmortal();
6302 DEBUG_PARSE_MSG("");
6303 regprop(RExC_rx, mysv_val, val);
6304 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6305 SvPV_nolen_const(mysv_val),
6310 if (reg_off_by_arg[OP(scan)]) {
6311 ARG_SET(scan, val - scan);
6314 NEXT_OFF(scan) = val - scan;
6322 - regcurly - a little FSA that accepts {\d+,?\d*}
6325 S_regcurly(register const char *s)
6344 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6347 Perl_regdump(pTHX_ const regexp *r)
6351 SV * const sv = sv_newmortal();
6352 SV *dsv= sv_newmortal();
6354 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6356 /* Header fields of interest. */
6357 if (r->anchored_substr) {
6358 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6359 RE_SV_DUMPLEN(r->anchored_substr), 30);
6360 PerlIO_printf(Perl_debug_log,
6361 "anchored %s%s at %"IVdf" ",
6362 s, RE_SV_TAIL(r->anchored_substr),
6363 (IV)r->anchored_offset);
6364 } else if (r->anchored_utf8) {
6365 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6366 RE_SV_DUMPLEN(r->anchored_utf8), 30);
6367 PerlIO_printf(Perl_debug_log,
6368 "anchored utf8 %s%s at %"IVdf" ",
6369 s, RE_SV_TAIL(r->anchored_utf8),
6370 (IV)r->anchored_offset);
6372 if (r->float_substr) {
6373 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6374 RE_SV_DUMPLEN(r->float_substr), 30);
6375 PerlIO_printf(Perl_debug_log,
6376 "floating %s%s at %"IVdf"..%"UVuf" ",
6377 s, RE_SV_TAIL(r->float_substr),
6378 (IV)r->float_min_offset, (UV)r->float_max_offset);
6379 } else if (r->float_utf8) {
6380 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6381 RE_SV_DUMPLEN(r->float_utf8), 30);
6382 PerlIO_printf(Perl_debug_log,
6383 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6384 s, RE_SV_TAIL(r->float_utf8),
6385 (IV)r->float_min_offset, (UV)r->float_max_offset);
6387 if (r->check_substr || r->check_utf8)
6388 PerlIO_printf(Perl_debug_log,
6389 r->check_substr == r->float_substr
6390 && r->check_utf8 == r->float_utf8
6391 ? "(checking floating" : "(checking anchored");
6392 if (r->reganch & ROPT_NOSCAN)
6393 PerlIO_printf(Perl_debug_log, " noscan");
6394 if (r->reganch & ROPT_CHECK_ALL)
6395 PerlIO_printf(Perl_debug_log, " isall");
6396 if (r->check_substr || r->check_utf8)
6397 PerlIO_printf(Perl_debug_log, ") ");
6399 if (r->regstclass) {
6400 regprop(r, sv, r->regstclass);
6401 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6403 if (r->reganch & ROPT_ANCH) {
6404 PerlIO_printf(Perl_debug_log, "anchored");
6405 if (r->reganch & ROPT_ANCH_BOL)
6406 PerlIO_printf(Perl_debug_log, "(BOL)");
6407 if (r->reganch & ROPT_ANCH_MBOL)
6408 PerlIO_printf(Perl_debug_log, "(MBOL)");
6409 if (r->reganch & ROPT_ANCH_SBOL)
6410 PerlIO_printf(Perl_debug_log, "(SBOL)");
6411 if (r->reganch & ROPT_ANCH_GPOS)
6412 PerlIO_printf(Perl_debug_log, "(GPOS)");
6413 PerlIO_putc(Perl_debug_log, ' ');
6415 if (r->reganch & ROPT_GPOS_SEEN)
6416 PerlIO_printf(Perl_debug_log, "GPOS ");
6417 if (r->reganch & ROPT_SKIP)
6418 PerlIO_printf(Perl_debug_log, "plus ");
6419 if (r->reganch & ROPT_IMPLICIT)
6420 PerlIO_printf(Perl_debug_log, "implicit ");
6421 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6422 if (r->reganch & ROPT_EVAL_SEEN)
6423 PerlIO_printf(Perl_debug_log, "with eval ");
6424 PerlIO_printf(Perl_debug_log, "\n");
6426 PERL_UNUSED_CONTEXT;
6428 #endif /* DEBUGGING */
6432 - regprop - printable representation of opcode
6435 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6441 sv_setpvn(sv, "", 0);
6442 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6443 /* It would be nice to FAIL() here, but this may be called from
6444 regexec.c, and it would be hard to supply pRExC_state. */
6445 Perl_croak(aTHX_ "Corrupted regexp opcode");
6446 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6448 k = PL_regkind[OP(o)];
6451 SV * const dsv = sv_2mortal(newSVpvs(""));
6452 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6453 * is a crude hack but it may be the best for now since
6454 * we have no flag "this EXACTish node was UTF-8"
6456 const char * const s =
6457 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
6458 PL_colors[0], PL_colors[1],
6459 PERL_PV_ESCAPE_UNI_DETECT |
6460 PERL_PV_PRETTY_ELIPSES |
6463 Perl_sv_catpvf(aTHX_ sv, " %s", s );
6464 } else if (k == TRIE) {
6465 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6466 /* print the details of the trie in dumpuntil instead, as
6467 * prog->data isn't available here */
6468 } else if (k == CURLY) {
6469 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6470 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6471 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6473 else if (k == WHILEM && o->flags) /* Ordinal/of */
6474 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6475 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6476 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6477 else if (k == LOGICAL)
6478 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6479 else if (k == ANYOF) {
6480 int i, rangestart = -1;
6481 const U8 flags = ANYOF_FLAGS(o);
6483 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6484 static const char * const anyofs[] = {
6517 if (flags & ANYOF_LOCALE)
6518 sv_catpvs(sv, "{loc}");
6519 if (flags & ANYOF_FOLD)
6520 sv_catpvs(sv, "{i}");
6521 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6522 if (flags & ANYOF_INVERT)
6524 for (i = 0; i <= 256; i++) {
6525 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6526 if (rangestart == -1)
6528 } else if (rangestart != -1) {
6529 if (i <= rangestart + 3)
6530 for (; rangestart < i; rangestart++)
6531 put_byte(sv, rangestart);
6533 put_byte(sv, rangestart);
6535 put_byte(sv, i - 1);
6541 if (o->flags & ANYOF_CLASS)
6542 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6543 if (ANYOF_CLASS_TEST(o,i))
6544 sv_catpv(sv, anyofs[i]);
6546 if (flags & ANYOF_UNICODE)
6547 sv_catpvs(sv, "{unicode}");
6548 else if (flags & ANYOF_UNICODE_ALL)
6549 sv_catpvs(sv, "{unicode_all}");
6553 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6557 U8 s[UTF8_MAXBYTES_CASE+1];
6559 for (i = 0; i <= 256; i++) { /* just the first 256 */
6560 uvchr_to_utf8(s, i);
6562 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6563 if (rangestart == -1)
6565 } else if (rangestart != -1) {
6566 if (i <= rangestart + 3)
6567 for (; rangestart < i; rangestart++) {
6568 const U8 * const e = uvchr_to_utf8(s,rangestart);
6570 for(p = s; p < e; p++)
6574 const U8 *e = uvchr_to_utf8(s,rangestart);
6576 for (p = s; p < e; p++)
6579 e = uvchr_to_utf8(s, i-1);
6580 for (p = s; p < e; p++)
6587 sv_catpvs(sv, "..."); /* et cetera */
6591 char *s = savesvpv(lv);
6592 char * const origs = s;
6594 while (*s && *s != '\n')
6598 const char * const t = ++s;
6616 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6618 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6619 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6621 PERL_UNUSED_CONTEXT;
6622 PERL_UNUSED_ARG(sv);
6624 PERL_UNUSED_ARG(prog);
6625 #endif /* DEBUGGING */
6629 Perl_re_intuit_string(pTHX_ regexp *prog)
6630 { /* Assume that RE_INTUIT is set */
6632 GET_RE_DEBUG_FLAGS_DECL;
6633 PERL_UNUSED_CONTEXT;
6637 const char * const s = SvPV_nolen_const(prog->check_substr
6638 ? prog->check_substr : prog->check_utf8);
6640 if (!PL_colorset) reginitcolors();
6641 PerlIO_printf(Perl_debug_log,
6642 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6644 prog->check_substr ? "" : "utf8 ",
6645 PL_colors[5],PL_colors[0],
6648 (strlen(s) > 60 ? "..." : ""));
6651 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6655 Perl_pregfree(pTHX_ struct regexp *r)
6661 GET_RE_DEBUG_FLAGS_DECL;
6663 if (!r || (--r->refcnt > 0))
6669 SV *dsv= sv_newmortal();
6670 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
6671 dsv, r->precomp, r->prelen, 60);
6672 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
6673 PL_colors[4],PL_colors[5],s);
6677 /* gcov results gave these as non-null 100% of the time, so there's no
6678 optimisation in checking them before calling Safefree */
6679 Safefree(r->precomp);
6680 Safefree(r->offsets); /* 20010421 MJD */
6681 RX_MATCH_COPY_FREE(r);
6682 #ifdef PERL_OLD_COPY_ON_WRITE
6684 SvREFCNT_dec(r->saved_copy);
6687 if (r->anchored_substr)
6688 SvREFCNT_dec(r->anchored_substr);
6689 if (r->anchored_utf8)
6690 SvREFCNT_dec(r->anchored_utf8);
6691 if (r->float_substr)
6692 SvREFCNT_dec(r->float_substr);
6694 SvREFCNT_dec(r->float_utf8);
6695 Safefree(r->substrs);
6698 int n = r->data->count;
6699 PAD* new_comppad = NULL;
6704 /* If you add a ->what type here, update the comment in regcomp.h */
6705 switch (r->data->what[n]) {
6707 SvREFCNT_dec((SV*)r->data->data[n]);
6710 Safefree(r->data->data[n]);
6713 new_comppad = (AV*)r->data->data[n];
6716 if (new_comppad == NULL)
6717 Perl_croak(aTHX_ "panic: pregfree comppad");
6718 PAD_SAVE_LOCAL(old_comppad,
6719 /* Watch out for global destruction's random ordering. */
6720 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6723 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6726 op_free((OP_4tree*)r->data->data[n]);
6728 PAD_RESTORE_LOCAL(old_comppad);
6729 SvREFCNT_dec((SV*)new_comppad);
6735 { /* Aho Corasick add-on structure for a trie node.
6736 Used in stclass optimization only */
6738 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6740 refcount = --aho->refcount;
6743 Safefree(aho->states);
6744 Safefree(aho->fail);
6745 aho->trie=NULL; /* not necessary to free this as it is
6746 handled by the 't' case */
6747 Safefree(r->data->data[n]); /* do this last!!!! */
6748 Safefree(r->regstclass);
6754 /* trie structure. */
6756 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6758 refcount = --trie->refcount;
6761 Safefree(trie->charmap);
6762 if (trie->widecharmap)
6763 SvREFCNT_dec((SV*)trie->widecharmap);
6764 Safefree(trie->states);
6765 Safefree(trie->trans);
6767 Safefree(trie->bitmap);
6769 Safefree(trie->wordlen);
6773 SvREFCNT_dec((SV*)trie->words);
6774 if (trie->revcharmap)
6775 SvREFCNT_dec((SV*)trie->revcharmap);
6778 Safefree(r->data->data[n]); /* do this last!!!! */
6783 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6786 Safefree(r->data->what);
6789 Safefree(r->startp);
6794 #ifndef PERL_IN_XSUB_RE
6796 - regnext - dig the "next" pointer out of a node
6799 Perl_regnext(pTHX_ register regnode *p)
6802 register I32 offset;
6804 if (p == &PL_regdummy)
6807 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6816 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6819 STRLEN l1 = strlen(pat1);
6820 STRLEN l2 = strlen(pat2);
6823 const char *message;
6829 Copy(pat1, buf, l1 , char);
6830 Copy(pat2, buf + l1, l2 , char);
6831 buf[l1 + l2] = '\n';
6832 buf[l1 + l2 + 1] = '\0';
6834 /* ANSI variant takes additional second argument */
6835 va_start(args, pat2);
6839 msv = vmess(buf, &args);
6841 message = SvPV_const(msv,l1);
6844 Copy(message, buf, l1 , char);
6845 buf[l1-1] = '\0'; /* Overwrite \n */
6846 Perl_croak(aTHX_ "%s", buf);
6849 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6851 #ifndef PERL_IN_XSUB_RE
6853 Perl_save_re_context(pTHX)
6857 struct re_save_state *state;
6859 SAVEVPTR(PL_curcop);
6860 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6862 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6863 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6864 SSPUSHINT(SAVEt_RE_STATE);
6866 Copy(&PL_reg_state, state, 1, struct re_save_state);
6868 PL_reg_start_tmp = 0;
6869 PL_reg_start_tmpl = 0;
6870 PL_reg_oldsaved = NULL;
6871 PL_reg_oldsavedlen = 0;
6873 PL_reg_leftiter = 0;
6874 PL_reg_poscache = NULL;
6875 PL_reg_poscache_size = 0;
6876 #ifdef PERL_OLD_COPY_ON_WRITE
6880 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6882 const REGEXP * const rx = PM_GETRE(PL_curpm);
6885 for (i = 1; i <= rx->nparens; i++) {
6886 char digits[TYPE_CHARS(long)];
6887 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6888 GV *const *const gvp
6889 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6892 GV * const gv = *gvp;
6893 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6903 clear_re(pTHX_ void *r)
6906 ReREFCNT_dec((regexp *)r);
6912 S_put_byte(pTHX_ SV *sv, int c)
6914 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6915 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6916 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6917 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6919 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6922 #define CLEAR_OPTSTART \
6923 if (optstart) STMT_START { \
6924 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6928 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6930 STATIC const regnode *
6931 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6932 const regnode *last, SV* sv, I32 l)
6935 register U8 op = EXACT; /* Arbitrary non-END op. */
6936 register const regnode *next;
6937 const regnode *optstart= NULL;
6938 GET_RE_DEBUG_FLAGS_DECL;
6940 while (op != END && (!last || node < last)) {
6941 /* While that wasn't END last time... */
6947 next = regnext((regnode *)node);
6950 if (OP(node) == OPTIMIZED) {
6951 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
6958 regprop(r, sv, node);
6959 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6960 (int)(2*l + 1), "", SvPVX_const(sv));
6962 if (OP(node) != OPTIMIZED) {
6963 if (next == NULL) /* Next ptr. */
6964 PerlIO_printf(Perl_debug_log, "(0)");
6966 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6967 (void)PerlIO_putc(Perl_debug_log, '\n');
6971 if (PL_regkind[(U8)op] == BRANCHJ) {
6974 register const regnode *nnode = (OP(next) == LONGJMP
6975 ? regnext((regnode *)next)
6977 if (last && nnode > last)
6979 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6982 else if (PL_regkind[(U8)op] == BRANCH) {
6984 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6986 else if ( PL_regkind[(U8)op] == TRIE ) {
6987 const I32 n = ARG(node);
6988 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6989 const I32 arry_len = av_len(trie->words)+1;
6991 PerlIO_printf(Perl_debug_log,
6992 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
6996 TRIE_WORDCOUNT(trie),
6997 (int)TRIE_CHARCOUNT(trie),
6998 trie->uniquecharcount,
6999 (IV)TRIE_LASTSTATE(trie)-1,
7006 sv_setpvn(sv, "", 0);
7007 for (i = 0; i <= 256; i++) {
7008 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
7009 if (rangestart == -1)
7011 } else if (rangestart != -1) {
7012 if (i <= rangestart + 3)
7013 for (; rangestart < i; rangestart++)
7014 put_byte(sv, rangestart);
7016 put_byte(sv, rangestart);
7018 put_byte(sv, i - 1);
7023 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
7025 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
7027 for (word_idx=0; word_idx < arry_len; word_idx++) {
7028 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
7030 PerlIO_printf(Perl_debug_log, "%*s%s\n",
7032 pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
7033 PL_colors[0], PL_colors[1],
7034 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7035 PERL_PV_PRETTY_ELIPSES |
7041 node = NEXTOPER(node);
7042 node += regarglen[(U8)op];
7045 else if ( op == CURLY) { /* "next" might be very big: optimizer */
7046 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7047 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7049 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7051 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7054 else if ( op == PLUS || op == STAR) {
7055 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
7057 else if (op == ANYOF) {
7058 /* arglen 1 + class block */
7059 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7060 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7061 node = NEXTOPER(node);
7063 else if (PL_regkind[(U8)op] == EXACT) {
7064 /* Literal string, where present. */
7065 node += NODE_SZ_STR(node) - 1;
7066 node = NEXTOPER(node);
7069 node = NEXTOPER(node);
7070 node += regarglen[(U8)op];
7072 if (op == CURLYX || op == OPEN)
7074 else if (op == WHILEM)
7081 #endif /* DEBUGGING */
7085 * c-indentation-style: bsd
7087 * indent-tabs-mode: t
7090 * ex: set ts=8 sts=4 sw=4 noet: