5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
760 #define TRIE_READ_CHAR STMT_START { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
785 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
801 #define TRIE_LIST_NEW(state) STMT_START { \
802 Newxz( trie->states[ state ].trans.list, \
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
808 #define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
812 trie->wordlen[ curword ] = wordlen; \
813 trie->states[ state ].wordnum = ++curword; \
815 /* store the word for dumping */ \
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
820 tmp = newSVpvn( "", 0 ); \
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
825 NOOP; /* It's a dupe. So ignore it. */ \
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
835 These routines dump out a trie in a somewhat readable format.
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
852 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
855 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
909 PerlIO_printf( Perl_debug_log, "]");
912 PerlIO_printf( Perl_debug_log, "\n" );
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
923 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
932 for( state=1 ; state < next_alloc ; state ++ ) {
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
964 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
968 GET_RE_DEBUG_FLAGS_DECL;
971 print out the table precompression so that we can do a visual check
972 that they are identical.
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1013 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1023 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1025 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
1051 U32 base = trie->states[ 1 ].trans.base;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1055 GET_RE_DEBUG_FLAGS_DECL;
1057 ARG_SET( stclass, data_slot );
1058 Newxz( aho, 1, reg_ac_data );
1059 RExC_rx->data->data[ data_slot ] = (void*)aho;
1061 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1062 (trie->laststate+1)*sizeof(reg_trie_state));
1063 Newxz( q, numstates, U32);
1064 Newxz( aho->fail, numstates, U32 );
1067 fail[ 0 ] = fail[ 1 ] = 1;
1069 for ( charid = 0; charid < ucharcount ; charid++ ) {
1070 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1072 q[ q_write ] = newstate;
1073 /* set to point at the root */
1074 fail[ q[ q_write++ ] ]=1;
1077 while ( q_read < q_write) {
1078 const U32 cur = q[ q_read++ % numstates ];
1079 base = trie->states[ cur ].trans.base;
1081 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1082 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1084 U32 fail_state = cur;
1087 fail_state = fail[ fail_state ];
1088 fail_base = aho->states[ fail_state ].trans.base;
1089 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1091 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1092 fail[ ch_state ] = fail_state;
1093 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1095 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1097 q[ q_write++ % numstates] = ch_state;
1102 DEBUG_TRIE_COMPILE_MORE_r({
1103 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1104 for( q_read=2; q_read<numstates; q_read++ ) {
1105 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1107 PerlIO_printf(Perl_debug_log, "\n");
1110 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1116 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1119 /* first pass, loop through and scan words */
1120 reg_trie_data *trie;
1122 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1127 /* we just use folder as a flag in utf8 */
1128 const U8 * const folder = ( flags == EXACTF
1130 : ( flags == EXACTFL
1136 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1137 SV *re_trie_maxbuff;
1139 /* these are only used during construction but are useful during
1140 * debugging so we store them in the struct when debugging.
1141 * Wordcount is actually superfluous in debugging as we have
1142 * (AV*)trie->words to use for it, but that's not available when
1143 * not debugging... We could make the macro use the AV during
1144 * debugging though...
1146 U16 trie_wordcount=0;
1147 STRLEN trie_charcount=0;
1148 /*U32 trie_laststate=0;*/
1149 AV *trie_revcharmap;
1151 GET_RE_DEBUG_FLAGS_DECL;
1153 Newxz( trie, 1, reg_trie_data );
1155 trie->startstate = 1;
1156 RExC_rx->data->data[ data_slot ] = (void*)trie;
1157 Newxz( trie->charmap, 256, U16 );
1158 if (!(UTF && folder))
1159 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1161 trie->words = newAV();
1163 TRIE_REVCHARMAP(trie) = newAV();
1165 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1166 if (!SvIOK(re_trie_maxbuff)) {
1167 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1170 PerlIO_printf( Perl_debug_log,
1171 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1172 (int)depth * 2 + 2, "",
1173 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1174 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1176 /* -- First loop and Setup --
1178 We first traverse the branches and scan each word to determine if it
1179 contains widechars, and how many unique chars there are, this is
1180 important as we have to build a table with at least as many columns as we
1183 We use an array of integers to represent the character codes 0..255
1184 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1185 native representation of the character value as the key and IV's for the
1188 *TODO* If we keep track of how many times each character is used we can
1189 remap the columns so that the table compression later on is more
1190 efficient in terms of memory by ensuring most common value is in the
1191 middle and the least common are on the outside. IMO this would be better
1192 than a most to least common mapping as theres a decent chance the most
1193 common letter will share a node with the least common, meaning the node
1194 will not be compressable. With a middle is most common approach the worst
1195 case is when we have the least common nodes twice.
1199 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1200 regnode * const noper = NEXTOPER( cur );
1201 const U8 *uc = (U8*)STRING( noper );
1202 const U8 * const e = uc + STR_LEN( noper );
1204 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1205 const U8 *scan = (U8*)NULL;
1206 U32 wordlen = 0; /* required init */
1209 TRIE_WORDCOUNT(trie)++;
1210 if (OP(noper) == NOTHING) {
1215 TRIE_BITMAP_SET(trie,*uc);
1216 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1218 for ( ; uc < e ; uc += len ) {
1219 TRIE_CHARCOUNT(trie)++;
1223 if ( !trie->charmap[ uvc ] ) {
1224 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1226 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1231 if ( !trie->widecharmap )
1232 trie->widecharmap = newHV();
1234 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1237 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1239 if ( !SvTRUE( *svpp ) ) {
1240 sv_setiv( *svpp, ++trie->uniquecharcount );
1245 if( cur == first ) {
1248 } else if (chars < trie->minlen) {
1250 } else if (chars > trie->maxlen) {
1254 } /* end first pass */
1255 DEBUG_TRIE_COMPILE_r(
1256 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1257 (int)depth * 2 + 2,"",
1258 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1259 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1260 (int)trie->minlen, (int)trie->maxlen )
1262 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1265 We now know what we are dealing with in terms of unique chars and
1266 string sizes so we can calculate how much memory a naive
1267 representation using a flat table will take. If it's over a reasonable
1268 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1269 conservative but potentially much slower representation using an array
1272 At the end we convert both representations into the same compressed
1273 form that will be used in regexec.c for matching with. The latter
1274 is a form that cannot be used to construct with but has memory
1275 properties similar to the list form and access properties similar
1276 to the table form making it both suitable for fast searches and
1277 small enough that its feasable to store for the duration of a program.
1279 See the comment in the code where the compressed table is produced
1280 inplace from the flat tabe representation for an explanation of how
1281 the compression works.
1286 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1288 Second Pass -- Array Of Lists Representation
1290 Each state will be represented by a list of charid:state records
1291 (reg_trie_trans_le) the first such element holds the CUR and LEN
1292 points of the allocated array. (See defines above).
1294 We build the initial structure using the lists, and then convert
1295 it into the compressed table form which allows faster lookups
1296 (but cant be modified once converted).
1299 STRLEN transcount = 1;
1301 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1305 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1307 regnode * const noper = NEXTOPER( cur );
1308 U8 *uc = (U8*)STRING( noper );
1309 const U8 * const e = uc + STR_LEN( noper );
1310 U32 state = 1; /* required init */
1311 U16 charid = 0; /* sanity init */
1312 U8 *scan = (U8*)NULL; /* sanity init */
1313 STRLEN foldlen = 0; /* required init */
1314 U32 wordlen = 0; /* required init */
1315 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1317 if (OP(noper) != NOTHING) {
1318 for ( ; uc < e ; uc += len ) {
1323 charid = trie->charmap[ uvc ];
1325 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1329 charid=(U16)SvIV( *svpp );
1338 if ( !trie->states[ state ].trans.list ) {
1339 TRIE_LIST_NEW( state );
1341 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1342 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1343 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1348 newstate = next_alloc++;
1349 TRIE_LIST_PUSH( state, charid, newstate );
1354 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1356 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1359 TRIE_HANDLE_WORD(state);
1361 } /* end second pass */
1363 TRIE_LASTSTATE(trie) = next_alloc;
1364 Renew( trie->states, next_alloc, reg_trie_state );
1366 /* and now dump it out before we compress it */
1367 DEBUG_TRIE_COMPILE_MORE_r(
1368 dump_trie_interim_list(trie,next_alloc,depth+1)
1371 Newxz( trie->trans, transcount ,reg_trie_trans );
1378 for( state=1 ; state < next_alloc ; state ++ ) {
1382 DEBUG_TRIE_COMPILE_MORE_r(
1383 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1387 if (trie->states[state].trans.list) {
1388 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1392 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1393 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1394 if ( forid < minid ) {
1396 } else if ( forid > maxid ) {
1400 if ( transcount < tp + maxid - minid + 1) {
1402 Renew( trie->trans, transcount, reg_trie_trans );
1403 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1405 base = trie->uniquecharcount + tp - minid;
1406 if ( maxid == minid ) {
1408 for ( ; zp < tp ; zp++ ) {
1409 if ( ! trie->trans[ zp ].next ) {
1410 base = trie->uniquecharcount + zp - minid;
1411 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1412 trie->trans[ zp ].check = state;
1418 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ tp ].check = state;
1424 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1425 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1426 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1427 trie->trans[ tid ].check = state;
1429 tp += ( maxid - minid + 1 );
1431 Safefree(trie->states[ state ].trans.list);
1434 DEBUG_TRIE_COMPILE_MORE_r(
1435 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1438 trie->states[ state ].trans.base=base;
1440 trie->lasttrans = tp + 1;
1444 Second Pass -- Flat Table Representation.
1446 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1447 We know that we will need Charcount+1 trans at most to store the data
1448 (one row per char at worst case) So we preallocate both structures
1449 assuming worst case.
1451 We then construct the trie using only the .next slots of the entry
1454 We use the .check field of the first entry of the node temporarily to
1455 make compression both faster and easier by keeping track of how many non
1456 zero fields are in the node.
1458 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1461 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1462 number representing the first entry of the node, and state as a
1463 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1464 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1465 are 2 entrys per node. eg:
1473 The table is internally in the right hand, idx form. However as we also
1474 have to deal with the states array which is indexed by nodenum we have to
1475 use TRIE_NODENUM() to convert.
1480 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1482 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1483 next_alloc = trie->uniquecharcount + 1;
1486 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1488 regnode * const noper = NEXTOPER( cur );
1489 const U8 *uc = (U8*)STRING( noper );
1490 const U8 * const e = uc + STR_LEN( noper );
1492 U32 state = 1; /* required init */
1494 U16 charid = 0; /* sanity init */
1495 U32 accept_state = 0; /* sanity init */
1496 U8 *scan = (U8*)NULL; /* sanity init */
1498 STRLEN foldlen = 0; /* required init */
1499 U32 wordlen = 0; /* required init */
1500 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1502 if ( OP(noper) != NOTHING ) {
1503 for ( ; uc < e ; uc += len ) {
1508 charid = trie->charmap[ uvc ];
1510 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1511 charid = svpp ? (U16)SvIV(*svpp) : 0;
1515 if ( !trie->trans[ state + charid ].next ) {
1516 trie->trans[ state + charid ].next = next_alloc;
1517 trie->trans[ state ].check++;
1518 next_alloc += trie->uniquecharcount;
1520 state = trie->trans[ state + charid ].next;
1522 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1524 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1527 accept_state = TRIE_NODENUM( state );
1528 TRIE_HANDLE_WORD(accept_state);
1530 } /* end second pass */
1532 /* and now dump it out before we compress it */
1533 DEBUG_TRIE_COMPILE_MORE_r(
1534 dump_trie_interim_table(trie,next_alloc,depth+1)
1539 * Inplace compress the table.*
1541 For sparse data sets the table constructed by the trie algorithm will
1542 be mostly 0/FAIL transitions or to put it another way mostly empty.
1543 (Note that leaf nodes will not contain any transitions.)
1545 This algorithm compresses the tables by eliminating most such
1546 transitions, at the cost of a modest bit of extra work during lookup:
1548 - Each states[] entry contains a .base field which indicates the
1549 index in the state[] array wheres its transition data is stored.
1551 - If .base is 0 there are no valid transitions from that node.
1553 - If .base is nonzero then charid is added to it to find an entry in
1556 -If trans[states[state].base+charid].check!=state then the
1557 transition is taken to be a 0/Fail transition. Thus if there are fail
1558 transitions at the front of the node then the .base offset will point
1559 somewhere inside the previous nodes data (or maybe even into a node
1560 even earlier), but the .check field determines if the transition is
1563 The following process inplace converts the table to the compressed
1564 table: We first do not compress the root node 1,and mark its all its
1565 .check pointers as 1 and set its .base pointer as 1 as well. This
1566 allows to do a DFA construction from the compressed table later, and
1567 ensures that any .base pointers we calculate later are greater than
1570 - We set 'pos' to indicate the first entry of the second node.
1572 - We then iterate over the columns of the node, finding the first and
1573 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1574 and set the .check pointers accordingly, and advance pos
1575 appropriately and repreat for the next node. Note that when we copy
1576 the next pointers we have to convert them from the original
1577 NODEIDX form to NODENUM form as the former is not valid post
1580 - If a node has no transitions used we mark its base as 0 and do not
1581 advance the pos pointer.
1583 - If a node only has one transition we use a second pointer into the
1584 structure to fill in allocated fail transitions from other states.
1585 This pointer is independent of the main pointer and scans forward
1586 looking for null transitions that are allocated to a state. When it
1587 finds one it writes the single transition into the "hole". If the
1588 pointer doesnt find one the single transition is appeneded as normal.
1590 - Once compressed we can Renew/realloc the structures to release the
1593 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1594 specifically Fig 3.47 and the associated pseudocode.
1598 const U32 laststate = TRIE_NODENUM( next_alloc );
1601 TRIE_LASTSTATE(trie) = laststate;
1603 for ( state = 1 ; state < laststate ; state++ ) {
1605 const U32 stateidx = TRIE_NODEIDX( state );
1606 const U32 o_used = trie->trans[ stateidx ].check;
1607 U32 used = trie->trans[ stateidx ].check;
1608 trie->trans[ stateidx ].check = 0;
1610 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1611 if ( flag || trie->trans[ stateidx + charid ].next ) {
1612 if ( trie->trans[ stateidx + charid ].next ) {
1614 for ( ; zp < pos ; zp++ ) {
1615 if ( ! trie->trans[ zp ].next ) {
1619 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1620 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1621 trie->trans[ zp ].check = state;
1622 if ( ++zp > pos ) pos = zp;
1629 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1631 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1632 trie->trans[ pos ].check = state;
1637 trie->lasttrans = pos + 1;
1638 Renew( trie->states, laststate + 1, reg_trie_state);
1639 DEBUG_TRIE_COMPILE_MORE_r(
1640 PerlIO_printf( Perl_debug_log,
1641 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1642 (int)depth * 2 + 2,"",
1643 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1646 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1649 } /* end table compress */
1651 /* resize the trans array to remove unused space */
1652 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1654 /* and now dump out the compressed format */
1655 DEBUG_TRIE_COMPILE_r(
1656 dump_trie(trie,depth+1)
1659 { /* Modify the program and insert the new TRIE node*/
1661 U8 nodetype =(U8)(flags & 0xFF);
1668 This means we convert either the first branch or the first Exact,
1669 depending on whether the thing following (in 'last') is a branch
1670 or not and whther first is the startbranch (ie is it a sub part of
1671 the alternation or is it the whole thing.)
1672 Assuming its a sub part we conver the EXACT otherwise we convert
1673 the whole branch sequence, including the first.
1675 /* Find the node we are going to overwrite */
1676 if ( first == startbranch && OP( last ) != BRANCH ) {
1677 /* whole branch chain */
1680 const regnode *nop = NEXTOPER( convert );
1681 mjd_offset= Node_Offset((nop));
1682 mjd_nodelen= Node_Length((nop));
1685 /* branch sub-chain */
1686 convert = NEXTOPER( first );
1687 NEXT_OFF( first ) = (U16)(last - first);
1689 mjd_offset= Node_Offset((convert));
1690 mjd_nodelen= Node_Length((convert));
1694 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1695 (int)depth * 2 + 2, "",
1696 mjd_offset,mjd_nodelen)
1699 /* But first we check to see if there is a common prefix we can
1700 split out as an EXACT and put in front of the TRIE node. */
1701 trie->startstate= 1;
1702 if ( trie->bitmap && !trie->widecharmap ) {
1705 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1706 (int)depth * 2 + 2, "",
1707 TRIE_LASTSTATE(trie))
1709 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1713 const U32 base = trie->states[ state ].trans.base;
1715 if ( trie->states[state].wordnum )
1718 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1719 if ( ( base + ofs >= trie->uniquecharcount ) &&
1720 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1721 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1723 if ( ++count > 1 ) {
1724 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1725 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1726 if ( state == 1 ) break;
1728 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1730 PerlIO_printf(Perl_debug_log,
1731 "%*sNew Start State=%"UVuf" Class: [",
1732 (int)depth * 2 + 2, "",
1735 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1736 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1738 TRIE_BITMAP_SET(trie,*ch);
1740 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1742 PerlIO_printf(Perl_debug_log, (char*)ch)
1746 TRIE_BITMAP_SET(trie,*ch);
1748 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1755 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1756 const char *ch = SvPV_nolen_const( *tmp );
1758 PerlIO_printf( Perl_debug_log,
1759 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1760 (int)depth * 2 + 2, "",
1764 OP( convert ) = nodetype;
1765 str=STRING(convert);
1773 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1778 regnode *n = convert+NODE_SZ_STR(convert);
1779 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1780 trie->startstate = state;
1781 trie->minlen -= (state - 1);
1782 trie->maxlen -= (state - 1);
1784 regnode *fix = convert;
1786 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1787 while( ++fix < n ) {
1788 Set_Node_Offset_Length(fix, 0, 0);
1794 NEXT_OFF(convert) = (U16)(tail - convert);
1798 if ( trie->maxlen ) {
1799 OP( convert ) = TRIE;
1800 NEXT_OFF( convert ) = (U16)(tail - convert);
1801 ARG_SET( convert, data_slot );
1803 /* store the type in the flags */
1804 convert->flags = nodetype;
1805 /* XXX We really should free up the resource in trie now, as we wont use them */
1807 /* needed for dumping*/
1809 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1810 regnode *opt = convert;
1811 while (++opt<optimize) {
1812 Set_Node_Offset_Length(opt,0,0);
1814 /* We now need to mark all of the space originally used by the
1815 branches as optimized away. This keeps the dumpuntil from
1816 throwing a wobbly as it doesnt use regnext() to traverse the
1818 We also "fix" the offsets
1820 while( optimize < last ) {
1821 mjd_nodelen += Node_Length((optimize));
1822 OP( optimize ) = OPTIMIZED;
1823 Set_Node_Offset_Length(optimize,0,0);
1826 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1828 } /* end node insert */
1830 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1836 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1837 * These need to be revisited when a newer toolchain becomes available.
1839 #if defined(__sparc64__) && defined(__GNUC__)
1840 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1841 # undef SPARC64_GCC_WORKAROUND
1842 # define SPARC64_GCC_WORKAROUND 1
1846 #define DEBUG_PEEP(str,scan,depth) \
1847 DEBUG_OPTIMISE_r({ \
1848 SV * const mysv=sv_newmortal(); \
1849 regnode *Next = regnext(scan); \
1850 regprop(RExC_rx, mysv, scan); \
1851 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1852 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1853 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1856 #define JOIN_EXACT(scan,min,flags) \
1857 if (PL_regkind[OP(scan)] == EXACT) \
1858 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1861 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1862 /* Merge several consecutive EXACTish nodes into one. */
1863 regnode *n = regnext(scan);
1865 regnode *next = scan + NODE_SZ_STR(scan);
1869 regnode *stop = scan;
1871 GET_RE_DEBUG_FLAGS_DECL;
1872 DEBUG_PEEP("join",scan,depth);
1874 /* Skip NOTHING, merge EXACT*. */
1876 ( PL_regkind[OP(n)] == NOTHING ||
1877 (stringok && (OP(n) == OP(scan))))
1879 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1881 if (OP(n) == TAIL || n > next)
1883 if (PL_regkind[OP(n)] == NOTHING) {
1885 DEBUG_PEEP("skip:",n,depth);
1886 NEXT_OFF(scan) += NEXT_OFF(n);
1887 next = n + NODE_STEP_REGNODE;
1894 else if (stringok) {
1895 const int oldl = STR_LEN(scan);
1896 regnode * const nnext = regnext(n);
1898 DEBUG_PEEP("merg",n,depth);
1901 if (oldl + STR_LEN(n) > U8_MAX)
1903 NEXT_OFF(scan) += NEXT_OFF(n);
1904 STR_LEN(scan) += STR_LEN(n);
1905 next = n + NODE_SZ_STR(n);
1906 /* Now we can overwrite *n : */
1907 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1915 #ifdef EXPERIMENTAL_INPLACESCAN
1916 if (flags && !NEXT_OFF(n)) {
1917 DEBUG_PEEP("atch",val,depth);
1918 if (reg_off_by_arg[OP(n)]) {
1919 ARG_SET(n, val - n);
1922 NEXT_OFF(n) = val - n;
1929 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1931 Two problematic code points in Unicode casefolding of EXACT nodes:
1933 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1934 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1940 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1941 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1943 This means that in case-insensitive matching (or "loose matching",
1944 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1945 length of the above casefolded versions) can match a target string
1946 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1947 This would rather mess up the minimum length computation.
1949 What we'll do is to look for the tail four bytes, and then peek
1950 at the preceding two bytes to see whether we need to decrease
1951 the minimum length by four (six minus two).
1953 Thanks to the design of UTF-8, there cannot be false matches:
1954 A sequence of valid UTF-8 bytes cannot be a subsequence of
1955 another valid sequence of UTF-8 bytes.
1958 char * const s0 = STRING(scan), *s, *t;
1959 char * const s1 = s0 + STR_LEN(scan) - 1;
1960 char * const s2 = s1 - 4;
1961 const char t0[] = "\xcc\x88\xcc\x81";
1962 const char * const t1 = t0 + 3;
1965 s < s2 && (t = ninstr(s, s1, t0, t1));
1967 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1968 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1975 n = scan + NODE_SZ_STR(scan);
1977 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1984 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1988 /* REx optimizer. Converts nodes into quickier variants "in place".
1989 Finds fixed substrings. */
1991 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1992 to the position after last scanned or to NULL. */
1997 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1998 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1999 /* scanp: Start here (read-write). */
2000 /* deltap: Write maxlen-minlen here. */
2001 /* last: Stop before this one. */
2004 I32 min = 0, pars = 0, code;
2005 regnode *scan = *scanp, *next;
2007 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2008 int is_inf_internal = 0; /* The studied chunk is infinite */
2009 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2010 scan_data_t data_fake;
2011 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2012 SV *re_trie_maxbuff = NULL;
2014 GET_RE_DEBUG_FLAGS_DECL;
2016 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2019 while (scan && OP(scan) != END && scan < last) {
2020 /* Peephole optimizer: */
2021 DEBUG_PEEP("Peep",scan,depth);
2023 JOIN_EXACT(scan,&min,0);
2025 /* Follow the next-chain of the current node and optimize
2026 away all the NOTHINGs from it. */
2027 if (OP(scan) != CURLYX) {
2028 const int max = (reg_off_by_arg[OP(scan)]
2030 /* I32 may be smaller than U16 on CRAYs! */
2031 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2032 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2036 /* Skip NOTHING and LONGJMP. */
2037 while ((n = regnext(n))
2038 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2039 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2040 && off + noff < max)
2042 if (reg_off_by_arg[OP(scan)])
2045 NEXT_OFF(scan) = off;
2050 /* The principal pseudo-switch. Cannot be a switch, since we
2051 look into several different things. */
2052 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2053 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2054 next = regnext(scan);
2056 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2058 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2059 I32 max1 = 0, min1 = I32_MAX, num = 0;
2060 struct regnode_charclass_class accum;
2061 regnode * const startbranch=scan;
2063 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2064 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2065 if (flags & SCF_DO_STCLASS)
2066 cl_init_zero(pRExC_state, &accum);
2068 while (OP(scan) == code) {
2069 I32 deltanext, minnext, f = 0, fake;
2070 struct regnode_charclass_class this_class;
2073 data_fake.flags = 0;
2075 data_fake.whilem_c = data->whilem_c;
2076 data_fake.last_closep = data->last_closep;
2079 data_fake.last_closep = &fake;
2080 next = regnext(scan);
2081 scan = NEXTOPER(scan);
2083 scan = NEXTOPER(scan);
2084 if (flags & SCF_DO_STCLASS) {
2085 cl_init(pRExC_state, &this_class);
2086 data_fake.start_class = &this_class;
2087 f = SCF_DO_STCLASS_AND;
2089 if (flags & SCF_WHILEM_VISITED_POS)
2090 f |= SCF_WHILEM_VISITED_POS;
2092 /* we suppose the run is continuous, last=next...*/
2093 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2094 next, &data_fake, f,depth+1);
2097 if (max1 < minnext + deltanext)
2098 max1 = minnext + deltanext;
2099 if (deltanext == I32_MAX)
2100 is_inf = is_inf_internal = 1;
2102 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2105 if (data_fake.flags & SF_HAS_EVAL)
2106 data->flags |= SF_HAS_EVAL;
2107 data->whilem_c = data_fake.whilem_c;
2109 if (flags & SCF_DO_STCLASS)
2110 cl_or(pRExC_state, &accum, &this_class);
2111 if (code == SUSPEND)
2114 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2116 if (flags & SCF_DO_SUBSTR) {
2117 data->pos_min += min1;
2118 data->pos_delta += max1 - min1;
2119 if (max1 != min1 || is_inf)
2120 data->longest = &(data->longest_float);
2123 delta += max1 - min1;
2124 if (flags & SCF_DO_STCLASS_OR) {
2125 cl_or(pRExC_state, data->start_class, &accum);
2127 cl_and(data->start_class, &and_with);
2128 flags &= ~SCF_DO_STCLASS;
2131 else if (flags & SCF_DO_STCLASS_AND) {
2133 cl_and(data->start_class, &accum);
2134 flags &= ~SCF_DO_STCLASS;
2137 /* Switch to OR mode: cache the old value of
2138 * data->start_class */
2139 StructCopy(data->start_class, &and_with,
2140 struct regnode_charclass_class);
2141 flags &= ~SCF_DO_STCLASS_AND;
2142 StructCopy(&accum, data->start_class,
2143 struct regnode_charclass_class);
2144 flags |= SCF_DO_STCLASS_OR;
2145 data->start_class->flags |= ANYOF_EOS;
2151 Assuming this was/is a branch we are dealing with: 'scan' now
2152 points at the item that follows the branch sequence, whatever
2153 it is. We now start at the beginning of the sequence and look
2159 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2161 If we can find such a subseqence we need to turn the first
2162 element into a trie and then add the subsequent branch exact
2163 strings to the trie.
2167 1. patterns where the whole set of branch can be converted to a trie,
2169 2. patterns where only a subset of the alternations can be
2170 converted to a trie.
2172 In case 1 we can replace the whole set with a single regop
2173 for the trie. In case 2 we need to keep the start and end
2176 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2177 becomes BRANCH TRIE; BRANCH X;
2179 Hypthetically when we know the regex isnt anchored we can
2180 turn a case 1 into a DFA and let it rip... Every time it finds a match
2181 it would just call its tail, no WHILEM/CURLY needed.
2184 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2186 if (!re_trie_maxbuff) {
2187 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2188 if (!SvIOK(re_trie_maxbuff))
2189 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2191 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2193 regnode *first = (regnode *)NULL;
2194 regnode *last = (regnode *)NULL;
2195 regnode *tail = scan;
2200 SV * const mysv = sv_newmortal(); /* for dumping */
2202 /* var tail is used because there may be a TAIL
2203 regop in the way. Ie, the exacts will point to the
2204 thing following the TAIL, but the last branch will
2205 point at the TAIL. So we advance tail. If we
2206 have nested (?:) we may have to move through several
2210 while ( OP( tail ) == TAIL ) {
2211 /* this is the TAIL generated by (?:) */
2212 tail = regnext( tail );
2217 regprop(RExC_rx, mysv, tail );
2218 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2219 (int)depth * 2 + 2, "",
2220 "Looking for TRIE'able sequences. Tail node is: ",
2221 SvPV_nolen_const( mysv )
2227 step through the branches, cur represents each
2228 branch, noper is the first thing to be matched
2229 as part of that branch and noper_next is the
2230 regnext() of that node. if noper is an EXACT
2231 and noper_next is the same as scan (our current
2232 position in the regex) then the EXACT branch is
2233 a possible optimization target. Once we have
2234 two or more consequetive such branches we can
2235 create a trie of the EXACT's contents and stich
2236 it in place. If the sequence represents all of
2237 the branches we eliminate the whole thing and
2238 replace it with a single TRIE. If it is a
2239 subsequence then we need to stitch it in. This
2240 means the first branch has to remain, and needs
2241 to be repointed at the item on the branch chain
2242 following the last branch optimized. This could
2243 be either a BRANCH, in which case the
2244 subsequence is internal, or it could be the
2245 item following the branch sequence in which
2246 case the subsequence is at the end.
2250 /* dont use tail as the end marker for this traverse */
2251 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2252 regnode * const noper = NEXTOPER( cur );
2253 regnode * const noper_next = regnext( noper );
2256 regprop(RExC_rx, mysv, cur);
2257 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2258 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2260 regprop(RExC_rx, mysv, noper);
2261 PerlIO_printf( Perl_debug_log, " -> %s",
2262 SvPV_nolen_const(mysv));
2265 regprop(RExC_rx, mysv, noper_next );
2266 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2267 SvPV_nolen_const(mysv));
2269 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2270 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2272 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2273 : PL_regkind[ OP( noper ) ] == EXACT )
2274 || OP(noper) == NOTHING )
2275 && noper_next == tail && count<U16_MAX)
2278 if ( !first || optype == NOTHING ) {
2279 if (!first) first = cur;
2280 optype = OP( noper );
2286 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2288 if ( PL_regkind[ OP( noper ) ] == EXACT
2289 && noper_next == tail )
2293 optype = OP( noper );
2303 regprop(RExC_rx, mysv, cur);
2304 PerlIO_printf( Perl_debug_log,
2305 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2306 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2310 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2311 #ifdef TRIE_STUDY_OPT
2312 if ( made && startbranch == first ) {
2313 if ( OP(first)!=TRIE )
2314 flags |= SCF_EXACT_TRIE;
2316 regnode *chk=*scanp;
2317 while ( OP( chk ) == OPEN )
2318 chk = regnext( chk );
2320 flags |= SCF_EXACT_TRIE;
2329 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2330 scan = NEXTOPER(NEXTOPER(scan));
2331 } else /* single branch is optimized. */
2332 scan = NEXTOPER(scan);
2335 else if (OP(scan) == EXACT) {
2336 I32 l = STR_LEN(scan);
2339 const U8 * const s = (U8*)STRING(scan);
2340 l = utf8_length(s, s + l);
2341 uc = utf8_to_uvchr(s, NULL);
2343 uc = *((U8*)STRING(scan));
2346 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2347 /* The code below prefers earlier match for fixed
2348 offset, later match for variable offset. */
2349 if (data->last_end == -1) { /* Update the start info. */
2350 data->last_start_min = data->pos_min;
2351 data->last_start_max = is_inf
2352 ? I32_MAX : data->pos_min + data->pos_delta;
2354 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2356 SvUTF8_on(data->last_found);
2358 SV * const sv = data->last_found;
2359 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2360 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2361 if (mg && mg->mg_len >= 0)
2362 mg->mg_len += utf8_length((U8*)STRING(scan),
2363 (U8*)STRING(scan)+STR_LEN(scan));
2365 data->last_end = data->pos_min + l;
2366 data->pos_min += l; /* As in the first entry. */
2367 data->flags &= ~SF_BEFORE_EOL;
2369 if (flags & SCF_DO_STCLASS_AND) {
2370 /* Check whether it is compatible with what we know already! */
2374 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2375 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2376 && (!(data->start_class->flags & ANYOF_FOLD)
2377 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2380 ANYOF_CLASS_ZERO(data->start_class);
2381 ANYOF_BITMAP_ZERO(data->start_class);
2383 ANYOF_BITMAP_SET(data->start_class, uc);
2384 data->start_class->flags &= ~ANYOF_EOS;
2386 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2388 else if (flags & SCF_DO_STCLASS_OR) {
2389 /* false positive possible if the class is case-folded */
2391 ANYOF_BITMAP_SET(data->start_class, uc);
2393 data->start_class->flags |= ANYOF_UNICODE_ALL;
2394 data->start_class->flags &= ~ANYOF_EOS;
2395 cl_and(data->start_class, &and_with);
2397 flags &= ~SCF_DO_STCLASS;
2399 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2400 I32 l = STR_LEN(scan);
2401 UV uc = *((U8*)STRING(scan));
2403 /* Search for fixed substrings supports EXACT only. */
2404 if (flags & SCF_DO_SUBSTR) {
2406 scan_commit(pRExC_state, data);
2409 const U8 * const s = (U8 *)STRING(scan);
2410 l = utf8_length(s, s + l);
2411 uc = utf8_to_uvchr(s, NULL);
2414 if (flags & SCF_DO_SUBSTR)
2416 if (flags & SCF_DO_STCLASS_AND) {
2417 /* Check whether it is compatible with what we know already! */
2421 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2422 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2423 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2425 ANYOF_CLASS_ZERO(data->start_class);
2426 ANYOF_BITMAP_ZERO(data->start_class);
2428 ANYOF_BITMAP_SET(data->start_class, uc);
2429 data->start_class->flags &= ~ANYOF_EOS;
2430 data->start_class->flags |= ANYOF_FOLD;
2431 if (OP(scan) == EXACTFL)
2432 data->start_class->flags |= ANYOF_LOCALE;
2435 else if (flags & SCF_DO_STCLASS_OR) {
2436 if (data->start_class->flags & ANYOF_FOLD) {
2437 /* false positive possible if the class is case-folded.
2438 Assume that the locale settings are the same... */
2440 ANYOF_BITMAP_SET(data->start_class, uc);
2441 data->start_class->flags &= ~ANYOF_EOS;
2443 cl_and(data->start_class, &and_with);
2445 flags &= ~SCF_DO_STCLASS;
2447 #ifdef TRIE_STUDY_OPT
2448 else if (OP(scan) == TRIE) {
2449 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2450 min += trie->minlen;
2451 delta += (trie->maxlen - trie->minlen);
2452 flags &= ~SCF_DO_STCLASS; /* xxx */
2453 if (flags & SCF_DO_SUBSTR) {
2454 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2455 data->pos_min += trie->minlen;
2456 data->pos_delta += (trie->maxlen - trie->minlen);
2457 if (trie->maxlen != trie->minlen)
2458 data->longest = &(data->longest_float);
2462 else if (strchr((const char*)PL_varies,OP(scan))) {
2463 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2464 I32 f = flags, pos_before = 0;
2465 regnode * const oscan = scan;
2466 struct regnode_charclass_class this_class;
2467 struct regnode_charclass_class *oclass = NULL;
2468 I32 next_is_eval = 0;
2470 switch (PL_regkind[OP(scan)]) {
2471 case WHILEM: /* End of (?:...)* . */
2472 scan = NEXTOPER(scan);
2475 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2476 next = NEXTOPER(scan);
2477 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2479 maxcount = REG_INFTY;
2480 next = regnext(scan);
2481 scan = NEXTOPER(scan);
2485 if (flags & SCF_DO_SUBSTR)
2490 if (flags & SCF_DO_STCLASS) {
2492 maxcount = REG_INFTY;
2493 next = regnext(scan);
2494 scan = NEXTOPER(scan);
2497 is_inf = is_inf_internal = 1;
2498 scan = regnext(scan);
2499 if (flags & SCF_DO_SUBSTR) {
2500 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2501 data->longest = &(data->longest_float);
2503 goto optimize_curly_tail;
2505 mincount = ARG1(scan);
2506 maxcount = ARG2(scan);
2507 next = regnext(scan);
2508 if (OP(scan) == CURLYX) {
2509 I32 lp = (data ? *(data->last_closep) : 0);
2510 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2512 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2513 next_is_eval = (OP(scan) == EVAL);
2515 if (flags & SCF_DO_SUBSTR) {
2516 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2517 pos_before = data->pos_min;
2521 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2523 data->flags |= SF_IS_INF;
2525 if (flags & SCF_DO_STCLASS) {
2526 cl_init(pRExC_state, &this_class);
2527 oclass = data->start_class;
2528 data->start_class = &this_class;
2529 f |= SCF_DO_STCLASS_AND;
2530 f &= ~SCF_DO_STCLASS_OR;
2532 /* These are the cases when once a subexpression
2533 fails at a particular position, it cannot succeed
2534 even after backtracking at the enclosing scope.
2536 XXXX what if minimal match and we are at the
2537 initial run of {n,m}? */
2538 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2539 f &= ~SCF_WHILEM_VISITED_POS;
2541 /* This will finish on WHILEM, setting scan, or on NULL: */
2542 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2544 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2546 if (flags & SCF_DO_STCLASS)
2547 data->start_class = oclass;
2548 if (mincount == 0 || minnext == 0) {
2549 if (flags & SCF_DO_STCLASS_OR) {
2550 cl_or(pRExC_state, data->start_class, &this_class);
2552 else if (flags & SCF_DO_STCLASS_AND) {
2553 /* Switch to OR mode: cache the old value of
2554 * data->start_class */
2555 StructCopy(data->start_class, &and_with,
2556 struct regnode_charclass_class);
2557 flags &= ~SCF_DO_STCLASS_AND;
2558 StructCopy(&this_class, data->start_class,
2559 struct regnode_charclass_class);
2560 flags |= SCF_DO_STCLASS_OR;
2561 data->start_class->flags |= ANYOF_EOS;
2563 } else { /* Non-zero len */
2564 if (flags & SCF_DO_STCLASS_OR) {
2565 cl_or(pRExC_state, data->start_class, &this_class);
2566 cl_and(data->start_class, &and_with);
2568 else if (flags & SCF_DO_STCLASS_AND)
2569 cl_and(data->start_class, &this_class);
2570 flags &= ~SCF_DO_STCLASS;
2572 if (!scan) /* It was not CURLYX, but CURLY. */
2574 if ( /* ? quantifier ok, except for (?{ ... }) */
2575 (next_is_eval || !(mincount == 0 && maxcount == 1))
2576 && (minnext == 0) && (deltanext == 0)
2577 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2578 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2579 && ckWARN(WARN_REGEXP))
2582 "Quantifier unexpected on zero-length expression");
2585 min += minnext * mincount;
2586 is_inf_internal |= ((maxcount == REG_INFTY
2587 && (minnext + deltanext) > 0)
2588 || deltanext == I32_MAX);
2589 is_inf |= is_inf_internal;
2590 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2592 /* Try powerful optimization CURLYX => CURLYN. */
2593 if ( OP(oscan) == CURLYX && data
2594 && data->flags & SF_IN_PAR
2595 && !(data->flags & SF_HAS_EVAL)
2596 && !deltanext && minnext == 1 ) {
2597 /* Try to optimize to CURLYN. */
2598 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2599 regnode * const nxt1 = nxt;
2606 if (!strchr((const char*)PL_simple,OP(nxt))
2607 && !(PL_regkind[OP(nxt)] == EXACT
2608 && STR_LEN(nxt) == 1))
2614 if (OP(nxt) != CLOSE)
2616 /* Now we know that nxt2 is the only contents: */
2617 oscan->flags = (U8)ARG(nxt);
2619 OP(nxt1) = NOTHING; /* was OPEN. */
2621 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2622 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2623 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2624 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2625 OP(nxt + 1) = OPTIMIZED; /* was count. */
2626 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2631 /* Try optimization CURLYX => CURLYM. */
2632 if ( OP(oscan) == CURLYX && data
2633 && !(data->flags & SF_HAS_PAR)
2634 && !(data->flags & SF_HAS_EVAL)
2635 && !deltanext /* atom is fixed width */
2636 && minnext != 0 /* CURLYM can't handle zero width */
2638 /* XXXX How to optimize if data == 0? */
2639 /* Optimize to a simpler form. */
2640 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2644 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2645 && (OP(nxt2) != WHILEM))
2647 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2648 /* Need to optimize away parenths. */
2649 if (data->flags & SF_IN_PAR) {
2650 /* Set the parenth number. */
2651 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2653 if (OP(nxt) != CLOSE)
2654 FAIL("Panic opt close");
2655 oscan->flags = (U8)ARG(nxt);
2656 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2657 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2659 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2660 OP(nxt + 1) = OPTIMIZED; /* was count. */
2661 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2662 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2665 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2666 regnode *nnxt = regnext(nxt1);
2669 if (reg_off_by_arg[OP(nxt1)])
2670 ARG_SET(nxt1, nxt2 - nxt1);
2671 else if (nxt2 - nxt1 < U16_MAX)
2672 NEXT_OFF(nxt1) = nxt2 - nxt1;
2674 OP(nxt) = NOTHING; /* Cannot beautify */
2679 /* Optimize again: */
2680 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2686 else if ((OP(oscan) == CURLYX)
2687 && (flags & SCF_WHILEM_VISITED_POS)
2688 /* See the comment on a similar expression above.
2689 However, this time it not a subexpression
2690 we care about, but the expression itself. */
2691 && (maxcount == REG_INFTY)
2692 && data && ++data->whilem_c < 16) {
2693 /* This stays as CURLYX, we can put the count/of pair. */
2694 /* Find WHILEM (as in regexec.c) */
2695 regnode *nxt = oscan + NEXT_OFF(oscan);
2697 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2699 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2700 | (RExC_whilem_seen << 4)); /* On WHILEM */
2702 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2704 if (flags & SCF_DO_SUBSTR) {
2705 SV *last_str = NULL;
2706 int counted = mincount != 0;
2708 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2709 #if defined(SPARC64_GCC_WORKAROUND)
2712 const char *s = NULL;
2715 if (pos_before >= data->last_start_min)
2718 b = data->last_start_min;
2721 s = SvPV_const(data->last_found, l);
2722 old = b - data->last_start_min;
2725 I32 b = pos_before >= data->last_start_min
2726 ? pos_before : data->last_start_min;
2728 const char * const s = SvPV_const(data->last_found, l);
2729 I32 old = b - data->last_start_min;
2733 old = utf8_hop((U8*)s, old) - (U8*)s;
2736 /* Get the added string: */
2737 last_str = newSVpvn(s + old, l);
2739 SvUTF8_on(last_str);
2740 if (deltanext == 0 && pos_before == b) {
2741 /* What was added is a constant string */
2743 SvGROW(last_str, (mincount * l) + 1);
2744 repeatcpy(SvPVX(last_str) + l,
2745 SvPVX_const(last_str), l, mincount - 1);
2746 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2747 /* Add additional parts. */
2748 SvCUR_set(data->last_found,
2749 SvCUR(data->last_found) - l);
2750 sv_catsv(data->last_found, last_str);
2752 SV * sv = data->last_found;
2754 SvUTF8(sv) && SvMAGICAL(sv) ?
2755 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2756 if (mg && mg->mg_len >= 0)
2757 mg->mg_len += CHR_SVLEN(last_str);
2759 data->last_end += l * (mincount - 1);
2762 /* start offset must point into the last copy */
2763 data->last_start_min += minnext * (mincount - 1);
2764 data->last_start_max += is_inf ? I32_MAX
2765 : (maxcount - 1) * (minnext + data->pos_delta);
2768 /* It is counted once already... */
2769 data->pos_min += minnext * (mincount - counted);
2770 data->pos_delta += - counted * deltanext +
2771 (minnext + deltanext) * maxcount - minnext * mincount;
2772 if (mincount != maxcount) {
2773 /* Cannot extend fixed substrings found inside
2775 scan_commit(pRExC_state,data);
2776 if (mincount && last_str) {
2777 SV * const sv = data->last_found;
2778 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2779 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2783 sv_setsv(sv, last_str);
2784 data->last_end = data->pos_min;
2785 data->last_start_min =
2786 data->pos_min - CHR_SVLEN(last_str);
2787 data->last_start_max = is_inf
2789 : data->pos_min + data->pos_delta
2790 - CHR_SVLEN(last_str);
2792 data->longest = &(data->longest_float);
2794 SvREFCNT_dec(last_str);
2796 if (data && (fl & SF_HAS_EVAL))
2797 data->flags |= SF_HAS_EVAL;
2798 optimize_curly_tail:
2799 if (OP(oscan) != CURLYX) {
2800 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2802 NEXT_OFF(oscan) += NEXT_OFF(next);
2805 default: /* REF and CLUMP only? */
2806 if (flags & SCF_DO_SUBSTR) {
2807 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2808 data->longest = &(data->longest_float);
2810 is_inf = is_inf_internal = 1;
2811 if (flags & SCF_DO_STCLASS_OR)
2812 cl_anything(pRExC_state, data->start_class);
2813 flags &= ~SCF_DO_STCLASS;
2817 else if (strchr((const char*)PL_simple,OP(scan))) {
2820 if (flags & SCF_DO_SUBSTR) {
2821 scan_commit(pRExC_state,data);
2825 if (flags & SCF_DO_STCLASS) {
2826 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2828 /* Some of the logic below assumes that switching
2829 locale on will only add false positives. */
2830 switch (PL_regkind[OP(scan)]) {
2834 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2835 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2836 cl_anything(pRExC_state, data->start_class);
2839 if (OP(scan) == SANY)
2841 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2842 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2843 || (data->start_class->flags & ANYOF_CLASS));
2844 cl_anything(pRExC_state, data->start_class);
2846 if (flags & SCF_DO_STCLASS_AND || !value)
2847 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2850 if (flags & SCF_DO_STCLASS_AND)
2851 cl_and(data->start_class,
2852 (struct regnode_charclass_class*)scan);
2854 cl_or(pRExC_state, data->start_class,
2855 (struct regnode_charclass_class*)scan);
2858 if (flags & SCF_DO_STCLASS_AND) {
2859 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2860 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2861 for (value = 0; value < 256; value++)
2862 if (!isALNUM(value))
2863 ANYOF_BITMAP_CLEAR(data->start_class, value);
2867 if (data->start_class->flags & ANYOF_LOCALE)
2868 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2870 for (value = 0; value < 256; value++)
2872 ANYOF_BITMAP_SET(data->start_class, value);
2877 if (flags & SCF_DO_STCLASS_AND) {
2878 if (data->start_class->flags & ANYOF_LOCALE)
2879 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2882 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2883 data->start_class->flags |= ANYOF_LOCALE;
2887 if (flags & SCF_DO_STCLASS_AND) {
2888 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2889 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2890 for (value = 0; value < 256; value++)
2892 ANYOF_BITMAP_CLEAR(data->start_class, value);
2896 if (data->start_class->flags & ANYOF_LOCALE)
2897 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2899 for (value = 0; value < 256; value++)
2900 if (!isALNUM(value))
2901 ANYOF_BITMAP_SET(data->start_class, value);
2906 if (flags & SCF_DO_STCLASS_AND) {
2907 if (data->start_class->flags & ANYOF_LOCALE)
2908 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2911 data->start_class->flags |= ANYOF_LOCALE;
2912 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2916 if (flags & SCF_DO_STCLASS_AND) {
2917 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2918 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2919 for (value = 0; value < 256; value++)
2920 if (!isSPACE(value))
2921 ANYOF_BITMAP_CLEAR(data->start_class, value);
2925 if (data->start_class->flags & ANYOF_LOCALE)
2926 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2928 for (value = 0; value < 256; value++)
2930 ANYOF_BITMAP_SET(data->start_class, value);
2935 if (flags & SCF_DO_STCLASS_AND) {
2936 if (data->start_class->flags & ANYOF_LOCALE)
2937 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2940 data->start_class->flags |= ANYOF_LOCALE;
2941 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2945 if (flags & SCF_DO_STCLASS_AND) {
2946 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2947 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2948 for (value = 0; value < 256; value++)
2950 ANYOF_BITMAP_CLEAR(data->start_class, value);
2954 if (data->start_class->flags & ANYOF_LOCALE)
2955 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2957 for (value = 0; value < 256; value++)
2958 if (!isSPACE(value))
2959 ANYOF_BITMAP_SET(data->start_class, value);
2964 if (flags & SCF_DO_STCLASS_AND) {
2965 if (data->start_class->flags & ANYOF_LOCALE) {
2966 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2967 for (value = 0; value < 256; value++)
2968 if (!isSPACE(value))
2969 ANYOF_BITMAP_CLEAR(data->start_class, value);
2973 data->start_class->flags |= ANYOF_LOCALE;
2974 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2978 if (flags & SCF_DO_STCLASS_AND) {
2979 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2980 for (value = 0; value < 256; value++)
2981 if (!isDIGIT(value))
2982 ANYOF_BITMAP_CLEAR(data->start_class, value);
2985 if (data->start_class->flags & ANYOF_LOCALE)
2986 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2988 for (value = 0; value < 256; value++)
2990 ANYOF_BITMAP_SET(data->start_class, value);
2995 if (flags & SCF_DO_STCLASS_AND) {
2996 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2997 for (value = 0; value < 256; value++)
2999 ANYOF_BITMAP_CLEAR(data->start_class, value);
3002 if (data->start_class->flags & ANYOF_LOCALE)
3003 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3005 for (value = 0; value < 256; value++)
3006 if (!isDIGIT(value))
3007 ANYOF_BITMAP_SET(data->start_class, value);
3012 if (flags & SCF_DO_STCLASS_OR)
3013 cl_and(data->start_class, &and_with);
3014 flags &= ~SCF_DO_STCLASS;
3017 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3018 data->flags |= (OP(scan) == MEOL
3022 else if ( PL_regkind[OP(scan)] == BRANCHJ
3023 /* Lookbehind, or need to calculate parens/evals/stclass: */
3024 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3025 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3026 /* Lookahead/lookbehind */
3027 I32 deltanext, minnext, fake = 0;
3029 struct regnode_charclass_class intrnl;
3032 data_fake.flags = 0;
3034 data_fake.whilem_c = data->whilem_c;
3035 data_fake.last_closep = data->last_closep;
3038 data_fake.last_closep = &fake;
3039 if ( flags & SCF_DO_STCLASS && !scan->flags
3040 && OP(scan) == IFMATCH ) { /* Lookahead */
3041 cl_init(pRExC_state, &intrnl);
3042 data_fake.start_class = &intrnl;
3043 f |= SCF_DO_STCLASS_AND;
3045 if (flags & SCF_WHILEM_VISITED_POS)
3046 f |= SCF_WHILEM_VISITED_POS;
3047 next = regnext(scan);
3048 nscan = NEXTOPER(NEXTOPER(scan));
3049 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3052 vFAIL("Variable length lookbehind not implemented");
3054 else if (minnext > U8_MAX) {
3055 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3057 scan->flags = (U8)minnext;
3060 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3062 if (data_fake.flags & SF_HAS_EVAL)
3063 data->flags |= SF_HAS_EVAL;
3064 data->whilem_c = data_fake.whilem_c;
3066 if (f & SCF_DO_STCLASS_AND) {
3067 const int was = (data->start_class->flags & ANYOF_EOS);
3069 cl_and(data->start_class, &intrnl);
3071 data->start_class->flags |= ANYOF_EOS;
3074 else if (OP(scan) == OPEN) {
3077 else if (OP(scan) == CLOSE) {
3078 if ((I32)ARG(scan) == is_par) {
3079 next = regnext(scan);
3081 if ( next && (OP(next) != WHILEM) && next < last)
3082 is_par = 0; /* Disable optimization */
3085 *(data->last_closep) = ARG(scan);
3087 else if (OP(scan) == EVAL) {
3089 data->flags |= SF_HAS_EVAL;
3091 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3092 if (flags & SCF_DO_SUBSTR) {
3093 scan_commit(pRExC_state,data);
3094 data->longest = &(data->longest_float);
3096 is_inf = is_inf_internal = 1;
3097 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3098 cl_anything(pRExC_state, data->start_class);
3099 flags &= ~SCF_DO_STCLASS;
3101 /* Else: zero-length, ignore. */
3102 scan = regnext(scan);
3107 *deltap = is_inf_internal ? I32_MAX : delta;
3108 if (flags & SCF_DO_SUBSTR && is_inf)
3109 data->pos_delta = I32_MAX - data->pos_min;
3110 if (is_par > U8_MAX)
3112 if (is_par && pars==1 && data) {
3113 data->flags |= SF_IN_PAR;
3114 data->flags &= ~SF_HAS_PAR;
3116 else if (pars && data) {
3117 data->flags |= SF_HAS_PAR;
3118 data->flags &= ~SF_IN_PAR;
3120 if (flags & SCF_DO_STCLASS_OR)
3121 cl_and(data->start_class, &and_with);
3122 if (flags & SCF_EXACT_TRIE)
3123 data->flags |= SCF_EXACT_TRIE;
3128 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3130 if (RExC_rx->data) {
3131 Renewc(RExC_rx->data,
3132 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3133 char, struct reg_data);
3134 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3135 RExC_rx->data->count += n;
3138 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3139 char, struct reg_data);
3140 Newx(RExC_rx->data->what, n, U8);
3141 RExC_rx->data->count = n;
3143 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3144 return RExC_rx->data->count - n;
3147 #ifndef PERL_IN_XSUB_RE
3149 Perl_reginitcolors(pTHX)
3152 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3154 char *t = savepv(s);
3158 t = strchr(t, '\t');
3164 PL_colors[i] = t = (char *)"";
3169 PL_colors[i++] = (char *)"";
3177 - pregcomp - compile a regular expression into internal code
3179 * We can't allocate space until we know how big the compiled form will be,
3180 * but we can't compile it (and thus know how big it is) until we've got a
3181 * place to put the code. So we cheat: we compile it twice, once with code
3182 * generation turned off and size counting turned on, and once "for real".
3183 * This also means that we don't allocate space until we are sure that the
3184 * thing really will compile successfully, and we never have to move the
3185 * code and thus invalidate pointers into it. (Note that it has to be in
3186 * one piece because free() must be able to free it all.) [NB: not true in perl]
3188 * Beware that the optimization-preparation code in here knows about some
3189 * of the structure of the compiled regexp. [I'll say.]
3192 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3203 RExC_state_t RExC_state;
3204 RExC_state_t * const pRExC_state = &RExC_state;
3205 #ifdef TRIE_STUDY_OPT
3207 RExC_state_t copyRExC_state;
3210 GET_RE_DEBUG_FLAGS_DECL;
3213 FAIL("NULL regexp argument");
3215 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3218 DEBUG_r(if (!PL_colorset) reginitcolors());
3220 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3221 PL_colors[4],PL_colors[5],PL_colors[0],
3222 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3224 RExC_flags = pm->op_pmflags;
3228 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3229 RExC_seen_evals = 0;
3232 /* First pass: determine size, legality. */
3239 RExC_emit = &PL_regdummy;
3240 RExC_whilem_seen = 0;
3241 #if 0 /* REGC() is (currently) a NOP at the first pass.
3242 * Clever compilers notice this and complain. --jhi */
3243 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3245 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3246 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3247 RExC_precomp = NULL;
3250 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3251 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3252 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3255 RExC_lastparse=NULL;
3259 /* Small enough for pointer-storage convention?
3260 If extralen==0, this means that we will not need long jumps. */
3261 if (RExC_size >= 0x10000L && RExC_extralen)
3262 RExC_size += RExC_extralen;
3265 if (RExC_whilem_seen > 15)
3266 RExC_whilem_seen = 15;
3268 /* Allocate space and initialize. */
3269 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3272 FAIL("Regexp out of space");
3275 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3276 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3279 r->prelen = xend - exp;
3280 r->precomp = savepvn(RExC_precomp, r->prelen);
3282 #ifdef PERL_OLD_COPY_ON_WRITE
3283 r->saved_copy = NULL;
3285 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3286 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3287 r->lastparen = 0; /* mg.c reads this. */
3289 r->substrs = 0; /* Useful during FAIL. */
3290 r->startp = 0; /* Useful during FAIL. */
3291 r->endp = 0; /* Useful during FAIL. */
3293 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3295 r->offsets[0] = RExC_size;
3297 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3298 "%s %"UVuf" bytes for offset annotations.\n",
3299 r->offsets ? "Got" : "Couldn't get",
3300 (UV)((2*RExC_size+1) * sizeof(U32))));
3304 /* Second pass: emit code. */
3305 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3310 RExC_emit_start = r->program;
3311 RExC_emit = r->program;
3312 /* Store the count of eval-groups for security checks: */
3313 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3314 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3316 if (reg(pRExC_state, 0, &flags,1) == NULL)
3318 /* XXXX To minimize changes to RE engine we always allocate
3319 3-units-long substrs field. */
3320 Newx(r->substrs, 1, struct reg_substr_data);
3323 Zero(r->substrs, 1, struct reg_substr_data);
3324 StructCopy(&zero_scan_data, &data, scan_data_t);
3326 #ifdef TRIE_STUDY_OPT
3328 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3329 RExC_state=copyRExC_state;
3330 if (data.longest_fixed)
3331 SvREFCNT_dec(data.longest_fixed);
3332 if (data.longest_float)
3333 SvREFCNT_dec(data.longest_float);
3334 if (data.last_found)
3335 SvREFCNT_dec(data.last_found);
3337 copyRExC_state=RExC_state;
3340 /* Dig out information for optimizations. */
3341 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3342 pm->op_pmflags = RExC_flags;
3344 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3345 r->regstclass = NULL;
3346 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3347 r->reganch |= ROPT_NAUGHTY;
3348 scan = r->program + 1; /* First BRANCH. */
3350 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3351 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3353 STRLEN longest_float_length, longest_fixed_length;
3354 struct regnode_charclass_class ch_class; /* pointed to by data */
3356 I32 last_close = 0; /* pointed to by data */
3359 /* Skip introductions and multiplicators >= 1. */
3360 while ((OP(first) == OPEN && (sawopen = 1)) ||
3361 /* An OR of *one* alternative - should not happen now. */
3362 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3363 /* for now we can't handle lookbehind IFMATCH*/
3364 (OP(first) == IFMATCH && !first->flags) ||
3365 (OP(first) == PLUS) ||
3366 (OP(first) == MINMOD) ||
3367 /* An {n,m} with n>0 */
3368 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3370 DEBUG_PEEP("first:",first,0);
3371 if (OP(first) == PLUS)
3374 first += regarglen[OP(first)];
3375 if (OP(first) == IFMATCH) {
3376 first = NEXTOPER(first);
3377 first += EXTRA_STEP_2ARGS;
3378 } else /*xxx possible optimisation for /(?=)/*/
3379 first = NEXTOPER(first);
3382 /* Starting-point info. */
3384 /* Ignore EXACT as we deal with it later. */
3385 if (PL_regkind[OP(first)] == EXACT) {
3386 if (OP(first) == EXACT)
3387 NOOP; /* Empty, get anchored substr later. */
3388 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3389 r->regstclass = first;
3392 else if (OP(first) == TRIE &&
3393 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3395 /* this can happen only on restudy */
3396 struct regnode_1 *trie_op;
3397 Newxz(trie_op,1,struct regnode_1);
3398 StructCopy(first,trie_op,struct regnode_1);
3399 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3400 r->regstclass = (regnode *)trie_op;
3403 else if (strchr((const char*)PL_simple,OP(first)))
3404 r->regstclass = first;
3405 else if (PL_regkind[OP(first)] == BOUND ||
3406 PL_regkind[OP(first)] == NBOUND)
3407 r->regstclass = first;
3408 else if (PL_regkind[OP(first)] == BOL) {
3409 r->reganch |= (OP(first) == MBOL
3411 : (OP(first) == SBOL
3414 first = NEXTOPER(first);
3417 else if (OP(first) == GPOS) {
3418 r->reganch |= ROPT_ANCH_GPOS;
3419 first = NEXTOPER(first);
3422 else if (!sawopen && (OP(first) == STAR &&
3423 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3424 !(r->reganch & ROPT_ANCH) )
3426 /* turn .* into ^.* with an implied $*=1 */
3428 (OP(NEXTOPER(first)) == REG_ANY)
3431 r->reganch |= type | ROPT_IMPLICIT;
3432 first = NEXTOPER(first);
3435 if (sawplus && (!sawopen || !RExC_sawback)
3436 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3437 /* x+ must match at the 1st pos of run of x's */
3438 r->reganch |= ROPT_SKIP;
3440 /* Scan is after the zeroth branch, first is atomic matcher. */
3441 #ifdef TRIE_STUDY_OPT
3444 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3445 (IV)(first - scan + 1))
3449 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3450 (IV)(first - scan + 1))
3456 * If there's something expensive in the r.e., find the
3457 * longest literal string that must appear and make it the
3458 * regmust. Resolve ties in favor of later strings, since
3459 * the regstart check works with the beginning of the r.e.
3460 * and avoiding duplication strengthens checking. Not a
3461 * strong reason, but sufficient in the absence of others.
3462 * [Now we resolve ties in favor of the earlier string if
3463 * it happens that c_offset_min has been invalidated, since the
3464 * earlier string may buy us something the later one won't.]
3468 data.longest_fixed = newSVpvs("");
3469 data.longest_float = newSVpvs("");
3470 data.last_found = newSVpvs("");
3471 data.longest = &(data.longest_fixed);
3473 if (!r->regstclass) {
3474 cl_init(pRExC_state, &ch_class);
3475 data.start_class = &ch_class;
3476 stclass_flag = SCF_DO_STCLASS_AND;
3477 } else /* XXXX Check for BOUND? */
3479 data.last_closep = &last_close;
3481 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3482 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3484 #ifdef TRIE_STUDY_OPT
3485 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3490 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3491 && data.last_start_min == 0 && data.last_end > 0
3492 && !RExC_seen_zerolen
3493 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3494 r->reganch |= ROPT_CHECK_ALL;
3495 scan_commit(pRExC_state, &data);
3496 SvREFCNT_dec(data.last_found);
3498 longest_float_length = CHR_SVLEN(data.longest_float);
3499 if (longest_float_length
3500 || (data.flags & SF_FL_BEFORE_EOL
3501 && (!(data.flags & SF_FL_BEFORE_MEOL)
3502 || (RExC_flags & PMf_MULTILINE)))) {
3505 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3506 && data.offset_fixed == data.offset_float_min
3507 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3508 goto remove_float; /* As in (a)+. */
3510 if (SvUTF8(data.longest_float)) {
3511 r->float_utf8 = data.longest_float;
3512 r->float_substr = NULL;
3514 r->float_substr = data.longest_float;
3515 r->float_utf8 = NULL;
3517 r->float_min_offset = data.offset_float_min;
3518 r->float_max_offset = data.offset_float_max;
3519 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3520 && (!(data.flags & SF_FL_BEFORE_MEOL)
3521 || (RExC_flags & PMf_MULTILINE)));
3522 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3526 r->float_substr = r->float_utf8 = NULL;
3527 SvREFCNT_dec(data.longest_float);
3528 longest_float_length = 0;
3531 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3532 if (longest_fixed_length
3533 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3534 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3535 || (RExC_flags & PMf_MULTILINE)))) {
3538 if (SvUTF8(data.longest_fixed)) {
3539 r->anchored_utf8 = data.longest_fixed;
3540 r->anchored_substr = NULL;
3542 r->anchored_substr = data.longest_fixed;
3543 r->anchored_utf8 = NULL;
3545 r->anchored_offset = data.offset_fixed;
3546 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3547 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3548 || (RExC_flags & PMf_MULTILINE)));
3549 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3552 r->anchored_substr = r->anchored_utf8 = NULL;
3553 SvREFCNT_dec(data.longest_fixed);
3554 longest_fixed_length = 0;
3557 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3558 r->regstclass = NULL;
3559 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3561 && !(data.start_class->flags & ANYOF_EOS)
3562 && !cl_is_anything(data.start_class))
3564 const I32 n = add_data(pRExC_state, 1, "f");
3566 Newx(RExC_rx->data->data[n], 1,
3567 struct regnode_charclass_class);
3568 StructCopy(data.start_class,
3569 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3570 struct regnode_charclass_class);
3571 r->regstclass = (regnode*)RExC_rx->data->data[n];
3572 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3573 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3574 regprop(r, sv, (regnode*)data.start_class);
3575 PerlIO_printf(Perl_debug_log,
3576 "synthetic stclass \"%s\".\n",
3577 SvPVX_const(sv));});
3580 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3581 if (longest_fixed_length > longest_float_length) {
3582 r->check_substr = r->anchored_substr;
3583 r->check_utf8 = r->anchored_utf8;
3584 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3585 if (r->reganch & ROPT_ANCH_SINGLE)
3586 r->reganch |= ROPT_NOSCAN;
3589 r->check_substr = r->float_substr;
3590 r->check_utf8 = r->float_utf8;
3591 r->check_offset_min = data.offset_float_min;
3592 r->check_offset_max = data.offset_float_max;
3594 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3595 This should be changed ASAP! */
3596 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3597 r->reganch |= RE_USE_INTUIT;
3598 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3599 r->reganch |= RE_INTUIT_TAIL;
3603 /* Several toplevels. Best we can is to set minlen. */
3605 struct regnode_charclass_class ch_class;
3608 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3610 scan = r->program + 1;
3611 cl_init(pRExC_state, &ch_class);
3612 data.start_class = &ch_class;
3613 data.last_closep = &last_close;
3615 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3616 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3618 #ifdef TRIE_STUDY_OPT
3619 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3624 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3625 = r->float_substr = r->float_utf8 = NULL;
3626 if (!(data.start_class->flags & ANYOF_EOS)
3627 && !cl_is_anything(data.start_class))
3629 const I32 n = add_data(pRExC_state, 1, "f");
3631 Newx(RExC_rx->data->data[n], 1,
3632 struct regnode_charclass_class);
3633 StructCopy(data.start_class,
3634 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3635 struct regnode_charclass_class);
3636 r->regstclass = (regnode*)RExC_rx->data->data[n];
3637 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3638 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3639 regprop(r, sv, (regnode*)data.start_class);
3640 PerlIO_printf(Perl_debug_log,
3641 "synthetic stclass \"%s\".\n",
3642 SvPVX_const(sv));});
3647 if (RExC_seen & REG_SEEN_GPOS)
3648 r->reganch |= ROPT_GPOS_SEEN;
3649 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3650 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3651 if (RExC_seen & REG_SEEN_EVAL)
3652 r->reganch |= ROPT_EVAL_SEEN;
3653 if (RExC_seen & REG_SEEN_CANY)
3654 r->reganch |= ROPT_CANY_SEEN;
3655 Newxz(r->startp, RExC_npar, I32);
3656 Newxz(r->endp, RExC_npar, I32);
3658 DEBUG_r( RX_DEBUG_on(r) );
3660 PerlIO_printf(Perl_debug_log,"Final program:\n");
3663 DEBUG_OFFSETS_r(if (r->offsets) {
3664 const U32 len = r->offsets[0];
3666 GET_RE_DEBUG_FLAGS_DECL;
3667 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3668 for (i = 1; i <= len; i++) {
3669 if (r->offsets[i*2-1] || r->offsets[i*2])
3670 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3671 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3673 PerlIO_printf(Perl_debug_log, "\n");
3679 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3680 int rem=(int)(RExC_end - RExC_parse); \
3689 if (RExC_lastparse!=RExC_parse) \
3690 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3693 iscut ? "..." : "<" \
3696 PerlIO_printf(Perl_debug_log,"%16s",""); \
3701 num=REG_NODE_NUM(RExC_emit); \
3702 if (RExC_lastnum!=num) \
3703 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3705 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3706 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3707 (int)((depth*2)), "", \
3711 RExC_lastparse=RExC_parse; \
3716 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3717 DEBUG_PARSE_MSG((funcname)); \
3718 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3721 - reg - regular expression, i.e. main body or parenthesized thing
3723 * Caller must absorb opening parenthesis.
3725 * Combining parenthesis handling with the base level of regular expression
3726 * is a trifle forced, but the need to tie the tails of the branches to what
3727 * follows makes it hard to avoid.
3729 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3731 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3733 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3737 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3738 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3741 register regnode *ret; /* Will be the head of the group. */
3742 register regnode *br;
3743 register regnode *lastbr;
3744 register regnode *ender = NULL;
3745 register I32 parno = 0;
3747 const I32 oregflags = RExC_flags;
3748 bool have_branch = 0;
3751 /* for (?g), (?gc), and (?o) warnings; warning
3752 about (?c) will warn about (?g) -- japhy */
3754 #define WASTED_O 0x01
3755 #define WASTED_G 0x02
3756 #define WASTED_C 0x04
3757 #define WASTED_GC (0x02|0x04)
3758 I32 wastedflags = 0x00;
3760 char * parse_start = RExC_parse; /* MJD */
3761 char * const oregcomp_parse = RExC_parse;
3763 GET_RE_DEBUG_FLAGS_DECL;
3764 DEBUG_PARSE("reg ");
3767 *flagp = 0; /* Tentatively. */
3770 /* Make an OPEN node, if parenthesized. */
3772 if (*RExC_parse == '?') { /* (?...) */
3773 U32 posflags = 0, negflags = 0;
3774 U32 *flagsp = &posflags;
3775 bool is_logical = 0;
3776 const char * const seqstart = RExC_parse;
3779 paren = *RExC_parse++;
3780 ret = NULL; /* For look-ahead/behind. */
3782 case '<': /* (?<...) */
3783 RExC_seen |= REG_SEEN_LOOKBEHIND;
3784 if (*RExC_parse == '!')
3786 if (*RExC_parse != '=' && *RExC_parse != '!')
3789 case '=': /* (?=...) */
3790 case '!': /* (?!...) */
3791 RExC_seen_zerolen++;
3792 case ':': /* (?:...) */
3793 case '>': /* (?>...) */
3795 case '$': /* (?$...) */
3796 case '@': /* (?@...) */
3797 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3799 case '#': /* (?#...) */
3800 while (*RExC_parse && *RExC_parse != ')')
3802 if (*RExC_parse != ')')
3803 FAIL("Sequence (?#... not terminated");
3804 nextchar(pRExC_state);
3807 case 'p': /* (?p...) */
3808 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3809 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3811 case '?': /* (??...) */
3813 if (*RExC_parse != '{')
3815 paren = *RExC_parse++;
3817 case '{': /* (?{...}) */
3819 I32 count = 1, n = 0;
3821 char *s = RExC_parse;
3823 RExC_seen_zerolen++;
3824 RExC_seen |= REG_SEEN_EVAL;
3825 while (count && (c = *RExC_parse)) {
3836 if (*RExC_parse != ')') {
3838 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3842 OP_4tree *sop, *rop;
3843 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3846 Perl_save_re_context(aTHX);
3847 rop = sv_compile_2op(sv, &sop, "re", &pad);
3848 sop->op_private |= OPpREFCOUNTED;
3849 /* re_dup will OpREFCNT_inc */
3850 OpREFCNT_set(sop, 1);
3853 n = add_data(pRExC_state, 3, "nop");
3854 RExC_rx->data->data[n] = (void*)rop;
3855 RExC_rx->data->data[n+1] = (void*)sop;
3856 RExC_rx->data->data[n+2] = (void*)pad;
3859 else { /* First pass */
3860 if (PL_reginterp_cnt < ++RExC_seen_evals
3862 /* No compiled RE interpolated, has runtime
3863 components ===> unsafe. */
3864 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3865 if (PL_tainting && PL_tainted)
3866 FAIL("Eval-group in insecure regular expression");
3867 #if PERL_VERSION > 8
3868 if (IN_PERL_COMPILETIME)
3873 nextchar(pRExC_state);
3875 ret = reg_node(pRExC_state, LOGICAL);
3878 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3879 /* deal with the length of this later - MJD */
3882 ret = reganode(pRExC_state, EVAL, n);
3883 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3884 Set_Node_Offset(ret, parse_start);
3887 case '(': /* (?(?{...})...) and (?(?=...)...) */
3889 if (RExC_parse[0] == '?') { /* (?(?...)) */
3890 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3891 || RExC_parse[1] == '<'
3892 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3895 ret = reg_node(pRExC_state, LOGICAL);
3898 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3902 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3905 parno = atoi(RExC_parse++);
3907 while (isDIGIT(*RExC_parse))
3909 ret = reganode(pRExC_state, GROUPP, parno);
3911 if ((c = *nextchar(pRExC_state)) != ')')
3912 vFAIL("Switch condition not recognized");
3914 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3915 br = regbranch(pRExC_state, &flags, 1,depth+1);
3917 br = reganode(pRExC_state, LONGJMP, 0);
3919 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3920 c = *nextchar(pRExC_state);
3924 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3925 regbranch(pRExC_state, &flags, 1,depth+1);
3926 REGTAIL(pRExC_state, ret, lastbr);
3929 c = *nextchar(pRExC_state);
3934 vFAIL("Switch (?(condition)... contains too many branches");
3935 ender = reg_node(pRExC_state, TAIL);
3936 REGTAIL(pRExC_state, br, ender);
3938 REGTAIL(pRExC_state, lastbr, ender);
3939 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3942 REGTAIL(pRExC_state, ret, ender);
3946 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3950 RExC_parse--; /* for vFAIL to print correctly */
3951 vFAIL("Sequence (? incomplete");
3955 parse_flags: /* (?i) */
3956 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3957 /* (?g), (?gc) and (?o) are useless here
3958 and must be globally applied -- japhy */
3960 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3961 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3962 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3963 if (! (wastedflags & wflagbit) ) {
3964 wastedflags |= wflagbit;
3967 "Useless (%s%c) - %suse /%c modifier",
3968 flagsp == &negflags ? "?-" : "?",
3970 flagsp == &negflags ? "don't " : "",
3976 else if (*RExC_parse == 'c') {
3977 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3978 if (! (wastedflags & WASTED_C) ) {
3979 wastedflags |= WASTED_GC;
3982 "Useless (%sc) - %suse /gc modifier",
3983 flagsp == &negflags ? "?-" : "?",
3984 flagsp == &negflags ? "don't " : ""
3989 else { pmflag(flagsp, *RExC_parse); }
3993 if (*RExC_parse == '-') {
3995 wastedflags = 0; /* reset so (?g-c) warns twice */
3999 RExC_flags |= posflags;
4000 RExC_flags &= ~negflags;
4001 if (*RExC_parse == ':') {
4007 if (*RExC_parse != ')') {
4009 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4011 nextchar(pRExC_state);
4019 ret = reganode(pRExC_state, OPEN, parno);
4020 Set_Node_Length(ret, 1); /* MJD */
4021 Set_Node_Offset(ret, RExC_parse); /* MJD */
4028 /* Pick up the branches, linking them together. */
4029 parse_start = RExC_parse; /* MJD */
4030 br = regbranch(pRExC_state, &flags, 1,depth+1);
4031 /* branch_len = (paren != 0); */
4035 if (*RExC_parse == '|') {
4036 if (!SIZE_ONLY && RExC_extralen) {
4037 reginsert(pRExC_state, BRANCHJ, br);
4040 reginsert(pRExC_state, BRANCH, br);
4041 Set_Node_Length(br, paren != 0);
4042 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4046 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4048 else if (paren == ':') {
4049 *flagp |= flags&SIMPLE;
4051 if (is_open) { /* Starts with OPEN. */
4052 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4054 else if (paren != '?') /* Not Conditional */
4056 *flagp |= flags & (SPSTART | HASWIDTH);
4058 while (*RExC_parse == '|') {
4059 if (!SIZE_ONLY && RExC_extralen) {
4060 ender = reganode(pRExC_state, LONGJMP,0);
4061 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4064 RExC_extralen += 2; /* Account for LONGJMP. */
4065 nextchar(pRExC_state);
4066 br = regbranch(pRExC_state, &flags, 0, depth+1);
4070 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4074 *flagp |= flags&SPSTART;
4077 if (have_branch || paren != ':') {
4078 /* Make a closing node, and hook it on the end. */
4081 ender = reg_node(pRExC_state, TAIL);
4084 ender = reganode(pRExC_state, CLOSE, parno);
4085 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4086 Set_Node_Length(ender,1); /* MJD */
4092 *flagp &= ~HASWIDTH;
4095 ender = reg_node(pRExC_state, SUCCEED);
4098 ender = reg_node(pRExC_state, END);
4101 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4103 if (have_branch && !SIZE_ONLY) {
4104 /* Hook the tails of the branches to the closing node. */
4105 for (br = ret; br; br = regnext(br)) {
4106 const U8 op = PL_regkind[OP(br)];
4108 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4110 else if (op == BRANCHJ) {
4111 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4119 static const char parens[] = "=!<,>";
4121 if (paren && (p = strchr(parens, paren))) {
4122 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4123 int flag = (p - parens) > 1;
4126 node = SUSPEND, flag = 0;
4127 reginsert(pRExC_state, node,ret);
4128 Set_Node_Cur_Length(ret);
4129 Set_Node_Offset(ret, parse_start + 1);
4131 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4135 /* Check for proper termination. */
4137 RExC_flags = oregflags;
4138 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4139 RExC_parse = oregcomp_parse;
4140 vFAIL("Unmatched (");
4143 else if (!paren && RExC_parse < RExC_end) {
4144 if (*RExC_parse == ')') {
4146 vFAIL("Unmatched )");
4149 FAIL("Junk on end of regexp"); /* "Can't happen". */
4157 - regbranch - one alternative of an | operator
4159 * Implements the concatenation operator.
4162 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4165 register regnode *ret;
4166 register regnode *chain = NULL;
4167 register regnode *latest;
4168 I32 flags = 0, c = 0;
4169 GET_RE_DEBUG_FLAGS_DECL;
4170 DEBUG_PARSE("brnc");
4174 if (!SIZE_ONLY && RExC_extralen)
4175 ret = reganode(pRExC_state, BRANCHJ,0);
4177 ret = reg_node(pRExC_state, BRANCH);
4178 Set_Node_Length(ret, 1);
4182 if (!first && SIZE_ONLY)
4183 RExC_extralen += 1; /* BRANCHJ */
4185 *flagp = WORST; /* Tentatively. */
4188 nextchar(pRExC_state);
4189 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4191 latest = regpiece(pRExC_state, &flags,depth+1);
4192 if (latest == NULL) {
4193 if (flags & TRYAGAIN)
4197 else if (ret == NULL)
4199 *flagp |= flags&HASWIDTH;
4200 if (chain == NULL) /* First piece. */
4201 *flagp |= flags&SPSTART;
4204 REGTAIL(pRExC_state, chain, latest);
4209 if (chain == NULL) { /* Loop ran zero times. */
4210 chain = reg_node(pRExC_state, NOTHING);
4215 *flagp |= flags&SIMPLE;
4222 - regpiece - something followed by possible [*+?]
4224 * Note that the branching code sequences used for ? and the general cases
4225 * of * and + are somewhat optimized: they use the same NOTHING node as
4226 * both the endmarker for their branch list and the body of the last branch.
4227 * It might seem that this node could be dispensed with entirely, but the
4228 * endmarker role is not redundant.
4231 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4234 register regnode *ret;
4236 register char *next;
4238 const char * const origparse = RExC_parse;
4240 I32 max = REG_INFTY;
4242 GET_RE_DEBUG_FLAGS_DECL;
4243 DEBUG_PARSE("piec");
4245 ret = regatom(pRExC_state, &flags,depth+1);
4247 if (flags & TRYAGAIN)
4254 if (op == '{' && regcurly(RExC_parse)) {
4255 const char *maxpos = NULL;
4256 parse_start = RExC_parse; /* MJD */
4257 next = RExC_parse + 1;
4258 while (isDIGIT(*next) || *next == ',') {
4267 if (*next == '}') { /* got one */
4271 min = atoi(RExC_parse);
4275 maxpos = RExC_parse;
4277 if (!max && *maxpos != '0')
4278 max = REG_INFTY; /* meaning "infinity" */
4279 else if (max >= REG_INFTY)
4280 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4282 nextchar(pRExC_state);
4285 if ((flags&SIMPLE)) {
4286 RExC_naughty += 2 + RExC_naughty / 2;
4287 reginsert(pRExC_state, CURLY, ret);
4288 Set_Node_Offset(ret, parse_start+1); /* MJD */
4289 Set_Node_Cur_Length(ret);
4292 regnode * const w = reg_node(pRExC_state, WHILEM);
4295 REGTAIL(pRExC_state, ret, w);
4296 if (!SIZE_ONLY && RExC_extralen) {
4297 reginsert(pRExC_state, LONGJMP,ret);
4298 reginsert(pRExC_state, NOTHING,ret);
4299 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4301 reginsert(pRExC_state, CURLYX,ret);
4303 Set_Node_Offset(ret, parse_start+1);
4304 Set_Node_Length(ret,
4305 op == '{' ? (RExC_parse - parse_start) : 1);
4307 if (!SIZE_ONLY && RExC_extralen)
4308 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4309 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4311 RExC_whilem_seen++, RExC_extralen += 3;
4312 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4320 if (max && max < min)
4321 vFAIL("Can't do {n,m} with n > m");
4323 ARG1_SET(ret, (U16)min);
4324 ARG2_SET(ret, (U16)max);
4336 #if 0 /* Now runtime fix should be reliable. */
4338 /* if this is reinstated, don't forget to put this back into perldiag:
4340 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4342 (F) The part of the regexp subject to either the * or + quantifier
4343 could match an empty string. The {#} shows in the regular
4344 expression about where the problem was discovered.
4348 if (!(flags&HASWIDTH) && op != '?')
4349 vFAIL("Regexp *+ operand could be empty");
4352 parse_start = RExC_parse;
4353 nextchar(pRExC_state);
4355 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4357 if (op == '*' && (flags&SIMPLE)) {
4358 reginsert(pRExC_state, STAR, ret);
4362 else if (op == '*') {
4366 else if (op == '+' && (flags&SIMPLE)) {
4367 reginsert(pRExC_state, PLUS, ret);
4371 else if (op == '+') {
4375 else if (op == '?') {
4380 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4382 "%.*s matches null string many times",
4383 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4387 if (*RExC_parse == '?') {
4388 nextchar(pRExC_state);
4389 reginsert(pRExC_state, MINMOD, ret);
4390 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4392 if (ISMULT2(RExC_parse)) {
4394 vFAIL("Nested quantifiers");
4401 - regatom - the lowest level
4403 * Optimization: gobbles an entire sequence of ordinary characters so that
4404 * it can turn them into a single node, which is smaller to store and
4405 * faster to run. Backslashed characters are exceptions, each becoming a
4406 * separate node; the code is simpler that way and it's not worth fixing.
4408 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4409 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4412 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4415 register regnode *ret = NULL;
4417 char *parse_start = RExC_parse;
4418 GET_RE_DEBUG_FLAGS_DECL;
4419 DEBUG_PARSE("atom");
4420 *flagp = WORST; /* Tentatively. */
4423 switch (*RExC_parse) {
4425 RExC_seen_zerolen++;
4426 nextchar(pRExC_state);
4427 if (RExC_flags & PMf_MULTILINE)
4428 ret = reg_node(pRExC_state, MBOL);
4429 else if (RExC_flags & PMf_SINGLELINE)
4430 ret = reg_node(pRExC_state, SBOL);
4432 ret = reg_node(pRExC_state, BOL);
4433 Set_Node_Length(ret, 1); /* MJD */
4436 nextchar(pRExC_state);
4438 RExC_seen_zerolen++;
4439 if (RExC_flags & PMf_MULTILINE)
4440 ret = reg_node(pRExC_state, MEOL);
4441 else if (RExC_flags & PMf_SINGLELINE)
4442 ret = reg_node(pRExC_state, SEOL);
4444 ret = reg_node(pRExC_state, EOL);
4445 Set_Node_Length(ret, 1); /* MJD */
4448 nextchar(pRExC_state);
4449 if (RExC_flags & PMf_SINGLELINE)
4450 ret = reg_node(pRExC_state, SANY);
4452 ret = reg_node(pRExC_state, REG_ANY);
4453 *flagp |= HASWIDTH|SIMPLE;
4455 Set_Node_Length(ret, 1); /* MJD */
4459 char * const oregcomp_parse = ++RExC_parse;
4460 ret = regclass(pRExC_state,depth+1);
4461 if (*RExC_parse != ']') {
4462 RExC_parse = oregcomp_parse;
4463 vFAIL("Unmatched [");
4465 nextchar(pRExC_state);
4466 *flagp |= HASWIDTH|SIMPLE;
4467 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4471 nextchar(pRExC_state);
4472 ret = reg(pRExC_state, 1, &flags,depth+1);
4474 if (flags & TRYAGAIN) {
4475 if (RExC_parse == RExC_end) {
4476 /* Make parent create an empty node if needed. */
4484 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4488 if (flags & TRYAGAIN) {
4492 vFAIL("Internal urp");
4493 /* Supposed to be caught earlier. */
4496 if (!regcurly(RExC_parse)) {
4505 vFAIL("Quantifier follows nothing");
4508 switch (*++RExC_parse) {
4510 RExC_seen_zerolen++;
4511 ret = reg_node(pRExC_state, SBOL);
4513 nextchar(pRExC_state);
4514 Set_Node_Length(ret, 2); /* MJD */
4517 ret = reg_node(pRExC_state, GPOS);
4518 RExC_seen |= REG_SEEN_GPOS;
4520 nextchar(pRExC_state);
4521 Set_Node_Length(ret, 2); /* MJD */
4524 ret = reg_node(pRExC_state, SEOL);
4526 RExC_seen_zerolen++; /* Do not optimize RE away */
4527 nextchar(pRExC_state);
4530 ret = reg_node(pRExC_state, EOS);
4532 RExC_seen_zerolen++; /* Do not optimize RE away */
4533 nextchar(pRExC_state);
4534 Set_Node_Length(ret, 2); /* MJD */
4537 ret = reg_node(pRExC_state, CANY);
4538 RExC_seen |= REG_SEEN_CANY;
4539 *flagp |= HASWIDTH|SIMPLE;
4540 nextchar(pRExC_state);
4541 Set_Node_Length(ret, 2); /* MJD */
4544 ret = reg_node(pRExC_state, CLUMP);
4546 nextchar(pRExC_state);
4547 Set_Node_Length(ret, 2); /* MJD */
4550 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4551 *flagp |= HASWIDTH|SIMPLE;
4552 nextchar(pRExC_state);
4553 Set_Node_Length(ret, 2); /* MJD */
4556 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4557 *flagp |= HASWIDTH|SIMPLE;
4558 nextchar(pRExC_state);
4559 Set_Node_Length(ret, 2); /* MJD */
4562 RExC_seen_zerolen++;
4563 RExC_seen |= REG_SEEN_LOOKBEHIND;
4564 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4566 nextchar(pRExC_state);
4567 Set_Node_Length(ret, 2); /* MJD */
4570 RExC_seen_zerolen++;
4571 RExC_seen |= REG_SEEN_LOOKBEHIND;
4572 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4574 nextchar(pRExC_state);
4575 Set_Node_Length(ret, 2); /* MJD */
4578 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4579 *flagp |= HASWIDTH|SIMPLE;
4580 nextchar(pRExC_state);
4581 Set_Node_Length(ret, 2); /* MJD */
4584 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4585 *flagp |= HASWIDTH|SIMPLE;
4586 nextchar(pRExC_state);
4587 Set_Node_Length(ret, 2); /* MJD */
4590 ret = reg_node(pRExC_state, DIGIT);
4591 *flagp |= HASWIDTH|SIMPLE;
4592 nextchar(pRExC_state);
4593 Set_Node_Length(ret, 2); /* MJD */
4596 ret = reg_node(pRExC_state, NDIGIT);
4597 *flagp |= HASWIDTH|SIMPLE;
4598 nextchar(pRExC_state);
4599 Set_Node_Length(ret, 2); /* MJD */
4604 char* const oldregxend = RExC_end;
4605 char* parse_start = RExC_parse - 2;
4607 if (RExC_parse[1] == '{') {
4608 /* a lovely hack--pretend we saw [\pX] instead */
4609 RExC_end = strchr(RExC_parse, '}');
4611 const U8 c = (U8)*RExC_parse;
4613 RExC_end = oldregxend;
4614 vFAIL2("Missing right brace on \\%c{}", c);
4619 RExC_end = RExC_parse + 2;
4620 if (RExC_end > oldregxend)
4621 RExC_end = oldregxend;
4625 ret = regclass(pRExC_state,depth+1);
4627 RExC_end = oldregxend;
4630 Set_Node_Offset(ret, parse_start + 2);
4631 Set_Node_Cur_Length(ret);
4632 nextchar(pRExC_state);
4633 *flagp |= HASWIDTH|SIMPLE;
4646 case '1': case '2': case '3': case '4':
4647 case '5': case '6': case '7': case '8': case '9':
4649 const I32 num = atoi(RExC_parse);
4651 if (num > 9 && num >= RExC_npar)
4654 char * const parse_start = RExC_parse - 1; /* MJD */
4655 while (isDIGIT(*RExC_parse))
4658 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4659 vFAIL("Reference to nonexistent group");
4661 ret = reganode(pRExC_state,
4662 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4666 /* override incorrect value set in reganode MJD */
4667 Set_Node_Offset(ret, parse_start+1);
4668 Set_Node_Cur_Length(ret); /* MJD */
4670 nextchar(pRExC_state);
4675 if (RExC_parse >= RExC_end)
4676 FAIL("Trailing \\");
4679 /* Do not generate "unrecognized" warnings here, we fall
4680 back into the quick-grab loop below */
4687 if (RExC_flags & PMf_EXTENDED) {
4688 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4690 if (RExC_parse < RExC_end)
4696 register STRLEN len;
4701 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4703 parse_start = RExC_parse - 1;
4709 ret = reg_node(pRExC_state,
4710 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4712 for (len = 0, p = RExC_parse - 1;
4713 len < 127 && p < RExC_end;
4716 char * const oldp = p;
4718 if (RExC_flags & PMf_EXTENDED)
4719 p = regwhite(p, RExC_end);
4766 ender = ASCII_TO_NATIVE('\033');
4770 ender = ASCII_TO_NATIVE('\007');
4775 char* const e = strchr(p, '}');
4779 vFAIL("Missing right brace on \\x{}");
4782 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4783 | PERL_SCAN_DISALLOW_PREFIX;
4784 STRLEN numlen = e - p - 1;
4785 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4792 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4794 ender = grok_hex(p, &numlen, &flags, NULL);
4800 ender = UCHARAT(p++);
4801 ender = toCTRL(ender);
4803 case '0': case '1': case '2': case '3':case '4':
4804 case '5': case '6': case '7': case '8':case '9':
4806 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4809 ender = grok_oct(p, &numlen, &flags, NULL);
4819 FAIL("Trailing \\");
4822 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4823 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4824 goto normal_default;
4829 if (UTF8_IS_START(*p) && UTF) {
4831 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4832 &numlen, UTF8_ALLOW_DEFAULT);
4839 if (RExC_flags & PMf_EXTENDED)
4840 p = regwhite(p, RExC_end);
4842 /* Prime the casefolded buffer. */
4843 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4845 if (ISMULT2(p)) { /* Back off on ?+*. */
4850 /* Emit all the Unicode characters. */
4852 for (foldbuf = tmpbuf;
4854 foldlen -= numlen) {
4855 ender = utf8_to_uvchr(foldbuf, &numlen);
4857 const STRLEN unilen = reguni(pRExC_state, ender, s);
4860 /* In EBCDIC the numlen
4861 * and unilen can differ. */
4863 if (numlen >= foldlen)
4867 break; /* "Can't happen." */
4871 const STRLEN unilen = reguni(pRExC_state, ender, s);
4880 REGC((char)ender, s++);
4886 /* Emit all the Unicode characters. */
4888 for (foldbuf = tmpbuf;
4890 foldlen -= numlen) {
4891 ender = utf8_to_uvchr(foldbuf, &numlen);
4893 const STRLEN unilen = reguni(pRExC_state, ender, s);
4896 /* In EBCDIC the numlen
4897 * and unilen can differ. */
4899 if (numlen >= foldlen)
4907 const STRLEN unilen = reguni(pRExC_state, ender, s);
4916 REGC((char)ender, s++);
4920 Set_Node_Cur_Length(ret); /* MJD */
4921 nextchar(pRExC_state);
4923 /* len is STRLEN which is unsigned, need to copy to signed */
4926 vFAIL("Internal disaster");
4930 if (len == 1 && UNI_IS_INVARIANT(ender))
4934 RExC_size += STR_SZ(len);
4937 RExC_emit += STR_SZ(len);
4943 /* If the encoding pragma is in effect recode the text of
4944 * any EXACT-kind nodes. */
4945 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4946 const STRLEN oldlen = STR_LEN(ret);
4947 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4951 if (sv_utf8_downgrade(sv, TRUE)) {
4952 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4953 const STRLEN newlen = SvCUR(sv);
4958 GET_RE_DEBUG_FLAGS_DECL;
4959 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4960 (int)oldlen, STRING(ret),
4962 Copy(s, STRING(ret), newlen, char);
4963 STR_LEN(ret) += newlen - oldlen;
4964 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4966 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4974 S_regwhite(char *p, const char *e)
4979 else if (*p == '#') {
4982 } while (p < e && *p != '\n');
4990 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4991 Character classes ([:foo:]) can also be negated ([:^foo:]).
4992 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4993 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4994 but trigger failures because they are currently unimplemented. */
4996 #define POSIXCC_DONE(c) ((c) == ':')
4997 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4998 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5001 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5004 I32 namedclass = OOB_NAMEDCLASS;
5006 if (value == '[' && RExC_parse + 1 < RExC_end &&
5007 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5008 POSIXCC(UCHARAT(RExC_parse))) {
5009 const char c = UCHARAT(RExC_parse);
5010 char* const s = RExC_parse++;
5012 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5014 if (RExC_parse == RExC_end)
5015 /* Grandfather lone [:, [=, [. */
5018 const char* const t = RExC_parse++; /* skip over the c */
5021 if (UCHARAT(RExC_parse) == ']') {
5022 const char *posixcc = s + 1;
5023 RExC_parse++; /* skip over the ending ] */
5026 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5027 const I32 skip = t - posixcc;
5029 /* Initially switch on the length of the name. */
5032 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5033 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5036 /* Names all of length 5. */
5037 /* alnum alpha ascii blank cntrl digit graph lower
5038 print punct space upper */
5039 /* Offset 4 gives the best switch position. */
5040 switch (posixcc[4]) {
5042 if (memEQ(posixcc, "alph", 4)) /* alpha */
5043 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5046 if (memEQ(posixcc, "spac", 4)) /* space */
5047 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5050 if (memEQ(posixcc, "grap", 4)) /* graph */
5051 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5054 if (memEQ(posixcc, "asci", 4)) /* ascii */
5055 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5058 if (memEQ(posixcc, "blan", 4)) /* blank */
5059 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5062 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5063 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5066 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5067 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5070 if (memEQ(posixcc, "lowe", 4)) /* lower */
5071 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5072 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5073 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5076 if (memEQ(posixcc, "digi", 4)) /* digit */
5077 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5078 else if (memEQ(posixcc, "prin", 4)) /* print */
5079 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5080 else if (memEQ(posixcc, "punc", 4)) /* punct */
5081 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5086 if (memEQ(posixcc, "xdigit", 6))
5087 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5091 if (namedclass == OOB_NAMEDCLASS)
5092 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5094 assert (posixcc[skip] == ':');
5095 assert (posixcc[skip+1] == ']');
5096 } else if (!SIZE_ONLY) {
5097 /* [[=foo=]] and [[.foo.]] are still future. */
5099 /* adjust RExC_parse so the warning shows after
5101 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5103 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5106 /* Maternal grandfather:
5107 * "[:" ending in ":" but not in ":]" */
5117 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5120 if (POSIXCC(UCHARAT(RExC_parse))) {
5121 const char *s = RExC_parse;
5122 const char c = *s++;
5126 if (*s && c == *s && s[1] == ']') {
5127 if (ckWARN(WARN_REGEXP))
5129 "POSIX syntax [%c %c] belongs inside character classes",
5132 /* [[=foo=]] and [[.foo.]] are still future. */
5133 if (POSIXCC_NOTYET(c)) {
5134 /* adjust RExC_parse so the error shows after
5136 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5138 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5146 parse a class specification and produce either an ANYOF node that
5147 matches the pattern. If the pattern matches a single char only and
5148 that char is < 256 then we produce an EXACT node instead.
5151 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5155 register UV nextvalue;
5156 register IV prevvalue = OOB_UNICODE;
5157 register IV range = 0;
5158 register regnode *ret;
5161 char *rangebegin = NULL;
5162 bool need_class = 0;
5165 bool optimize_invert = TRUE;
5166 AV* unicode_alternate = NULL;
5168 UV literal_endpoint = 0;
5170 UV stored = 0; /* number of chars stored in the class */
5172 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5173 case we need to change the emitted regop to an EXACT. */
5174 const char * orig_parse = RExC_parse;
5175 GET_RE_DEBUG_FLAGS_DECL;
5176 DEBUG_PARSE("clas");
5178 /* Assume we are going to generate an ANYOF node. */
5179 ret = reganode(pRExC_state, ANYOF, 0);
5182 ANYOF_FLAGS(ret) = 0;
5184 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5188 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5192 RExC_size += ANYOF_SKIP;
5193 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5196 RExC_emit += ANYOF_SKIP;
5198 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5200 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5201 ANYOF_BITMAP_ZERO(ret);
5202 listsv = newSVpvs("# comment\n");
5205 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5207 if (!SIZE_ONLY && POSIXCC(nextvalue))
5208 checkposixcc(pRExC_state);
5210 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5211 if (UCHARAT(RExC_parse) == ']')
5214 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5218 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5221 rangebegin = RExC_parse;
5223 value = utf8n_to_uvchr((U8*)RExC_parse,
5224 RExC_end - RExC_parse,
5225 &numlen, UTF8_ALLOW_DEFAULT);
5226 RExC_parse += numlen;
5229 value = UCHARAT(RExC_parse++);
5231 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5232 if (value == '[' && POSIXCC(nextvalue))
5233 namedclass = regpposixcc(pRExC_state, value);
5234 else if (value == '\\') {
5236 value = utf8n_to_uvchr((U8*)RExC_parse,
5237 RExC_end - RExC_parse,
5238 &numlen, UTF8_ALLOW_DEFAULT);
5239 RExC_parse += numlen;
5242 value = UCHARAT(RExC_parse++);
5243 /* Some compilers cannot handle switching on 64-bit integer
5244 * values, therefore value cannot be an UV. Yes, this will
5245 * be a problem later if we want switch on Unicode.
5246 * A similar issue a little bit later when switching on
5247 * namedclass. --jhi */
5248 switch ((I32)value) {
5249 case 'w': namedclass = ANYOF_ALNUM; break;
5250 case 'W': namedclass = ANYOF_NALNUM; break;
5251 case 's': namedclass = ANYOF_SPACE; break;
5252 case 'S': namedclass = ANYOF_NSPACE; break;
5253 case 'd': namedclass = ANYOF_DIGIT; break;
5254 case 'D': namedclass = ANYOF_NDIGIT; break;
5259 if (RExC_parse >= RExC_end)
5260 vFAIL2("Empty \\%c{}", (U8)value);
5261 if (*RExC_parse == '{') {
5262 const U8 c = (U8)value;
5263 e = strchr(RExC_parse++, '}');
5265 vFAIL2("Missing right brace on \\%c{}", c);
5266 while (isSPACE(UCHARAT(RExC_parse)))
5268 if (e == RExC_parse)
5269 vFAIL2("Empty \\%c{}", c);
5271 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5279 if (UCHARAT(RExC_parse) == '^') {
5282 value = value == 'p' ? 'P' : 'p'; /* toggle */
5283 while (isSPACE(UCHARAT(RExC_parse))) {
5288 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5289 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5292 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5293 namedclass = ANYOF_MAX; /* no official name, but it's named */
5296 case 'n': value = '\n'; break;
5297 case 'r': value = '\r'; break;
5298 case 't': value = '\t'; break;
5299 case 'f': value = '\f'; break;
5300 case 'b': value = '\b'; break;
5301 case 'e': value = ASCII_TO_NATIVE('\033');break;
5302 case 'a': value = ASCII_TO_NATIVE('\007');break;
5304 if (*RExC_parse == '{') {
5305 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5306 | PERL_SCAN_DISALLOW_PREFIX;
5307 char * const e = strchr(RExC_parse++, '}');
5309 vFAIL("Missing right brace on \\x{}");
5311 numlen = e - RExC_parse;
5312 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5316 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5318 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5319 RExC_parse += numlen;
5323 value = UCHARAT(RExC_parse++);
5324 value = toCTRL(value);
5326 case '0': case '1': case '2': case '3': case '4':
5327 case '5': case '6': case '7': case '8': case '9':
5331 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5332 RExC_parse += numlen;
5336 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5338 "Unrecognized escape \\%c in character class passed through",
5342 } /* end of \blah */
5348 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5350 if (!SIZE_ONLY && !need_class)
5351 ANYOF_CLASS_ZERO(ret);
5355 /* a bad range like a-\d, a-[:digit:] ? */
5358 if (ckWARN(WARN_REGEXP)) {
5360 RExC_parse >= rangebegin ?
5361 RExC_parse - rangebegin : 0;
5363 "False [] range \"%*.*s\"",
5366 if (prevvalue < 256) {
5367 ANYOF_BITMAP_SET(ret, prevvalue);
5368 ANYOF_BITMAP_SET(ret, '-');
5371 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5372 Perl_sv_catpvf(aTHX_ listsv,
5373 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5377 range = 0; /* this was not a true range */
5381 const char *what = NULL;
5384 if (namedclass > OOB_NAMEDCLASS)
5385 optimize_invert = FALSE;
5386 /* Possible truncation here but in some 64-bit environments
5387 * the compiler gets heartburn about switch on 64-bit values.
5388 * A similar issue a little earlier when switching on value.
5390 switch ((I32)namedclass) {
5393 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5395 for (value = 0; value < 256; value++)
5397 ANYOF_BITMAP_SET(ret, value);
5404 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5406 for (value = 0; value < 256; value++)
5407 if (!isALNUM(value))
5408 ANYOF_BITMAP_SET(ret, value);
5415 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5417 for (value = 0; value < 256; value++)
5418 if (isALNUMC(value))
5419 ANYOF_BITMAP_SET(ret, value);
5426 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5428 for (value = 0; value < 256; value++)
5429 if (!isALNUMC(value))
5430 ANYOF_BITMAP_SET(ret, value);
5437 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5439 for (value = 0; value < 256; value++)
5441 ANYOF_BITMAP_SET(ret, value);
5448 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5450 for (value = 0; value < 256; value++)
5451 if (!isALPHA(value))
5452 ANYOF_BITMAP_SET(ret, value);
5459 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5462 for (value = 0; value < 128; value++)
5463 ANYOF_BITMAP_SET(ret, value);
5465 for (value = 0; value < 256; value++) {
5467 ANYOF_BITMAP_SET(ret, value);
5476 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5479 for (value = 128; value < 256; value++)
5480 ANYOF_BITMAP_SET(ret, value);
5482 for (value = 0; value < 256; value++) {
5483 if (!isASCII(value))
5484 ANYOF_BITMAP_SET(ret, value);
5493 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5495 for (value = 0; value < 256; value++)
5497 ANYOF_BITMAP_SET(ret, value);
5504 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5506 for (value = 0; value < 256; value++)
5507 if (!isBLANK(value))
5508 ANYOF_BITMAP_SET(ret, value);
5515 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5517 for (value = 0; value < 256; value++)
5519 ANYOF_BITMAP_SET(ret, value);
5526 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5528 for (value = 0; value < 256; value++)
5529 if (!isCNTRL(value))
5530 ANYOF_BITMAP_SET(ret, value);
5537 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5539 /* consecutive digits assumed */
5540 for (value = '0'; value <= '9'; value++)
5541 ANYOF_BITMAP_SET(ret, value);
5548 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5550 /* consecutive digits assumed */
5551 for (value = 0; value < '0'; value++)
5552 ANYOF_BITMAP_SET(ret, value);
5553 for (value = '9' + 1; value < 256; value++)
5554 ANYOF_BITMAP_SET(ret, value);
5561 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5563 for (value = 0; value < 256; value++)
5565 ANYOF_BITMAP_SET(ret, value);
5572 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5574 for (value = 0; value < 256; value++)
5575 if (!isGRAPH(value))
5576 ANYOF_BITMAP_SET(ret, value);
5583 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5585 for (value = 0; value < 256; value++)
5587 ANYOF_BITMAP_SET(ret, value);
5594 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5596 for (value = 0; value < 256; value++)
5597 if (!isLOWER(value))
5598 ANYOF_BITMAP_SET(ret, value);
5605 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5607 for (value = 0; value < 256; value++)
5609 ANYOF_BITMAP_SET(ret, value);
5616 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5618 for (value = 0; value < 256; value++)
5619 if (!isPRINT(value))
5620 ANYOF_BITMAP_SET(ret, value);
5627 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5629 for (value = 0; value < 256; value++)
5630 if (isPSXSPC(value))
5631 ANYOF_BITMAP_SET(ret, value);
5638 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5640 for (value = 0; value < 256; value++)
5641 if (!isPSXSPC(value))
5642 ANYOF_BITMAP_SET(ret, value);
5649 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5651 for (value = 0; value < 256; value++)
5653 ANYOF_BITMAP_SET(ret, value);
5660 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5662 for (value = 0; value < 256; value++)
5663 if (!isPUNCT(value))
5664 ANYOF_BITMAP_SET(ret, value);
5671 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5673 for (value = 0; value < 256; value++)
5675 ANYOF_BITMAP_SET(ret, value);
5682 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5684 for (value = 0; value < 256; value++)
5685 if (!isSPACE(value))
5686 ANYOF_BITMAP_SET(ret, value);
5693 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5695 for (value = 0; value < 256; value++)
5697 ANYOF_BITMAP_SET(ret, value);
5704 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5706 for (value = 0; value < 256; value++)
5707 if (!isUPPER(value))
5708 ANYOF_BITMAP_SET(ret, value);
5715 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5717 for (value = 0; value < 256; value++)
5718 if (isXDIGIT(value))
5719 ANYOF_BITMAP_SET(ret, value);
5726 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5728 for (value = 0; value < 256; value++)
5729 if (!isXDIGIT(value))
5730 ANYOF_BITMAP_SET(ret, value);
5736 /* this is to handle \p and \P */
5739 vFAIL("Invalid [::] class");
5743 /* Strings such as "+utf8::isWord\n" */
5744 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5747 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5750 } /* end of namedclass \blah */
5753 if (prevvalue > (IV)value) /* b-a */ {
5754 const int w = RExC_parse - rangebegin;
5755 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5756 range = 0; /* not a valid range */
5760 prevvalue = value; /* save the beginning of the range */
5761 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5762 RExC_parse[1] != ']') {
5765 /* a bad range like \w-, [:word:]- ? */
5766 if (namedclass > OOB_NAMEDCLASS) {
5767 if (ckWARN(WARN_REGEXP)) {
5769 RExC_parse >= rangebegin ?
5770 RExC_parse - rangebegin : 0;
5772 "False [] range \"%*.*s\"",
5776 ANYOF_BITMAP_SET(ret, '-');
5778 range = 1; /* yeah, it's a range! */
5779 continue; /* but do it the next time */
5783 /* now is the next time */
5784 /*stored += (value - prevvalue + 1);*/
5786 if (prevvalue < 256) {
5787 const IV ceilvalue = value < 256 ? value : 255;
5790 /* In EBCDIC [\x89-\x91] should include
5791 * the \x8e but [i-j] should not. */
5792 if (literal_endpoint == 2 &&
5793 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5794 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5796 if (isLOWER(prevvalue)) {
5797 for (i = prevvalue; i <= ceilvalue; i++)
5799 ANYOF_BITMAP_SET(ret, i);
5801 for (i = prevvalue; i <= ceilvalue; i++)
5803 ANYOF_BITMAP_SET(ret, i);
5808 for (i = prevvalue; i <= ceilvalue; i++) {
5809 if (!ANYOF_BITMAP_TEST(ret,i)) {
5811 ANYOF_BITMAP_SET(ret, i);
5815 if (value > 255 || UTF) {
5816 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5817 const UV natvalue = NATIVE_TO_UNI(value);
5818 stored+=2; /* can't optimize this class */
5819 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5820 if (prevnatvalue < natvalue) { /* what about > ? */
5821 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5822 prevnatvalue, natvalue);
5824 else if (prevnatvalue == natvalue) {
5825 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5827 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5829 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5831 /* If folding and foldable and a single
5832 * character, insert also the folded version
5833 * to the charclass. */
5835 if (foldlen == (STRLEN)UNISKIP(f))
5836 Perl_sv_catpvf(aTHX_ listsv,
5839 /* Any multicharacter foldings
5840 * require the following transform:
5841 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5842 * where E folds into "pq" and F folds
5843 * into "rst", all other characters
5844 * fold to single characters. We save
5845 * away these multicharacter foldings,
5846 * to be later saved as part of the
5847 * additional "s" data. */
5850 if (!unicode_alternate)
5851 unicode_alternate = newAV();
5852 sv = newSVpvn((char*)foldbuf, foldlen);
5854 av_push(unicode_alternate, sv);
5858 /* If folding and the value is one of the Greek
5859 * sigmas insert a few more sigmas to make the
5860 * folding rules of the sigmas to work right.
5861 * Note that not all the possible combinations
5862 * are handled here: some of them are handled
5863 * by the standard folding rules, and some of
5864 * them (literal or EXACTF cases) are handled
5865 * during runtime in regexec.c:S_find_byclass(). */
5866 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5867 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5868 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5869 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5870 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5872 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5873 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5874 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5879 literal_endpoint = 0;
5883 range = 0; /* this range (if it was one) is done now */
5887 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5889 RExC_size += ANYOF_CLASS_ADD_SKIP;
5891 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5897 /****** !SIZE_ONLY AFTER HERE *********/
5899 if( stored == 1 && value < 256
5900 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5902 /* optimize single char class to an EXACT node
5903 but *only* when its not a UTF/high char */
5904 const char * cur_parse= RExC_parse;
5905 RExC_emit = (regnode *)orig_emit;
5906 RExC_parse = (char *)orig_parse;
5907 ret = reg_node(pRExC_state,
5908 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5909 RExC_parse = (char *)cur_parse;
5910 *STRING(ret)= (char)value;
5912 RExC_emit += STR_SZ(1);
5915 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5916 if ( /* If the only flag is folding (plus possibly inversion). */
5917 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5919 for (value = 0; value < 256; ++value) {
5920 if (ANYOF_BITMAP_TEST(ret, value)) {
5921 UV fold = PL_fold[value];
5924 ANYOF_BITMAP_SET(ret, fold);
5927 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5930 /* optimize inverted simple patterns (e.g. [^a-z]) */
5931 if (optimize_invert &&
5932 /* If the only flag is inversion. */
5933 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5934 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5935 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5936 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5939 AV * const av = newAV();
5941 /* The 0th element stores the character class description
5942 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5943 * to initialize the appropriate swash (which gets stored in
5944 * the 1st element), and also useful for dumping the regnode.
5945 * The 2nd element stores the multicharacter foldings,
5946 * used later (regexec.c:S_reginclass()). */
5947 av_store(av, 0, listsv);
5948 av_store(av, 1, NULL);
5949 av_store(av, 2, (SV*)unicode_alternate);
5950 rv = newRV_noinc((SV*)av);
5951 n = add_data(pRExC_state, 1, "s");
5952 RExC_rx->data->data[n] = (void*)rv;
5959 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5961 char* const retval = RExC_parse++;
5964 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5965 RExC_parse[2] == '#') {
5966 while (*RExC_parse != ')') {
5967 if (RExC_parse == RExC_end)
5968 FAIL("Sequence (?#... not terminated");
5974 if (RExC_flags & PMf_EXTENDED) {
5975 if (isSPACE(*RExC_parse)) {
5979 else if (*RExC_parse == '#') {
5980 while (RExC_parse < RExC_end)
5981 if (*RExC_parse++ == '\n') break;
5990 - reg_node - emit a node
5992 STATIC regnode * /* Location. */
5993 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5996 register regnode *ptr;
5997 regnode * const ret = RExC_emit;
5998 GET_RE_DEBUG_FLAGS_DECL;
6001 SIZE_ALIGN(RExC_size);
6005 NODE_ALIGN_FILL(ret);
6007 FILL_ADVANCE_NODE(ptr, op);
6008 if (RExC_offsets) { /* MJD */
6009 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6010 "reg_node", __LINE__,
6012 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6013 ? "Overwriting end of array!\n" : "OK",
6014 (UV)(RExC_emit - RExC_emit_start),
6015 (UV)(RExC_parse - RExC_start),
6016 (UV)RExC_offsets[0]));
6017 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6026 - reganode - emit a node with an argument
6028 STATIC regnode * /* Location. */
6029 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6032 register regnode *ptr;
6033 regnode * const ret = RExC_emit;
6034 GET_RE_DEBUG_FLAGS_DECL;
6037 SIZE_ALIGN(RExC_size);
6042 NODE_ALIGN_FILL(ret);
6044 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6045 if (RExC_offsets) { /* MJD */
6046 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6050 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6051 "Overwriting end of array!\n" : "OK",
6052 (UV)(RExC_emit - RExC_emit_start),
6053 (UV)(RExC_parse - RExC_start),
6054 (UV)RExC_offsets[0]));
6055 Set_Cur_Node_Offset;
6064 - reguni - emit (if appropriate) a Unicode character
6067 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6070 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6074 - reginsert - insert an operator in front of already-emitted operand
6076 * Means relocating the operand.
6079 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6082 register regnode *src;
6083 register regnode *dst;
6084 register regnode *place;
6085 const int offset = regarglen[(U8)op];
6086 GET_RE_DEBUG_FLAGS_DECL;
6087 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6090 RExC_size += NODE_STEP_REGNODE + offset;
6095 RExC_emit += NODE_STEP_REGNODE + offset;
6097 while (src > opnd) {
6098 StructCopy(--src, --dst, regnode);
6099 if (RExC_offsets) { /* MJD 20010112 */
6100 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6104 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6105 ? "Overwriting end of array!\n" : "OK",
6106 (UV)(src - RExC_emit_start),
6107 (UV)(dst - RExC_emit_start),
6108 (UV)RExC_offsets[0]));
6109 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6110 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6115 place = opnd; /* Op node, where operand used to be. */
6116 if (RExC_offsets) { /* MJD */
6117 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6121 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6122 ? "Overwriting end of array!\n" : "OK",
6123 (UV)(place - RExC_emit_start),
6124 (UV)(RExC_parse - RExC_start),
6126 Set_Node_Offset(place, RExC_parse);
6127 Set_Node_Length(place, 1);
6129 src = NEXTOPER(place);
6130 FILL_ADVANCE_NODE(place, op);
6131 Zero(src, offset, regnode);
6135 - regtail - set the next-pointer at the end of a node chain of p to val.
6136 - SEE ALSO: regtail_study
6138 /* TODO: All three parms should be const */
6140 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6143 register regnode *scan;
6144 GET_RE_DEBUG_FLAGS_DECL;
6149 /* Find last node. */
6152 regnode * const temp = regnext(scan);
6154 SV * const mysv=sv_newmortal();
6155 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6156 regprop(RExC_rx, mysv, scan);
6157 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6158 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6165 if (reg_off_by_arg[OP(scan)]) {
6166 ARG_SET(scan, val - scan);
6169 NEXT_OFF(scan) = val - scan;
6175 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6176 - Look for optimizable sequences at the same time.
6177 - currently only looks for EXACT chains.
6179 This is expermental code. The idea is to use this routine to perform
6180 in place optimizations on branches and groups as they are constructed,
6181 with the long term intention of removing optimization from study_chunk so
6182 that it is purely analytical.
6184 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6185 to control which is which.
6188 /* TODO: All four parms should be const */
6191 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6194 register regnode *scan;
6196 #ifdef EXPERIMENTAL_INPLACESCAN
6200 GET_RE_DEBUG_FLAGS_DECL;
6206 /* Find last node. */
6210 regnode * const temp = regnext(scan);
6211 #ifdef EXPERIMENTAL_INPLACESCAN
6212 if (PL_regkind[OP(scan)] == EXACT)
6213 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6221 if( exact == PSEUDO )
6223 else if ( exact != OP(scan) )
6232 SV * const mysv=sv_newmortal();
6233 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6234 regprop(RExC_rx, mysv, scan);
6235 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6236 SvPV_nolen_const(mysv),
6238 REG_NODE_NUM(scan));
6245 SV * const mysv_val=sv_newmortal();
6246 DEBUG_PARSE_MSG("");
6247 regprop(RExC_rx, mysv_val, val);
6248 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6249 SvPV_nolen_const(mysv_val),
6254 if (reg_off_by_arg[OP(scan)]) {
6255 ARG_SET(scan, val - scan);
6258 NEXT_OFF(scan) = val - scan;
6266 - regcurly - a little FSA that accepts {\d+,?\d*}
6269 S_regcurly(register const char *s)
6288 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6291 Perl_regdump(pTHX_ const regexp *r)
6295 SV * const sv = sv_newmortal();
6297 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6299 /* Header fields of interest. */
6300 if (r->anchored_substr)
6301 PerlIO_printf(Perl_debug_log,
6302 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
6304 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
6305 SvPVX_const(r->anchored_substr),
6307 SvTAIL(r->anchored_substr) ? "$" : "",
6308 (IV)r->anchored_offset);
6309 else if (r->anchored_utf8)
6310 PerlIO_printf(Perl_debug_log,
6311 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
6313 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
6314 SvPVX_const(r->anchored_utf8),
6316 SvTAIL(r->anchored_utf8) ? "$" : "",
6317 (IV)r->anchored_offset);
6318 if (r->float_substr)
6319 PerlIO_printf(Perl_debug_log,
6320 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6322 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6323 SvPVX_const(r->float_substr),
6325 SvTAIL(r->float_substr) ? "$" : "",
6326 (IV)r->float_min_offset, (UV)r->float_max_offset);
6327 else if (r->float_utf8)
6328 PerlIO_printf(Perl_debug_log,
6329 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6331 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6332 SvPVX_const(r->float_utf8),
6334 SvTAIL(r->float_utf8) ? "$" : "",
6335 (IV)r->float_min_offset, (UV)r->float_max_offset);
6336 if (r->check_substr || r->check_utf8)
6337 PerlIO_printf(Perl_debug_log,
6338 r->check_substr == r->float_substr
6339 && r->check_utf8 == r->float_utf8
6340 ? "(checking floating" : "(checking anchored");
6341 if (r->reganch & ROPT_NOSCAN)
6342 PerlIO_printf(Perl_debug_log, " noscan");
6343 if (r->reganch & ROPT_CHECK_ALL)
6344 PerlIO_printf(Perl_debug_log, " isall");
6345 if (r->check_substr || r->check_utf8)
6346 PerlIO_printf(Perl_debug_log, ") ");
6348 if (r->regstclass) {
6349 regprop(r, sv, r->regstclass);
6350 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6352 if (r->reganch & ROPT_ANCH) {
6353 PerlIO_printf(Perl_debug_log, "anchored");
6354 if (r->reganch & ROPT_ANCH_BOL)
6355 PerlIO_printf(Perl_debug_log, "(BOL)");
6356 if (r->reganch & ROPT_ANCH_MBOL)
6357 PerlIO_printf(Perl_debug_log, "(MBOL)");
6358 if (r->reganch & ROPT_ANCH_SBOL)
6359 PerlIO_printf(Perl_debug_log, "(SBOL)");
6360 if (r->reganch & ROPT_ANCH_GPOS)
6361 PerlIO_printf(Perl_debug_log, "(GPOS)");
6362 PerlIO_putc(Perl_debug_log, ' ');
6364 if (r->reganch & ROPT_GPOS_SEEN)
6365 PerlIO_printf(Perl_debug_log, "GPOS ");
6366 if (r->reganch & ROPT_SKIP)
6367 PerlIO_printf(Perl_debug_log, "plus ");
6368 if (r->reganch & ROPT_IMPLICIT)
6369 PerlIO_printf(Perl_debug_log, "implicit ");
6370 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6371 if (r->reganch & ROPT_EVAL_SEEN)
6372 PerlIO_printf(Perl_debug_log, "with eval ");
6373 PerlIO_printf(Perl_debug_log, "\n");
6375 PERL_UNUSED_CONTEXT;
6377 #endif /* DEBUGGING */
6381 - regprop - printable representation of opcode
6384 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6390 sv_setpvn(sv, "", 0);
6391 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6392 /* It would be nice to FAIL() here, but this may be called from
6393 regexec.c, and it would be hard to supply pRExC_state. */
6394 Perl_croak(aTHX_ "Corrupted regexp opcode");
6395 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6397 k = PL_regkind[OP(o)];
6400 SV * const dsv = sv_2mortal(newSVpvs(""));
6401 /* Using is_utf8_string() is a crude hack but it may
6402 * be the best for now since we have no flag "this EXACTish
6403 * node was UTF-8" --jhi */
6404 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6405 const char * const s = do_utf8 ?
6406 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6407 UNI_DISPLAY_REGEX) :
6409 const int len = do_utf8 ?
6412 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6416 } else if (k == TRIE) {
6417 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6418 /* print the details of the trie in dumpuntil instead, as
6419 * prog->data isn't available here */
6420 } else if (k == CURLY) {
6421 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6422 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6423 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6425 else if (k == WHILEM && o->flags) /* Ordinal/of */
6426 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6427 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6428 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6429 else if (k == LOGICAL)
6430 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6431 else if (k == ANYOF) {
6432 int i, rangestart = -1;
6433 const U8 flags = ANYOF_FLAGS(o);
6435 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6436 static const char * const anyofs[] = {
6469 if (flags & ANYOF_LOCALE)
6470 sv_catpvs(sv, "{loc}");
6471 if (flags & ANYOF_FOLD)
6472 sv_catpvs(sv, "{i}");
6473 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6474 if (flags & ANYOF_INVERT)
6476 for (i = 0; i <= 256; i++) {
6477 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6478 if (rangestart == -1)
6480 } else if (rangestart != -1) {
6481 if (i <= rangestart + 3)
6482 for (; rangestart < i; rangestart++)
6483 put_byte(sv, rangestart);
6485 put_byte(sv, rangestart);
6487 put_byte(sv, i - 1);
6493 if (o->flags & ANYOF_CLASS)
6494 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6495 if (ANYOF_CLASS_TEST(o,i))
6496 sv_catpv(sv, anyofs[i]);
6498 if (flags & ANYOF_UNICODE)
6499 sv_catpvs(sv, "{unicode}");
6500 else if (flags & ANYOF_UNICODE_ALL)
6501 sv_catpvs(sv, "{unicode_all}");
6505 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6509 U8 s[UTF8_MAXBYTES_CASE+1];
6511 for (i = 0; i <= 256; i++) { /* just the first 256 */
6512 uvchr_to_utf8(s, i);
6514 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6515 if (rangestart == -1)
6517 } else if (rangestart != -1) {
6518 if (i <= rangestart + 3)
6519 for (; rangestart < i; rangestart++) {
6520 const U8 * const e = uvchr_to_utf8(s,rangestart);
6522 for(p = s; p < e; p++)
6526 const U8 *e = uvchr_to_utf8(s,rangestart);
6528 for (p = s; p < e; p++)
6531 e = uvchr_to_utf8(s, i-1);
6532 for (p = s; p < e; p++)
6539 sv_catpvs(sv, "..."); /* et cetera */
6543 char *s = savesvpv(lv);
6544 char * const origs = s;
6546 while (*s && *s != '\n')
6550 const char * const t = ++s;
6568 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6570 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6571 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6573 PERL_UNUSED_CONTEXT;
6574 PERL_UNUSED_ARG(sv);
6576 #endif /* DEBUGGING */
6580 Perl_re_intuit_string(pTHX_ regexp *prog)
6581 { /* Assume that RE_INTUIT is set */
6583 GET_RE_DEBUG_FLAGS_DECL;
6584 PERL_UNUSED_CONTEXT;
6588 const char * const s = SvPV_nolen_const(prog->check_substr
6589 ? prog->check_substr : prog->check_utf8);
6591 if (!PL_colorset) reginitcolors();
6592 PerlIO_printf(Perl_debug_log,
6593 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6595 prog->check_substr ? "" : "utf8 ",
6596 PL_colors[5],PL_colors[0],
6599 (strlen(s) > 60 ? "..." : ""));
6602 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6606 Perl_pregfree(pTHX_ struct regexp *r)
6610 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6612 GET_RE_DEBUG_FLAGS_DECL;
6614 if (!r || (--r->refcnt > 0))
6616 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
6617 const char * const s = (r->reganch & ROPT_UTF8)
6618 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6619 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6620 const int len = SvCUR(dsv);
6623 PerlIO_printf(Perl_debug_log,
6624 "%sFreeing REx:%s %s%*.*s%s%s\n",
6625 PL_colors[4],PL_colors[5],PL_colors[0],
6628 len > 60 ? "..." : "");
6631 /* gcov results gave these as non-null 100% of the time, so there's no
6632 optimisation in checking them before calling Safefree */
6633 Safefree(r->precomp);
6634 Safefree(r->offsets); /* 20010421 MJD */
6635 RX_MATCH_COPY_FREE(r);
6636 #ifdef PERL_OLD_COPY_ON_WRITE
6638 SvREFCNT_dec(r->saved_copy);
6641 if (r->anchored_substr)
6642 SvREFCNT_dec(r->anchored_substr);
6643 if (r->anchored_utf8)
6644 SvREFCNT_dec(r->anchored_utf8);
6645 if (r->float_substr)
6646 SvREFCNT_dec(r->float_substr);
6648 SvREFCNT_dec(r->float_utf8);
6649 Safefree(r->substrs);
6652 int n = r->data->count;
6653 PAD* new_comppad = NULL;
6658 /* If you add a ->what type here, update the comment in regcomp.h */
6659 switch (r->data->what[n]) {
6661 SvREFCNT_dec((SV*)r->data->data[n]);
6664 Safefree(r->data->data[n]);
6667 new_comppad = (AV*)r->data->data[n];
6670 if (new_comppad == NULL)
6671 Perl_croak(aTHX_ "panic: pregfree comppad");
6672 PAD_SAVE_LOCAL(old_comppad,
6673 /* Watch out for global destruction's random ordering. */
6674 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6677 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6680 op_free((OP_4tree*)r->data->data[n]);
6682 PAD_RESTORE_LOCAL(old_comppad);
6683 SvREFCNT_dec((SV*)new_comppad);
6689 { /* Aho Corasick add-on structure for a trie node.
6690 Used in stclass optimization only */
6692 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6694 refcount = --aho->refcount;
6697 Safefree(aho->states);
6698 Safefree(aho->fail);
6699 aho->trie=NULL; /* not necessary to free this as it is
6700 handled by the 't' case */
6701 Safefree(r->data->data[n]); /* do this last!!!! */
6702 Safefree(r->regstclass);
6708 /* trie structure. */
6710 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6712 refcount = --trie->refcount;
6715 Safefree(trie->charmap);
6716 if (trie->widecharmap)
6717 SvREFCNT_dec((SV*)trie->widecharmap);
6718 Safefree(trie->states);
6719 Safefree(trie->trans);
6721 Safefree(trie->bitmap);
6723 Safefree(trie->wordlen);
6727 SvREFCNT_dec((SV*)trie->words);
6728 if (trie->revcharmap)
6729 SvREFCNT_dec((SV*)trie->revcharmap);
6732 Safefree(r->data->data[n]); /* do this last!!!! */
6737 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6740 Safefree(r->data->what);
6743 Safefree(r->startp);
6748 #ifndef PERL_IN_XSUB_RE
6750 - regnext - dig the "next" pointer out of a node
6753 Perl_regnext(pTHX_ register regnode *p)
6756 register I32 offset;
6758 if (p == &PL_regdummy)
6761 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6770 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6773 STRLEN l1 = strlen(pat1);
6774 STRLEN l2 = strlen(pat2);
6777 const char *message;
6783 Copy(pat1, buf, l1 , char);
6784 Copy(pat2, buf + l1, l2 , char);
6785 buf[l1 + l2] = '\n';
6786 buf[l1 + l2 + 1] = '\0';
6788 /* ANSI variant takes additional second argument */
6789 va_start(args, pat2);
6793 msv = vmess(buf, &args);
6795 message = SvPV_const(msv,l1);
6798 Copy(message, buf, l1 , char);
6799 buf[l1-1] = '\0'; /* Overwrite \n */
6800 Perl_croak(aTHX_ "%s", buf);
6803 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6805 #ifndef PERL_IN_XSUB_RE
6807 Perl_save_re_context(pTHX)
6811 struct re_save_state *state;
6813 SAVEVPTR(PL_curcop);
6814 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6816 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6817 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6818 SSPUSHINT(SAVEt_RE_STATE);
6820 Copy(&PL_reg_state, state, 1, struct re_save_state);
6822 PL_reg_start_tmp = 0;
6823 PL_reg_start_tmpl = 0;
6824 PL_reg_oldsaved = NULL;
6825 PL_reg_oldsavedlen = 0;
6827 PL_reg_leftiter = 0;
6828 PL_reg_poscache = NULL;
6829 PL_reg_poscache_size = 0;
6830 #ifdef PERL_OLD_COPY_ON_WRITE
6834 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6836 const REGEXP * const rx = PM_GETRE(PL_curpm);
6839 for (i = 1; i <= rx->nparens; i++) {
6840 char digits[TYPE_CHARS(long)];
6841 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6842 GV *const *const gvp
6843 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6846 GV * const gv = *gvp;
6847 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6857 clear_re(pTHX_ void *r)
6860 ReREFCNT_dec((regexp *)r);
6866 S_put_byte(pTHX_ SV *sv, int c)
6868 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6869 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6870 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6871 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6873 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6876 #define CLEAR_OPTSTART \
6877 if (optstart) STMT_START { \
6878 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6882 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6884 STATIC const regnode *
6885 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6886 const regnode *last, SV* sv, I32 l)
6889 register U8 op = EXACT; /* Arbitrary non-END op. */
6890 register const regnode *next;
6891 const regnode *optstart= NULL;
6892 GET_RE_DEBUG_FLAGS_DECL;
6894 while (op != END && (!last || node < last)) {
6895 /* While that wasn't END last time... */
6901 next = regnext((regnode *)node);
6904 if (OP(node) == OPTIMIZED) {
6905 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
6912 regprop(r, sv, node);
6913 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6914 (int)(2*l + 1), "", SvPVX_const(sv));
6916 if (OP(node) != OPTIMIZED) {
6917 if (next == NULL) /* Next ptr. */
6918 PerlIO_printf(Perl_debug_log, "(0)");
6920 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6921 (void)PerlIO_putc(Perl_debug_log, '\n');
6925 if (PL_regkind[(U8)op] == BRANCHJ) {
6928 register const regnode *nnode = (OP(next) == LONGJMP
6929 ? regnext((regnode *)next)
6931 if (last && nnode > last)
6933 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6936 else if (PL_regkind[(U8)op] == BRANCH) {
6938 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6940 else if ( PL_regkind[(U8)op] == TRIE ) {
6941 const I32 n = ARG(node);
6942 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6943 const I32 arry_len = av_len(trie->words)+1;
6945 PerlIO_printf(Perl_debug_log,
6946 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
6950 TRIE_WORDCOUNT(trie),
6951 (int)TRIE_CHARCOUNT(trie),
6952 trie->uniquecharcount,
6953 (IV)TRIE_LASTSTATE(trie)-1,
6960 sv_setpvn(sv, "", 0);
6961 for (i = 0; i <= 256; i++) {
6962 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6963 if (rangestart == -1)
6965 } else if (rangestart != -1) {
6966 if (i <= rangestart + 3)
6967 for (; rangestart < i; rangestart++)
6968 put_byte(sv, rangestart);
6970 put_byte(sv, rangestart);
6972 put_byte(sv, i - 1);
6977 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
6979 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
6981 for (word_idx=0; word_idx < arry_len; word_idx++) {
6982 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6984 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6987 SvPV_nolen_const(*elem_ptr),
6993 node = NEXTOPER(node);
6994 node += regarglen[(U8)op];
6997 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6998 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6999 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7001 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7003 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7006 else if ( op == PLUS || op == STAR) {
7007 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
7009 else if (op == ANYOF) {
7010 /* arglen 1 + class block */
7011 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7012 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7013 node = NEXTOPER(node);
7015 else if (PL_regkind[(U8)op] == EXACT) {
7016 /* Literal string, where present. */
7017 node += NODE_SZ_STR(node) - 1;
7018 node = NEXTOPER(node);
7021 node = NEXTOPER(node);
7022 node += regarglen[(U8)op];
7024 if (op == CURLYX || op == OPEN)
7026 else if (op == WHILEM)
7033 #endif /* DEBUGGING */
7037 * c-indentation-style: bsd
7039 * indent-tabs-mode: t
7042 * ex: set ts=8 sts=4 sw=4 noet: