5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
760 #define TRIE_READ_CHAR STMT_START { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
785 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
801 #define TRIE_LIST_NEW(state) STMT_START { \
802 Newxz( trie->states[ state ].trans.list, \
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
808 #define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
812 trie->wordlen[ curword ] = wordlen; \
813 trie->states[ state ].wordnum = ++curword; \
815 /* store the word for dumping */ \
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
820 tmp = newSVpvn( "", 0 ); \
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
825 NOOP; /* It's a dupe. So ignore it. */ \
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
835 These routines dump out a trie in a somewhat readable format.
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
852 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
855 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
909 PerlIO_printf( Perl_debug_log, "]");
912 PerlIO_printf( Perl_debug_log, "\n" );
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
923 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
932 for( state=1 ; state < next_alloc ; state ++ ) {
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
964 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
968 GET_RE_DEBUG_FLAGS_DECL;
971 print out the table precompression so that we can do a visual check
972 that they are identical.
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1013 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1023 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1025 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
1051 U32 base = trie->states[ 1 ].trans.base;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1055 GET_RE_DEBUG_FLAGS_DECL;
1057 PERL_UNUSED_ARG(depth);
1061 ARG_SET( stclass, data_slot );
1062 Newxz( aho, 1, reg_ac_data );
1063 RExC_rx->data->data[ data_slot ] = (void*)aho;
1065 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1066 (trie->laststate+1)*sizeof(reg_trie_state));
1067 Newxz( q, numstates, U32);
1068 Newxz( aho->fail, numstates, U32 );
1071 fail[ 0 ] = fail[ 1 ] = 1;
1073 for ( charid = 0; charid < ucharcount ; charid++ ) {
1074 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1076 q[ q_write ] = newstate;
1077 /* set to point at the root */
1078 fail[ q[ q_write++ ] ]=1;
1081 while ( q_read < q_write) {
1082 const U32 cur = q[ q_read++ % numstates ];
1083 base = trie->states[ cur ].trans.base;
1085 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1086 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1088 U32 fail_state = cur;
1091 fail_state = fail[ fail_state ];
1092 fail_base = aho->states[ fail_state ].trans.base;
1093 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1095 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1096 fail[ ch_state ] = fail_state;
1097 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1099 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1101 q[ q_write++ % numstates] = ch_state;
1106 DEBUG_TRIE_COMPILE_MORE_r({
1107 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1108 for( q_read=2; q_read<numstates; q_read++ ) {
1109 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1111 PerlIO_printf(Perl_debug_log, "\n");
1114 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1120 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1123 /* first pass, loop through and scan words */
1124 reg_trie_data *trie;
1126 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1131 /* we just use folder as a flag in utf8 */
1132 const U8 * const folder = ( flags == EXACTF
1134 : ( flags == EXACTFL
1140 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1141 SV *re_trie_maxbuff;
1143 /* these are only used during construction but are useful during
1144 * debugging so we store them in the struct when debugging.
1145 * Wordcount is actually superfluous in debugging as we have
1146 * (AV*)trie->words to use for it, but that's not available when
1147 * not debugging... We could make the macro use the AV during
1148 * debugging though...
1150 U16 trie_wordcount=0;
1151 STRLEN trie_charcount=0;
1152 /*U32 trie_laststate=0;*/
1153 AV *trie_revcharmap;
1155 GET_RE_DEBUG_FLAGS_DECL;
1157 PERL_UNUSED_ARG(depth);
1160 Newxz( trie, 1, reg_trie_data );
1162 trie->startstate = 1;
1163 RExC_rx->data->data[ data_slot ] = (void*)trie;
1164 Newxz( trie->charmap, 256, U16 );
1165 if (!(UTF && folder))
1166 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1168 trie->words = newAV();
1170 TRIE_REVCHARMAP(trie) = newAV();
1172 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1173 if (!SvIOK(re_trie_maxbuff)) {
1174 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1177 PerlIO_printf( Perl_debug_log,
1178 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1179 (int)depth * 2 + 2, "",
1180 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1181 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1183 /* -- First loop and Setup --
1185 We first traverse the branches and scan each word to determine if it
1186 contains widechars, and how many unique chars there are, this is
1187 important as we have to build a table with at least as many columns as we
1190 We use an array of integers to represent the character codes 0..255
1191 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1192 native representation of the character value as the key and IV's for the
1195 *TODO* If we keep track of how many times each character is used we can
1196 remap the columns so that the table compression later on is more
1197 efficient in terms of memory by ensuring most common value is in the
1198 middle and the least common are on the outside. IMO this would be better
1199 than a most to least common mapping as theres a decent chance the most
1200 common letter will share a node with the least common, meaning the node
1201 will not be compressable. With a middle is most common approach the worst
1202 case is when we have the least common nodes twice.
1206 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1207 regnode * const noper = NEXTOPER( cur );
1208 const U8 *uc = (U8*)STRING( noper );
1209 const U8 * const e = uc + STR_LEN( noper );
1211 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1212 const U8 *scan = (U8*)NULL;
1213 U32 wordlen = 0; /* required init */
1216 TRIE_WORDCOUNT(trie)++;
1217 if (OP(noper) == NOTHING) {
1222 TRIE_BITMAP_SET(trie,*uc);
1223 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1225 for ( ; uc < e ; uc += len ) {
1226 TRIE_CHARCOUNT(trie)++;
1230 if ( !trie->charmap[ uvc ] ) {
1231 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1233 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1238 if ( !trie->widecharmap )
1239 trie->widecharmap = newHV();
1241 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1244 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1246 if ( !SvTRUE( *svpp ) ) {
1247 sv_setiv( *svpp, ++trie->uniquecharcount );
1252 if( cur == first ) {
1255 } else if (chars < trie->minlen) {
1257 } else if (chars > trie->maxlen) {
1261 } /* end first pass */
1262 DEBUG_TRIE_COMPILE_r(
1263 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1264 (int)depth * 2 + 2,"",
1265 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1266 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1267 (int)trie->minlen, (int)trie->maxlen )
1269 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1272 We now know what we are dealing with in terms of unique chars and
1273 string sizes so we can calculate how much memory a naive
1274 representation using a flat table will take. If it's over a reasonable
1275 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1276 conservative but potentially much slower representation using an array
1279 At the end we convert both representations into the same compressed
1280 form that will be used in regexec.c for matching with. The latter
1281 is a form that cannot be used to construct with but has memory
1282 properties similar to the list form and access properties similar
1283 to the table form making it both suitable for fast searches and
1284 small enough that its feasable to store for the duration of a program.
1286 See the comment in the code where the compressed table is produced
1287 inplace from the flat tabe representation for an explanation of how
1288 the compression works.
1293 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1295 Second Pass -- Array Of Lists Representation
1297 Each state will be represented by a list of charid:state records
1298 (reg_trie_trans_le) the first such element holds the CUR and LEN
1299 points of the allocated array. (See defines above).
1301 We build the initial structure using the lists, and then convert
1302 it into the compressed table form which allows faster lookups
1303 (but cant be modified once converted).
1306 STRLEN transcount = 1;
1308 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1312 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1314 regnode * const noper = NEXTOPER( cur );
1315 U8 *uc = (U8*)STRING( noper );
1316 const U8 * const e = uc + STR_LEN( noper );
1317 U32 state = 1; /* required init */
1318 U16 charid = 0; /* sanity init */
1319 U8 *scan = (U8*)NULL; /* sanity init */
1320 STRLEN foldlen = 0; /* required init */
1321 U32 wordlen = 0; /* required init */
1322 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1324 if (OP(noper) != NOTHING) {
1325 for ( ; uc < e ; uc += len ) {
1330 charid = trie->charmap[ uvc ];
1332 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1336 charid=(U16)SvIV( *svpp );
1345 if ( !trie->states[ state ].trans.list ) {
1346 TRIE_LIST_NEW( state );
1348 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1349 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1350 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1355 newstate = next_alloc++;
1356 TRIE_LIST_PUSH( state, charid, newstate );
1361 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1363 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1366 TRIE_HANDLE_WORD(state);
1368 } /* end second pass */
1370 TRIE_LASTSTATE(trie) = next_alloc;
1371 Renew( trie->states, next_alloc, reg_trie_state );
1373 /* and now dump it out before we compress it */
1374 DEBUG_TRIE_COMPILE_MORE_r(
1375 dump_trie_interim_list(trie,next_alloc,depth+1)
1378 Newxz( trie->trans, transcount ,reg_trie_trans );
1385 for( state=1 ; state < next_alloc ; state ++ ) {
1389 DEBUG_TRIE_COMPILE_MORE_r(
1390 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1394 if (trie->states[state].trans.list) {
1395 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1399 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1400 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1401 if ( forid < minid ) {
1403 } else if ( forid > maxid ) {
1407 if ( transcount < tp + maxid - minid + 1) {
1409 Renew( trie->trans, transcount, reg_trie_trans );
1410 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1412 base = trie->uniquecharcount + tp - minid;
1413 if ( maxid == minid ) {
1415 for ( ; zp < tp ; zp++ ) {
1416 if ( ! trie->trans[ zp ].next ) {
1417 base = trie->uniquecharcount + zp - minid;
1418 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ zp ].check = state;
1425 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1426 trie->trans[ tp ].check = state;
1431 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1432 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1433 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1434 trie->trans[ tid ].check = state;
1436 tp += ( maxid - minid + 1 );
1438 Safefree(trie->states[ state ].trans.list);
1441 DEBUG_TRIE_COMPILE_MORE_r(
1442 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1445 trie->states[ state ].trans.base=base;
1447 trie->lasttrans = tp + 1;
1451 Second Pass -- Flat Table Representation.
1453 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1454 We know that we will need Charcount+1 trans at most to store the data
1455 (one row per char at worst case) So we preallocate both structures
1456 assuming worst case.
1458 We then construct the trie using only the .next slots of the entry
1461 We use the .check field of the first entry of the node temporarily to
1462 make compression both faster and easier by keeping track of how many non
1463 zero fields are in the node.
1465 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1468 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1469 number representing the first entry of the node, and state as a
1470 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1471 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1472 are 2 entrys per node. eg:
1480 The table is internally in the right hand, idx form. However as we also
1481 have to deal with the states array which is indexed by nodenum we have to
1482 use TRIE_NODENUM() to convert.
1487 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1489 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1490 next_alloc = trie->uniquecharcount + 1;
1493 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1495 regnode * const noper = NEXTOPER( cur );
1496 const U8 *uc = (U8*)STRING( noper );
1497 const U8 * const e = uc + STR_LEN( noper );
1499 U32 state = 1; /* required init */
1501 U16 charid = 0; /* sanity init */
1502 U32 accept_state = 0; /* sanity init */
1503 U8 *scan = (U8*)NULL; /* sanity init */
1505 STRLEN foldlen = 0; /* required init */
1506 U32 wordlen = 0; /* required init */
1507 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1509 if ( OP(noper) != NOTHING ) {
1510 for ( ; uc < e ; uc += len ) {
1515 charid = trie->charmap[ uvc ];
1517 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1518 charid = svpp ? (U16)SvIV(*svpp) : 0;
1522 if ( !trie->trans[ state + charid ].next ) {
1523 trie->trans[ state + charid ].next = next_alloc;
1524 trie->trans[ state ].check++;
1525 next_alloc += trie->uniquecharcount;
1527 state = trie->trans[ state + charid ].next;
1529 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1531 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1534 accept_state = TRIE_NODENUM( state );
1535 TRIE_HANDLE_WORD(accept_state);
1537 } /* end second pass */
1539 /* and now dump it out before we compress it */
1540 DEBUG_TRIE_COMPILE_MORE_r(
1541 dump_trie_interim_table(trie,next_alloc,depth+1)
1546 * Inplace compress the table.*
1548 For sparse data sets the table constructed by the trie algorithm will
1549 be mostly 0/FAIL transitions or to put it another way mostly empty.
1550 (Note that leaf nodes will not contain any transitions.)
1552 This algorithm compresses the tables by eliminating most such
1553 transitions, at the cost of a modest bit of extra work during lookup:
1555 - Each states[] entry contains a .base field which indicates the
1556 index in the state[] array wheres its transition data is stored.
1558 - If .base is 0 there are no valid transitions from that node.
1560 - If .base is nonzero then charid is added to it to find an entry in
1563 -If trans[states[state].base+charid].check!=state then the
1564 transition is taken to be a 0/Fail transition. Thus if there are fail
1565 transitions at the front of the node then the .base offset will point
1566 somewhere inside the previous nodes data (or maybe even into a node
1567 even earlier), but the .check field determines if the transition is
1570 The following process inplace converts the table to the compressed
1571 table: We first do not compress the root node 1,and mark its all its
1572 .check pointers as 1 and set its .base pointer as 1 as well. This
1573 allows to do a DFA construction from the compressed table later, and
1574 ensures that any .base pointers we calculate later are greater than
1577 - We set 'pos' to indicate the first entry of the second node.
1579 - We then iterate over the columns of the node, finding the first and
1580 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1581 and set the .check pointers accordingly, and advance pos
1582 appropriately and repreat for the next node. Note that when we copy
1583 the next pointers we have to convert them from the original
1584 NODEIDX form to NODENUM form as the former is not valid post
1587 - If a node has no transitions used we mark its base as 0 and do not
1588 advance the pos pointer.
1590 - If a node only has one transition we use a second pointer into the
1591 structure to fill in allocated fail transitions from other states.
1592 This pointer is independent of the main pointer and scans forward
1593 looking for null transitions that are allocated to a state. When it
1594 finds one it writes the single transition into the "hole". If the
1595 pointer doesnt find one the single transition is appeneded as normal.
1597 - Once compressed we can Renew/realloc the structures to release the
1600 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1601 specifically Fig 3.47 and the associated pseudocode.
1605 const U32 laststate = TRIE_NODENUM( next_alloc );
1608 TRIE_LASTSTATE(trie) = laststate;
1610 for ( state = 1 ; state < laststate ; state++ ) {
1612 const U32 stateidx = TRIE_NODEIDX( state );
1613 const U32 o_used = trie->trans[ stateidx ].check;
1614 U32 used = trie->trans[ stateidx ].check;
1615 trie->trans[ stateidx ].check = 0;
1617 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1618 if ( flag || trie->trans[ stateidx + charid ].next ) {
1619 if ( trie->trans[ stateidx + charid ].next ) {
1621 for ( ; zp < pos ; zp++ ) {
1622 if ( ! trie->trans[ zp ].next ) {
1626 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1627 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1628 trie->trans[ zp ].check = state;
1629 if ( ++zp > pos ) pos = zp;
1636 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1638 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1639 trie->trans[ pos ].check = state;
1644 trie->lasttrans = pos + 1;
1645 Renew( trie->states, laststate + 1, reg_trie_state);
1646 DEBUG_TRIE_COMPILE_MORE_r(
1647 PerlIO_printf( Perl_debug_log,
1648 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1649 (int)depth * 2 + 2,"",
1650 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1653 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1656 } /* end table compress */
1658 /* resize the trans array to remove unused space */
1659 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1661 /* and now dump out the compressed format */
1662 DEBUG_TRIE_COMPILE_r(
1663 dump_trie(trie,depth+1)
1666 { /* Modify the program and insert the new TRIE node*/
1668 U8 nodetype =(U8)(flags & 0xFF);
1675 This means we convert either the first branch or the first Exact,
1676 depending on whether the thing following (in 'last') is a branch
1677 or not and whther first is the startbranch (ie is it a sub part of
1678 the alternation or is it the whole thing.)
1679 Assuming its a sub part we conver the EXACT otherwise we convert
1680 the whole branch sequence, including the first.
1682 /* Find the node we are going to overwrite */
1683 if ( first == startbranch && OP( last ) != BRANCH ) {
1684 /* whole branch chain */
1687 const regnode *nop = NEXTOPER( convert );
1688 mjd_offset= Node_Offset((nop));
1689 mjd_nodelen= Node_Length((nop));
1692 /* branch sub-chain */
1693 convert = NEXTOPER( first );
1694 NEXT_OFF( first ) = (U16)(last - first);
1696 mjd_offset= Node_Offset((convert));
1697 mjd_nodelen= Node_Length((convert));
1701 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1702 (int)depth * 2 + 2, "",
1703 mjd_offset,mjd_nodelen)
1706 /* But first we check to see if there is a common prefix we can
1707 split out as an EXACT and put in front of the TRIE node. */
1708 trie->startstate= 1;
1709 if ( trie->bitmap && !trie->widecharmap ) {
1712 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1713 (int)depth * 2 + 2, "",
1714 TRIE_LASTSTATE(trie))
1716 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1720 const U32 base = trie->states[ state ].trans.base;
1722 if ( trie->states[state].wordnum )
1725 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1726 if ( ( base + ofs >= trie->uniquecharcount ) &&
1727 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1728 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1730 if ( ++count > 1 ) {
1731 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1732 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1733 if ( state == 1 ) break;
1735 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1737 PerlIO_printf(Perl_debug_log,
1738 "%*sNew Start State=%"UVuf" Class: [",
1739 (int)depth * 2 + 2, "",
1742 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1743 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1745 TRIE_BITMAP_SET(trie,*ch);
1747 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1749 PerlIO_printf(Perl_debug_log, (char*)ch)
1753 TRIE_BITMAP_SET(trie,*ch);
1755 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1756 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1762 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1763 const char *ch = SvPV_nolen_const( *tmp );
1765 PerlIO_printf( Perl_debug_log,
1766 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1767 (int)depth * 2 + 2, "",
1771 OP( convert ) = nodetype;
1772 str=STRING(convert);
1781 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1787 regnode *n = convert+NODE_SZ_STR(convert);
1788 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1789 trie->startstate = state;
1790 trie->minlen -= (state - 1);
1791 trie->maxlen -= (state - 1);
1793 regnode *fix = convert;
1795 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1796 while( ++fix < n ) {
1797 Set_Node_Offset_Length(fix, 0, 0);
1803 NEXT_OFF(convert) = (U16)(tail - convert);
1807 if ( trie->maxlen ) {
1808 OP( convert ) = TRIE;
1809 NEXT_OFF( convert ) = (U16)(tail - convert);
1810 ARG_SET( convert, data_slot );
1812 /* store the type in the flags */
1813 convert->flags = nodetype;
1814 /* XXX We really should free up the resource in trie now, as we wont use them */
1816 /* needed for dumping*/
1818 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1819 regnode *opt = convert;
1820 while (++opt<optimize) {
1821 Set_Node_Offset_Length(opt,0,0);
1823 /* We now need to mark all of the space originally used by the
1824 branches as optimized away. This keeps the dumpuntil from
1825 throwing a wobbly as it doesnt use regnext() to traverse the
1827 We also "fix" the offsets
1829 while( optimize < last ) {
1830 mjd_nodelen += Node_Length((optimize));
1831 OP( optimize ) = OPTIMIZED;
1832 Set_Node_Offset_Length(optimize,0,0);
1835 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1837 } /* end node insert */
1839 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1845 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1846 * These need to be revisited when a newer toolchain becomes available.
1848 #if defined(__sparc64__) && defined(__GNUC__)
1849 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1850 # undef SPARC64_GCC_WORKAROUND
1851 # define SPARC64_GCC_WORKAROUND 1
1855 #define DEBUG_PEEP(str,scan,depth) \
1856 DEBUG_OPTIMISE_r({ \
1857 SV * const mysv=sv_newmortal(); \
1858 regnode *Next = regnext(scan); \
1859 regprop(RExC_rx, mysv, scan); \
1860 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1861 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1862 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1865 #define JOIN_EXACT(scan,min,flags) \
1866 if (PL_regkind[OP(scan)] == EXACT) \
1867 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1870 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1871 /* Merge several consecutive EXACTish nodes into one. */
1872 regnode *n = regnext(scan);
1874 regnode *next = scan + NODE_SZ_STR(scan);
1878 regnode *stop = scan;
1879 GET_RE_DEBUG_FLAGS_DECL;
1881 PERL_UNUSED_ARG(depth);
1883 #ifndef EXPERIMENTAL_INPLACESCAN
1884 PERL_UNUSED_ARG(flags);
1885 PERL_UNUSED_ARG(val);
1887 DEBUG_PEEP("join",scan,depth);
1889 /* Skip NOTHING, merge EXACT*. */
1891 ( PL_regkind[OP(n)] == NOTHING ||
1892 (stringok && (OP(n) == OP(scan))))
1894 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1896 if (OP(n) == TAIL || n > next)
1898 if (PL_regkind[OP(n)] == NOTHING) {
1899 DEBUG_PEEP("skip:",n,depth);
1900 NEXT_OFF(scan) += NEXT_OFF(n);
1901 next = n + NODE_STEP_REGNODE;
1908 else if (stringok) {
1909 const int oldl = STR_LEN(scan);
1910 regnode * const nnext = regnext(n);
1912 DEBUG_PEEP("merg",n,depth);
1915 if (oldl + STR_LEN(n) > U8_MAX)
1917 NEXT_OFF(scan) += NEXT_OFF(n);
1918 STR_LEN(scan) += STR_LEN(n);
1919 next = n + NODE_SZ_STR(n);
1920 /* Now we can overwrite *n : */
1921 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1929 #ifdef EXPERIMENTAL_INPLACESCAN
1930 if (flags && !NEXT_OFF(n)) {
1931 DEBUG_PEEP("atch", val, depth);
1932 if (reg_off_by_arg[OP(n)]) {
1933 ARG_SET(n, val - n);
1936 NEXT_OFF(n) = val - n;
1943 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1945 Two problematic code points in Unicode casefolding of EXACT nodes:
1947 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1948 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1954 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1955 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1957 This means that in case-insensitive matching (or "loose matching",
1958 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1959 length of the above casefolded versions) can match a target string
1960 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1961 This would rather mess up the minimum length computation.
1963 What we'll do is to look for the tail four bytes, and then peek
1964 at the preceding two bytes to see whether we need to decrease
1965 the minimum length by four (six minus two).
1967 Thanks to the design of UTF-8, there cannot be false matches:
1968 A sequence of valid UTF-8 bytes cannot be a subsequence of
1969 another valid sequence of UTF-8 bytes.
1972 char * const s0 = STRING(scan), *s, *t;
1973 char * const s1 = s0 + STR_LEN(scan) - 1;
1974 char * const s2 = s1 - 4;
1975 const char t0[] = "\xcc\x88\xcc\x81";
1976 const char * const t1 = t0 + 3;
1979 s < s2 && (t = ninstr(s, s1, t0, t1));
1981 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1982 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1989 n = scan + NODE_SZ_STR(scan);
1991 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1998 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2002 /* REx optimizer. Converts nodes into quickier variants "in place".
2003 Finds fixed substrings. */
2005 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2006 to the position after last scanned or to NULL. */
2011 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2012 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2013 /* scanp: Start here (read-write). */
2014 /* deltap: Write maxlen-minlen here. */
2015 /* last: Stop before this one. */
2018 I32 min = 0, pars = 0, code;
2019 regnode *scan = *scanp, *next;
2021 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2022 int is_inf_internal = 0; /* The studied chunk is infinite */
2023 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2024 scan_data_t data_fake;
2025 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2026 SV *re_trie_maxbuff = NULL;
2028 GET_RE_DEBUG_FLAGS_DECL;
2030 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2033 while (scan && OP(scan) != END && scan < last) {
2034 /* Peephole optimizer: */
2035 DEBUG_PEEP("Peep",scan,depth);
2037 JOIN_EXACT(scan,&min,0);
2039 /* Follow the next-chain of the current node and optimize
2040 away all the NOTHINGs from it. */
2041 if (OP(scan) != CURLYX) {
2042 const int max = (reg_off_by_arg[OP(scan)]
2044 /* I32 may be smaller than U16 on CRAYs! */
2045 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2046 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2050 /* Skip NOTHING and LONGJMP. */
2051 while ((n = regnext(n))
2052 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2053 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2054 && off + noff < max)
2056 if (reg_off_by_arg[OP(scan)])
2059 NEXT_OFF(scan) = off;
2064 /* The principal pseudo-switch. Cannot be a switch, since we
2065 look into several different things. */
2066 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2067 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2068 next = regnext(scan);
2070 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2072 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2073 I32 max1 = 0, min1 = I32_MAX, num = 0;
2074 struct regnode_charclass_class accum;
2075 regnode * const startbranch=scan;
2077 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2078 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2079 if (flags & SCF_DO_STCLASS)
2080 cl_init_zero(pRExC_state, &accum);
2082 while (OP(scan) == code) {
2083 I32 deltanext, minnext, f = 0, fake;
2084 struct regnode_charclass_class this_class;
2087 data_fake.flags = 0;
2089 data_fake.whilem_c = data->whilem_c;
2090 data_fake.last_closep = data->last_closep;
2093 data_fake.last_closep = &fake;
2094 next = regnext(scan);
2095 scan = NEXTOPER(scan);
2097 scan = NEXTOPER(scan);
2098 if (flags & SCF_DO_STCLASS) {
2099 cl_init(pRExC_state, &this_class);
2100 data_fake.start_class = &this_class;
2101 f = SCF_DO_STCLASS_AND;
2103 if (flags & SCF_WHILEM_VISITED_POS)
2104 f |= SCF_WHILEM_VISITED_POS;
2106 /* we suppose the run is continuous, last=next...*/
2107 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2108 next, &data_fake, f,depth+1);
2111 if (max1 < minnext + deltanext)
2112 max1 = minnext + deltanext;
2113 if (deltanext == I32_MAX)
2114 is_inf = is_inf_internal = 1;
2116 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2119 if (data_fake.flags & SF_HAS_EVAL)
2120 data->flags |= SF_HAS_EVAL;
2121 data->whilem_c = data_fake.whilem_c;
2123 if (flags & SCF_DO_STCLASS)
2124 cl_or(pRExC_state, &accum, &this_class);
2125 if (code == SUSPEND)
2128 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2130 if (flags & SCF_DO_SUBSTR) {
2131 data->pos_min += min1;
2132 data->pos_delta += max1 - min1;
2133 if (max1 != min1 || is_inf)
2134 data->longest = &(data->longest_float);
2137 delta += max1 - min1;
2138 if (flags & SCF_DO_STCLASS_OR) {
2139 cl_or(pRExC_state, data->start_class, &accum);
2141 cl_and(data->start_class, &and_with);
2142 flags &= ~SCF_DO_STCLASS;
2145 else if (flags & SCF_DO_STCLASS_AND) {
2147 cl_and(data->start_class, &accum);
2148 flags &= ~SCF_DO_STCLASS;
2151 /* Switch to OR mode: cache the old value of
2152 * data->start_class */
2153 StructCopy(data->start_class, &and_with,
2154 struct regnode_charclass_class);
2155 flags &= ~SCF_DO_STCLASS_AND;
2156 StructCopy(&accum, data->start_class,
2157 struct regnode_charclass_class);
2158 flags |= SCF_DO_STCLASS_OR;
2159 data->start_class->flags |= ANYOF_EOS;
2165 Assuming this was/is a branch we are dealing with: 'scan' now
2166 points at the item that follows the branch sequence, whatever
2167 it is. We now start at the beginning of the sequence and look
2173 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2175 If we can find such a subseqence we need to turn the first
2176 element into a trie and then add the subsequent branch exact
2177 strings to the trie.
2181 1. patterns where the whole set of branch can be converted to a trie,
2183 2. patterns where only a subset of the alternations can be
2184 converted to a trie.
2186 In case 1 we can replace the whole set with a single regop
2187 for the trie. In case 2 we need to keep the start and end
2190 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2191 becomes BRANCH TRIE; BRANCH X;
2193 Hypthetically when we know the regex isnt anchored we can
2194 turn a case 1 into a DFA and let it rip... Every time it finds a match
2195 it would just call its tail, no WHILEM/CURLY needed.
2198 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2200 if (!re_trie_maxbuff) {
2201 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2202 if (!SvIOK(re_trie_maxbuff))
2203 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2205 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2207 regnode *first = (regnode *)NULL;
2208 regnode *last = (regnode *)NULL;
2209 regnode *tail = scan;
2214 SV * const mysv = sv_newmortal(); /* for dumping */
2216 /* var tail is used because there may be a TAIL
2217 regop in the way. Ie, the exacts will point to the
2218 thing following the TAIL, but the last branch will
2219 point at the TAIL. So we advance tail. If we
2220 have nested (?:) we may have to move through several
2224 while ( OP( tail ) == TAIL ) {
2225 /* this is the TAIL generated by (?:) */
2226 tail = regnext( tail );
2231 regprop(RExC_rx, mysv, tail );
2232 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2233 (int)depth * 2 + 2, "",
2234 "Looking for TRIE'able sequences. Tail node is: ",
2235 SvPV_nolen_const( mysv )
2241 step through the branches, cur represents each
2242 branch, noper is the first thing to be matched
2243 as part of that branch and noper_next is the
2244 regnext() of that node. if noper is an EXACT
2245 and noper_next is the same as scan (our current
2246 position in the regex) then the EXACT branch is
2247 a possible optimization target. Once we have
2248 two or more consequetive such branches we can
2249 create a trie of the EXACT's contents and stich
2250 it in place. If the sequence represents all of
2251 the branches we eliminate the whole thing and
2252 replace it with a single TRIE. If it is a
2253 subsequence then we need to stitch it in. This
2254 means the first branch has to remain, and needs
2255 to be repointed at the item on the branch chain
2256 following the last branch optimized. This could
2257 be either a BRANCH, in which case the
2258 subsequence is internal, or it could be the
2259 item following the branch sequence in which
2260 case the subsequence is at the end.
2264 /* dont use tail as the end marker for this traverse */
2265 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2266 regnode * const noper = NEXTOPER( cur );
2267 regnode * const noper_next = regnext( noper );
2270 regprop(RExC_rx, mysv, cur);
2271 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2272 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2274 regprop(RExC_rx, mysv, noper);
2275 PerlIO_printf( Perl_debug_log, " -> %s",
2276 SvPV_nolen_const(mysv));
2279 regprop(RExC_rx, mysv, noper_next );
2280 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2281 SvPV_nolen_const(mysv));
2283 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2284 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2286 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2287 : PL_regkind[ OP( noper ) ] == EXACT )
2288 || OP(noper) == NOTHING )
2289 && noper_next == tail && count<U16_MAX)
2292 if ( !first || optype == NOTHING ) {
2293 if (!first) first = cur;
2294 optype = OP( noper );
2300 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2302 if ( PL_regkind[ OP( noper ) ] == EXACT
2303 && noper_next == tail )
2307 optype = OP( noper );
2317 regprop(RExC_rx, mysv, cur);
2318 PerlIO_printf( Perl_debug_log,
2319 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2320 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2324 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2325 #ifdef TRIE_STUDY_OPT
2326 if ( made && startbranch == first ) {
2327 if ( OP(first)!=TRIE )
2328 flags |= SCF_EXACT_TRIE;
2330 regnode *chk=*scanp;
2331 while ( OP( chk ) == OPEN )
2332 chk = regnext( chk );
2334 flags |= SCF_EXACT_TRIE;
2343 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2344 scan = NEXTOPER(NEXTOPER(scan));
2345 } else /* single branch is optimized. */
2346 scan = NEXTOPER(scan);
2349 else if (OP(scan) == EXACT) {
2350 I32 l = STR_LEN(scan);
2353 const U8 * const s = (U8*)STRING(scan);
2354 l = utf8_length(s, s + l);
2355 uc = utf8_to_uvchr(s, NULL);
2357 uc = *((U8*)STRING(scan));
2360 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2361 /* The code below prefers earlier match for fixed
2362 offset, later match for variable offset. */
2363 if (data->last_end == -1) { /* Update the start info. */
2364 data->last_start_min = data->pos_min;
2365 data->last_start_max = is_inf
2366 ? I32_MAX : data->pos_min + data->pos_delta;
2368 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2370 SvUTF8_on(data->last_found);
2372 SV * const sv = data->last_found;
2373 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2374 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2375 if (mg && mg->mg_len >= 0)
2376 mg->mg_len += utf8_length((U8*)STRING(scan),
2377 (U8*)STRING(scan)+STR_LEN(scan));
2379 data->last_end = data->pos_min + l;
2380 data->pos_min += l; /* As in the first entry. */
2381 data->flags &= ~SF_BEFORE_EOL;
2383 if (flags & SCF_DO_STCLASS_AND) {
2384 /* Check whether it is compatible with what we know already! */
2388 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2389 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2390 && (!(data->start_class->flags & ANYOF_FOLD)
2391 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2394 ANYOF_CLASS_ZERO(data->start_class);
2395 ANYOF_BITMAP_ZERO(data->start_class);
2397 ANYOF_BITMAP_SET(data->start_class, uc);
2398 data->start_class->flags &= ~ANYOF_EOS;
2400 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2402 else if (flags & SCF_DO_STCLASS_OR) {
2403 /* false positive possible if the class is case-folded */
2405 ANYOF_BITMAP_SET(data->start_class, uc);
2407 data->start_class->flags |= ANYOF_UNICODE_ALL;
2408 data->start_class->flags &= ~ANYOF_EOS;
2409 cl_and(data->start_class, &and_with);
2411 flags &= ~SCF_DO_STCLASS;
2413 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2414 I32 l = STR_LEN(scan);
2415 UV uc = *((U8*)STRING(scan));
2417 /* Search for fixed substrings supports EXACT only. */
2418 if (flags & SCF_DO_SUBSTR) {
2420 scan_commit(pRExC_state, data);
2423 const U8 * const s = (U8 *)STRING(scan);
2424 l = utf8_length(s, s + l);
2425 uc = utf8_to_uvchr(s, NULL);
2428 if (flags & SCF_DO_SUBSTR)
2430 if (flags & SCF_DO_STCLASS_AND) {
2431 /* Check whether it is compatible with what we know already! */
2435 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2436 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2437 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2439 ANYOF_CLASS_ZERO(data->start_class);
2440 ANYOF_BITMAP_ZERO(data->start_class);
2442 ANYOF_BITMAP_SET(data->start_class, uc);
2443 data->start_class->flags &= ~ANYOF_EOS;
2444 data->start_class->flags |= ANYOF_FOLD;
2445 if (OP(scan) == EXACTFL)
2446 data->start_class->flags |= ANYOF_LOCALE;
2449 else if (flags & SCF_DO_STCLASS_OR) {
2450 if (data->start_class->flags & ANYOF_FOLD) {
2451 /* false positive possible if the class is case-folded.
2452 Assume that the locale settings are the same... */
2454 ANYOF_BITMAP_SET(data->start_class, uc);
2455 data->start_class->flags &= ~ANYOF_EOS;
2457 cl_and(data->start_class, &and_with);
2459 flags &= ~SCF_DO_STCLASS;
2461 #ifdef TRIE_STUDY_OPT
2462 else if (OP(scan) == TRIE) {
2463 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2464 min += trie->minlen;
2465 delta += (trie->maxlen - trie->minlen);
2466 flags &= ~SCF_DO_STCLASS; /* xxx */
2467 if (flags & SCF_DO_SUBSTR) {
2468 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2469 data->pos_min += trie->minlen;
2470 data->pos_delta += (trie->maxlen - trie->minlen);
2471 if (trie->maxlen != trie->minlen)
2472 data->longest = &(data->longest_float);
2476 else if (strchr((const char*)PL_varies,OP(scan))) {
2477 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2478 I32 f = flags, pos_before = 0;
2479 regnode * const oscan = scan;
2480 struct regnode_charclass_class this_class;
2481 struct regnode_charclass_class *oclass = NULL;
2482 I32 next_is_eval = 0;
2484 switch (PL_regkind[OP(scan)]) {
2485 case WHILEM: /* End of (?:...)* . */
2486 scan = NEXTOPER(scan);
2489 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2490 next = NEXTOPER(scan);
2491 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2493 maxcount = REG_INFTY;
2494 next = regnext(scan);
2495 scan = NEXTOPER(scan);
2499 if (flags & SCF_DO_SUBSTR)
2504 if (flags & SCF_DO_STCLASS) {
2506 maxcount = REG_INFTY;
2507 next = regnext(scan);
2508 scan = NEXTOPER(scan);
2511 is_inf = is_inf_internal = 1;
2512 scan = regnext(scan);
2513 if (flags & SCF_DO_SUBSTR) {
2514 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2515 data->longest = &(data->longest_float);
2517 goto optimize_curly_tail;
2519 mincount = ARG1(scan);
2520 maxcount = ARG2(scan);
2521 next = regnext(scan);
2522 if (OP(scan) == CURLYX) {
2523 I32 lp = (data ? *(data->last_closep) : 0);
2524 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2526 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2527 next_is_eval = (OP(scan) == EVAL);
2529 if (flags & SCF_DO_SUBSTR) {
2530 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2531 pos_before = data->pos_min;
2535 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2537 data->flags |= SF_IS_INF;
2539 if (flags & SCF_DO_STCLASS) {
2540 cl_init(pRExC_state, &this_class);
2541 oclass = data->start_class;
2542 data->start_class = &this_class;
2543 f |= SCF_DO_STCLASS_AND;
2544 f &= ~SCF_DO_STCLASS_OR;
2546 /* These are the cases when once a subexpression
2547 fails at a particular position, it cannot succeed
2548 even after backtracking at the enclosing scope.
2550 XXXX what if minimal match and we are at the
2551 initial run of {n,m}? */
2552 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2553 f &= ~SCF_WHILEM_VISITED_POS;
2555 /* This will finish on WHILEM, setting scan, or on NULL: */
2556 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2558 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2560 if (flags & SCF_DO_STCLASS)
2561 data->start_class = oclass;
2562 if (mincount == 0 || minnext == 0) {
2563 if (flags & SCF_DO_STCLASS_OR) {
2564 cl_or(pRExC_state, data->start_class, &this_class);
2566 else if (flags & SCF_DO_STCLASS_AND) {
2567 /* Switch to OR mode: cache the old value of
2568 * data->start_class */
2569 StructCopy(data->start_class, &and_with,
2570 struct regnode_charclass_class);
2571 flags &= ~SCF_DO_STCLASS_AND;
2572 StructCopy(&this_class, data->start_class,
2573 struct regnode_charclass_class);
2574 flags |= SCF_DO_STCLASS_OR;
2575 data->start_class->flags |= ANYOF_EOS;
2577 } else { /* Non-zero len */
2578 if (flags & SCF_DO_STCLASS_OR) {
2579 cl_or(pRExC_state, data->start_class, &this_class);
2580 cl_and(data->start_class, &and_with);
2582 else if (flags & SCF_DO_STCLASS_AND)
2583 cl_and(data->start_class, &this_class);
2584 flags &= ~SCF_DO_STCLASS;
2586 if (!scan) /* It was not CURLYX, but CURLY. */
2588 if ( /* ? quantifier ok, except for (?{ ... }) */
2589 (next_is_eval || !(mincount == 0 && maxcount == 1))
2590 && (minnext == 0) && (deltanext == 0)
2591 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2592 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2593 && ckWARN(WARN_REGEXP))
2596 "Quantifier unexpected on zero-length expression");
2599 min += minnext * mincount;
2600 is_inf_internal |= ((maxcount == REG_INFTY
2601 && (minnext + deltanext) > 0)
2602 || deltanext == I32_MAX);
2603 is_inf |= is_inf_internal;
2604 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2606 /* Try powerful optimization CURLYX => CURLYN. */
2607 if ( OP(oscan) == CURLYX && data
2608 && data->flags & SF_IN_PAR
2609 && !(data->flags & SF_HAS_EVAL)
2610 && !deltanext && minnext == 1 ) {
2611 /* Try to optimize to CURLYN. */
2612 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2613 regnode * const nxt1 = nxt;
2620 if (!strchr((const char*)PL_simple,OP(nxt))
2621 && !(PL_regkind[OP(nxt)] == EXACT
2622 && STR_LEN(nxt) == 1))
2628 if (OP(nxt) != CLOSE)
2630 /* Now we know that nxt2 is the only contents: */
2631 oscan->flags = (U8)ARG(nxt);
2633 OP(nxt1) = NOTHING; /* was OPEN. */
2635 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2636 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2637 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2638 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2639 OP(nxt + 1) = OPTIMIZED; /* was count. */
2640 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2645 /* Try optimization CURLYX => CURLYM. */
2646 if ( OP(oscan) == CURLYX && data
2647 && !(data->flags & SF_HAS_PAR)
2648 && !(data->flags & SF_HAS_EVAL)
2649 && !deltanext /* atom is fixed width */
2650 && minnext != 0 /* CURLYM can't handle zero width */
2652 /* XXXX How to optimize if data == 0? */
2653 /* Optimize to a simpler form. */
2654 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2658 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2659 && (OP(nxt2) != WHILEM))
2661 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2662 /* Need to optimize away parenths. */
2663 if (data->flags & SF_IN_PAR) {
2664 /* Set the parenth number. */
2665 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2667 if (OP(nxt) != CLOSE)
2668 FAIL("Panic opt close");
2669 oscan->flags = (U8)ARG(nxt);
2670 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2671 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2673 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2674 OP(nxt + 1) = OPTIMIZED; /* was count. */
2675 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2676 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2679 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2680 regnode *nnxt = regnext(nxt1);
2683 if (reg_off_by_arg[OP(nxt1)])
2684 ARG_SET(nxt1, nxt2 - nxt1);
2685 else if (nxt2 - nxt1 < U16_MAX)
2686 NEXT_OFF(nxt1) = nxt2 - nxt1;
2688 OP(nxt) = NOTHING; /* Cannot beautify */
2693 /* Optimize again: */
2694 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2700 else if ((OP(oscan) == CURLYX)
2701 && (flags & SCF_WHILEM_VISITED_POS)
2702 /* See the comment on a similar expression above.
2703 However, this time it not a subexpression
2704 we care about, but the expression itself. */
2705 && (maxcount == REG_INFTY)
2706 && data && ++data->whilem_c < 16) {
2707 /* This stays as CURLYX, we can put the count/of pair. */
2708 /* Find WHILEM (as in regexec.c) */
2709 regnode *nxt = oscan + NEXT_OFF(oscan);
2711 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2713 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2714 | (RExC_whilem_seen << 4)); /* On WHILEM */
2716 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2718 if (flags & SCF_DO_SUBSTR) {
2719 SV *last_str = NULL;
2720 int counted = mincount != 0;
2722 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2723 #if defined(SPARC64_GCC_WORKAROUND)
2726 const char *s = NULL;
2729 if (pos_before >= data->last_start_min)
2732 b = data->last_start_min;
2735 s = SvPV_const(data->last_found, l);
2736 old = b - data->last_start_min;
2739 I32 b = pos_before >= data->last_start_min
2740 ? pos_before : data->last_start_min;
2742 const char * const s = SvPV_const(data->last_found, l);
2743 I32 old = b - data->last_start_min;
2747 old = utf8_hop((U8*)s, old) - (U8*)s;
2750 /* Get the added string: */
2751 last_str = newSVpvn(s + old, l);
2753 SvUTF8_on(last_str);
2754 if (deltanext == 0 && pos_before == b) {
2755 /* What was added is a constant string */
2757 SvGROW(last_str, (mincount * l) + 1);
2758 repeatcpy(SvPVX(last_str) + l,
2759 SvPVX_const(last_str), l, mincount - 1);
2760 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2761 /* Add additional parts. */
2762 SvCUR_set(data->last_found,
2763 SvCUR(data->last_found) - l);
2764 sv_catsv(data->last_found, last_str);
2766 SV * sv = data->last_found;
2768 SvUTF8(sv) && SvMAGICAL(sv) ?
2769 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2770 if (mg && mg->mg_len >= 0)
2771 mg->mg_len += CHR_SVLEN(last_str);
2773 data->last_end += l * (mincount - 1);
2776 /* start offset must point into the last copy */
2777 data->last_start_min += minnext * (mincount - 1);
2778 data->last_start_max += is_inf ? I32_MAX
2779 : (maxcount - 1) * (minnext + data->pos_delta);
2782 /* It is counted once already... */
2783 data->pos_min += minnext * (mincount - counted);
2784 data->pos_delta += - counted * deltanext +
2785 (minnext + deltanext) * maxcount - minnext * mincount;
2786 if (mincount != maxcount) {
2787 /* Cannot extend fixed substrings found inside
2789 scan_commit(pRExC_state,data);
2790 if (mincount && last_str) {
2791 SV * const sv = data->last_found;
2792 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2793 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2797 sv_setsv(sv, last_str);
2798 data->last_end = data->pos_min;
2799 data->last_start_min =
2800 data->pos_min - CHR_SVLEN(last_str);
2801 data->last_start_max = is_inf
2803 : data->pos_min + data->pos_delta
2804 - CHR_SVLEN(last_str);
2806 data->longest = &(data->longest_float);
2808 SvREFCNT_dec(last_str);
2810 if (data && (fl & SF_HAS_EVAL))
2811 data->flags |= SF_HAS_EVAL;
2812 optimize_curly_tail:
2813 if (OP(oscan) != CURLYX) {
2814 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2816 NEXT_OFF(oscan) += NEXT_OFF(next);
2819 default: /* REF and CLUMP only? */
2820 if (flags & SCF_DO_SUBSTR) {
2821 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2822 data->longest = &(data->longest_float);
2824 is_inf = is_inf_internal = 1;
2825 if (flags & SCF_DO_STCLASS_OR)
2826 cl_anything(pRExC_state, data->start_class);
2827 flags &= ~SCF_DO_STCLASS;
2831 else if (strchr((const char*)PL_simple,OP(scan))) {
2834 if (flags & SCF_DO_SUBSTR) {
2835 scan_commit(pRExC_state,data);
2839 if (flags & SCF_DO_STCLASS) {
2840 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2842 /* Some of the logic below assumes that switching
2843 locale on will only add false positives. */
2844 switch (PL_regkind[OP(scan)]) {
2848 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2849 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2850 cl_anything(pRExC_state, data->start_class);
2853 if (OP(scan) == SANY)
2855 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2856 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2857 || (data->start_class->flags & ANYOF_CLASS));
2858 cl_anything(pRExC_state, data->start_class);
2860 if (flags & SCF_DO_STCLASS_AND || !value)
2861 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2864 if (flags & SCF_DO_STCLASS_AND)
2865 cl_and(data->start_class,
2866 (struct regnode_charclass_class*)scan);
2868 cl_or(pRExC_state, data->start_class,
2869 (struct regnode_charclass_class*)scan);
2872 if (flags & SCF_DO_STCLASS_AND) {
2873 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2874 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2875 for (value = 0; value < 256; value++)
2876 if (!isALNUM(value))
2877 ANYOF_BITMAP_CLEAR(data->start_class, value);
2881 if (data->start_class->flags & ANYOF_LOCALE)
2882 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2884 for (value = 0; value < 256; value++)
2886 ANYOF_BITMAP_SET(data->start_class, value);
2891 if (flags & SCF_DO_STCLASS_AND) {
2892 if (data->start_class->flags & ANYOF_LOCALE)
2893 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2896 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2897 data->start_class->flags |= ANYOF_LOCALE;
2901 if (flags & SCF_DO_STCLASS_AND) {
2902 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2903 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2904 for (value = 0; value < 256; value++)
2906 ANYOF_BITMAP_CLEAR(data->start_class, value);
2910 if (data->start_class->flags & ANYOF_LOCALE)
2911 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2913 for (value = 0; value < 256; value++)
2914 if (!isALNUM(value))
2915 ANYOF_BITMAP_SET(data->start_class, value);
2920 if (flags & SCF_DO_STCLASS_AND) {
2921 if (data->start_class->flags & ANYOF_LOCALE)
2922 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2925 data->start_class->flags |= ANYOF_LOCALE;
2926 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2930 if (flags & SCF_DO_STCLASS_AND) {
2931 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2932 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2933 for (value = 0; value < 256; value++)
2934 if (!isSPACE(value))
2935 ANYOF_BITMAP_CLEAR(data->start_class, value);
2939 if (data->start_class->flags & ANYOF_LOCALE)
2940 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2942 for (value = 0; value < 256; value++)
2944 ANYOF_BITMAP_SET(data->start_class, value);
2949 if (flags & SCF_DO_STCLASS_AND) {
2950 if (data->start_class->flags & ANYOF_LOCALE)
2951 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2954 data->start_class->flags |= ANYOF_LOCALE;
2955 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2959 if (flags & SCF_DO_STCLASS_AND) {
2960 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2961 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2962 for (value = 0; value < 256; value++)
2964 ANYOF_BITMAP_CLEAR(data->start_class, value);
2968 if (data->start_class->flags & ANYOF_LOCALE)
2969 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2971 for (value = 0; value < 256; value++)
2972 if (!isSPACE(value))
2973 ANYOF_BITMAP_SET(data->start_class, value);
2978 if (flags & SCF_DO_STCLASS_AND) {
2979 if (data->start_class->flags & ANYOF_LOCALE) {
2980 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2981 for (value = 0; value < 256; value++)
2982 if (!isSPACE(value))
2983 ANYOF_BITMAP_CLEAR(data->start_class, value);
2987 data->start_class->flags |= ANYOF_LOCALE;
2988 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2992 if (flags & SCF_DO_STCLASS_AND) {
2993 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2994 for (value = 0; value < 256; value++)
2995 if (!isDIGIT(value))
2996 ANYOF_BITMAP_CLEAR(data->start_class, value);
2999 if (data->start_class->flags & ANYOF_LOCALE)
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3002 for (value = 0; value < 256; value++)
3004 ANYOF_BITMAP_SET(data->start_class, value);
3009 if (flags & SCF_DO_STCLASS_AND) {
3010 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3011 for (value = 0; value < 256; value++)
3013 ANYOF_BITMAP_CLEAR(data->start_class, value);
3016 if (data->start_class->flags & ANYOF_LOCALE)
3017 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3019 for (value = 0; value < 256; value++)
3020 if (!isDIGIT(value))
3021 ANYOF_BITMAP_SET(data->start_class, value);
3026 if (flags & SCF_DO_STCLASS_OR)
3027 cl_and(data->start_class, &and_with);
3028 flags &= ~SCF_DO_STCLASS;
3031 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3032 data->flags |= (OP(scan) == MEOL
3036 else if ( PL_regkind[OP(scan)] == BRANCHJ
3037 /* Lookbehind, or need to calculate parens/evals/stclass: */
3038 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3039 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3040 /* Lookahead/lookbehind */
3041 I32 deltanext, minnext, fake = 0;
3043 struct regnode_charclass_class intrnl;
3046 data_fake.flags = 0;
3048 data_fake.whilem_c = data->whilem_c;
3049 data_fake.last_closep = data->last_closep;
3052 data_fake.last_closep = &fake;
3053 if ( flags & SCF_DO_STCLASS && !scan->flags
3054 && OP(scan) == IFMATCH ) { /* Lookahead */
3055 cl_init(pRExC_state, &intrnl);
3056 data_fake.start_class = &intrnl;
3057 f |= SCF_DO_STCLASS_AND;
3059 if (flags & SCF_WHILEM_VISITED_POS)
3060 f |= SCF_WHILEM_VISITED_POS;
3061 next = regnext(scan);
3062 nscan = NEXTOPER(NEXTOPER(scan));
3063 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3066 vFAIL("Variable length lookbehind not implemented");
3068 else if (minnext > U8_MAX) {
3069 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3071 scan->flags = (U8)minnext;
3074 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3076 if (data_fake.flags & SF_HAS_EVAL)
3077 data->flags |= SF_HAS_EVAL;
3078 data->whilem_c = data_fake.whilem_c;
3080 if (f & SCF_DO_STCLASS_AND) {
3081 const int was = (data->start_class->flags & ANYOF_EOS);
3083 cl_and(data->start_class, &intrnl);
3085 data->start_class->flags |= ANYOF_EOS;
3088 else if (OP(scan) == OPEN) {
3091 else if (OP(scan) == CLOSE) {
3092 if ((I32)ARG(scan) == is_par) {
3093 next = regnext(scan);
3095 if ( next && (OP(next) != WHILEM) && next < last)
3096 is_par = 0; /* Disable optimization */
3099 *(data->last_closep) = ARG(scan);
3101 else if (OP(scan) == EVAL) {
3103 data->flags |= SF_HAS_EVAL;
3105 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3106 if (flags & SCF_DO_SUBSTR) {
3107 scan_commit(pRExC_state,data);
3108 data->longest = &(data->longest_float);
3110 is_inf = is_inf_internal = 1;
3111 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3112 cl_anything(pRExC_state, data->start_class);
3113 flags &= ~SCF_DO_STCLASS;
3115 /* Else: zero-length, ignore. */
3116 scan = regnext(scan);
3121 *deltap = is_inf_internal ? I32_MAX : delta;
3122 if (flags & SCF_DO_SUBSTR && is_inf)
3123 data->pos_delta = I32_MAX - data->pos_min;
3124 if (is_par > U8_MAX)
3126 if (is_par && pars==1 && data) {
3127 data->flags |= SF_IN_PAR;
3128 data->flags &= ~SF_HAS_PAR;
3130 else if (pars && data) {
3131 data->flags |= SF_HAS_PAR;
3132 data->flags &= ~SF_IN_PAR;
3134 if (flags & SCF_DO_STCLASS_OR)
3135 cl_and(data->start_class, &and_with);
3136 if (flags & SCF_EXACT_TRIE)
3137 data->flags |= SCF_EXACT_TRIE;
3142 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3144 if (RExC_rx->data) {
3145 Renewc(RExC_rx->data,
3146 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3147 char, struct reg_data);
3148 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3149 RExC_rx->data->count += n;
3152 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3153 char, struct reg_data);
3154 Newx(RExC_rx->data->what, n, U8);
3155 RExC_rx->data->count = n;
3157 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3158 return RExC_rx->data->count - n;
3161 #ifndef PERL_IN_XSUB_RE
3163 Perl_reginitcolors(pTHX)
3166 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3168 char *t = savepv(s);
3172 t = strchr(t, '\t');
3178 PL_colors[i] = t = (char *)"";
3183 PL_colors[i++] = (char *)"";
3191 - pregcomp - compile a regular expression into internal code
3193 * We can't allocate space until we know how big the compiled form will be,
3194 * but we can't compile it (and thus know how big it is) until we've got a
3195 * place to put the code. So we cheat: we compile it twice, once with code
3196 * generation turned off and size counting turned on, and once "for real".
3197 * This also means that we don't allocate space until we are sure that the
3198 * thing really will compile successfully, and we never have to move the
3199 * code and thus invalidate pointers into it. (Note that it has to be in
3200 * one piece because free() must be able to free it all.) [NB: not true in perl]
3202 * Beware that the optimization-preparation code in here knows about some
3203 * of the structure of the compiled regexp. [I'll say.]
3206 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3217 RExC_state_t RExC_state;
3218 RExC_state_t * const pRExC_state = &RExC_state;
3219 #ifdef TRIE_STUDY_OPT
3221 RExC_state_t copyRExC_state;
3224 GET_RE_DEBUG_FLAGS_DECL;
3227 FAIL("NULL regexp argument");
3229 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3232 DEBUG_r(if (!PL_colorset) reginitcolors());
3234 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3235 PL_colors[4],PL_colors[5],PL_colors[0],
3236 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3238 RExC_flags = pm->op_pmflags;
3242 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3243 RExC_seen_evals = 0;
3246 /* First pass: determine size, legality. */
3253 RExC_emit = &PL_regdummy;
3254 RExC_whilem_seen = 0;
3255 #if 0 /* REGC() is (currently) a NOP at the first pass.
3256 * Clever compilers notice this and complain. --jhi */
3257 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3259 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3260 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3261 RExC_precomp = NULL;
3264 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3265 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3266 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3269 RExC_lastparse=NULL;
3273 /* Small enough for pointer-storage convention?
3274 If extralen==0, this means that we will not need long jumps. */
3275 if (RExC_size >= 0x10000L && RExC_extralen)
3276 RExC_size += RExC_extralen;
3279 if (RExC_whilem_seen > 15)
3280 RExC_whilem_seen = 15;
3282 /* Allocate space and initialize. */
3283 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3286 FAIL("Regexp out of space");
3289 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3290 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3293 r->prelen = xend - exp;
3294 r->precomp = savepvn(RExC_precomp, r->prelen);
3296 #ifdef PERL_OLD_COPY_ON_WRITE
3297 r->saved_copy = NULL;
3299 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3300 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3301 r->lastparen = 0; /* mg.c reads this. */
3303 r->substrs = 0; /* Useful during FAIL. */
3304 r->startp = 0; /* Useful during FAIL. */
3305 r->endp = 0; /* Useful during FAIL. */
3307 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3309 r->offsets[0] = RExC_size;
3311 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3312 "%s %"UVuf" bytes for offset annotations.\n",
3313 r->offsets ? "Got" : "Couldn't get",
3314 (UV)((2*RExC_size+1) * sizeof(U32))));
3318 /* Second pass: emit code. */
3319 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3324 RExC_emit_start = r->program;
3325 RExC_emit = r->program;
3326 /* Store the count of eval-groups for security checks: */
3327 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3328 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3330 if (reg(pRExC_state, 0, &flags,1) == NULL)
3332 /* XXXX To minimize changes to RE engine we always allocate
3333 3-units-long substrs field. */
3334 Newx(r->substrs, 1, struct reg_substr_data);
3337 Zero(r->substrs, 1, struct reg_substr_data);
3338 StructCopy(&zero_scan_data, &data, scan_data_t);
3340 #ifdef TRIE_STUDY_OPT
3342 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3343 RExC_state=copyRExC_state;
3344 if (data.longest_fixed)
3345 SvREFCNT_dec(data.longest_fixed);
3346 if (data.longest_float)
3347 SvREFCNT_dec(data.longest_float);
3348 if (data.last_found)
3349 SvREFCNT_dec(data.last_found);
3351 copyRExC_state=RExC_state;
3354 /* Dig out information for optimizations. */
3355 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3356 pm->op_pmflags = RExC_flags;
3358 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3359 r->regstclass = NULL;
3360 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3361 r->reganch |= ROPT_NAUGHTY;
3362 scan = r->program + 1; /* First BRANCH. */
3364 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3365 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3367 STRLEN longest_float_length, longest_fixed_length;
3368 struct regnode_charclass_class ch_class; /* pointed to by data */
3370 I32 last_close = 0; /* pointed to by data */
3373 /* Skip introductions and multiplicators >= 1. */
3374 while ((OP(first) == OPEN && (sawopen = 1)) ||
3375 /* An OR of *one* alternative - should not happen now. */
3376 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3377 /* for now we can't handle lookbehind IFMATCH*/
3378 (OP(first) == IFMATCH && !first->flags) ||
3379 (OP(first) == PLUS) ||
3380 (OP(first) == MINMOD) ||
3381 /* An {n,m} with n>0 */
3382 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3384 DEBUG_PEEP("first:",first,0);
3385 if (OP(first) == PLUS)
3388 first += regarglen[OP(first)];
3389 if (OP(first) == IFMATCH) {
3390 first = NEXTOPER(first);
3391 first += EXTRA_STEP_2ARGS;
3392 } else /* XXX possible optimisation for /(?=)/ */
3393 first = NEXTOPER(first);
3396 /* Starting-point info. */
3398 /* Ignore EXACT as we deal with it later. */
3399 if (PL_regkind[OP(first)] == EXACT) {
3400 if (OP(first) == EXACT)
3401 NOOP; /* Empty, get anchored substr later. */
3402 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3403 r->regstclass = first;
3406 else if (OP(first) == TRIE &&
3407 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3409 /* this can happen only on restudy */
3410 struct regnode_1 *trie_op;
3411 Newxz(trie_op,1,struct regnode_1);
3412 StructCopy(first,trie_op,struct regnode_1);
3413 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3414 r->regstclass = (regnode *)trie_op;
3417 else if (strchr((const char*)PL_simple,OP(first)))
3418 r->regstclass = first;
3419 else if (PL_regkind[OP(first)] == BOUND ||
3420 PL_regkind[OP(first)] == NBOUND)
3421 r->regstclass = first;
3422 else if (PL_regkind[OP(first)] == BOL) {
3423 r->reganch |= (OP(first) == MBOL
3425 : (OP(first) == SBOL
3428 first = NEXTOPER(first);
3431 else if (OP(first) == GPOS) {
3432 r->reganch |= ROPT_ANCH_GPOS;
3433 first = NEXTOPER(first);
3436 else if (!sawopen && (OP(first) == STAR &&
3437 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3438 !(r->reganch & ROPT_ANCH) )
3440 /* turn .* into ^.* with an implied $*=1 */
3442 (OP(NEXTOPER(first)) == REG_ANY)
3445 r->reganch |= type | ROPT_IMPLICIT;
3446 first = NEXTOPER(first);
3449 if (sawplus && (!sawopen || !RExC_sawback)
3450 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3451 /* x+ must match at the 1st pos of run of x's */
3452 r->reganch |= ROPT_SKIP;
3454 /* Scan is after the zeroth branch, first is atomic matcher. */
3455 #ifdef TRIE_STUDY_OPT
3458 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3459 (IV)(first - scan + 1))
3463 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3464 (IV)(first - scan + 1))
3470 * If there's something expensive in the r.e., find the
3471 * longest literal string that must appear and make it the
3472 * regmust. Resolve ties in favor of later strings, since
3473 * the regstart check works with the beginning of the r.e.
3474 * and avoiding duplication strengthens checking. Not a
3475 * strong reason, but sufficient in the absence of others.
3476 * [Now we resolve ties in favor of the earlier string if
3477 * it happens that c_offset_min has been invalidated, since the
3478 * earlier string may buy us something the later one won't.]
3482 data.longest_fixed = newSVpvs("");
3483 data.longest_float = newSVpvs("");
3484 data.last_found = newSVpvs("");
3485 data.longest = &(data.longest_fixed);
3487 if (!r->regstclass) {
3488 cl_init(pRExC_state, &ch_class);
3489 data.start_class = &ch_class;
3490 stclass_flag = SCF_DO_STCLASS_AND;
3491 } else /* XXXX Check for BOUND? */
3493 data.last_closep = &last_close;
3495 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3496 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3498 #ifdef TRIE_STUDY_OPT
3499 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3504 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3505 && data.last_start_min == 0 && data.last_end > 0
3506 && !RExC_seen_zerolen
3507 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3508 r->reganch |= ROPT_CHECK_ALL;
3509 scan_commit(pRExC_state, &data);
3510 SvREFCNT_dec(data.last_found);
3512 longest_float_length = CHR_SVLEN(data.longest_float);
3513 if (longest_float_length
3514 || (data.flags & SF_FL_BEFORE_EOL
3515 && (!(data.flags & SF_FL_BEFORE_MEOL)
3516 || (RExC_flags & PMf_MULTILINE)))) {
3519 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3520 && data.offset_fixed == data.offset_float_min
3521 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3522 goto remove_float; /* As in (a)+. */
3524 if (SvUTF8(data.longest_float)) {
3525 r->float_utf8 = data.longest_float;
3526 r->float_substr = NULL;
3528 r->float_substr = data.longest_float;
3529 r->float_utf8 = NULL;
3531 r->float_min_offset = data.offset_float_min;
3532 r->float_max_offset = data.offset_float_max;
3533 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3534 && (!(data.flags & SF_FL_BEFORE_MEOL)
3535 || (RExC_flags & PMf_MULTILINE)));
3536 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3540 r->float_substr = r->float_utf8 = NULL;
3541 SvREFCNT_dec(data.longest_float);
3542 longest_float_length = 0;
3545 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3546 if (longest_fixed_length
3547 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3548 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3549 || (RExC_flags & PMf_MULTILINE)))) {
3552 if (SvUTF8(data.longest_fixed)) {
3553 r->anchored_utf8 = data.longest_fixed;
3554 r->anchored_substr = NULL;
3556 r->anchored_substr = data.longest_fixed;
3557 r->anchored_utf8 = NULL;
3559 r->anchored_offset = data.offset_fixed;
3560 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3561 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3562 || (RExC_flags & PMf_MULTILINE)));
3563 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3566 r->anchored_substr = r->anchored_utf8 = NULL;
3567 SvREFCNT_dec(data.longest_fixed);
3568 longest_fixed_length = 0;
3571 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3572 r->regstclass = NULL;
3573 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3575 && !(data.start_class->flags & ANYOF_EOS)
3576 && !cl_is_anything(data.start_class))
3578 const I32 n = add_data(pRExC_state, 1, "f");
3580 Newx(RExC_rx->data->data[n], 1,
3581 struct regnode_charclass_class);
3582 StructCopy(data.start_class,
3583 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3584 struct regnode_charclass_class);
3585 r->regstclass = (regnode*)RExC_rx->data->data[n];
3586 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3587 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3588 regprop(r, sv, (regnode*)data.start_class);
3589 PerlIO_printf(Perl_debug_log,
3590 "synthetic stclass \"%s\".\n",
3591 SvPVX_const(sv));});
3594 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3595 if (longest_fixed_length > longest_float_length) {
3596 r->check_substr = r->anchored_substr;
3597 r->check_utf8 = r->anchored_utf8;
3598 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3599 if (r->reganch & ROPT_ANCH_SINGLE)
3600 r->reganch |= ROPT_NOSCAN;
3603 r->check_substr = r->float_substr;
3604 r->check_utf8 = r->float_utf8;
3605 r->check_offset_min = data.offset_float_min;
3606 r->check_offset_max = data.offset_float_max;
3608 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3609 This should be changed ASAP! */
3610 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3611 r->reganch |= RE_USE_INTUIT;
3612 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3613 r->reganch |= RE_INTUIT_TAIL;
3617 /* Several toplevels. Best we can is to set minlen. */
3619 struct regnode_charclass_class ch_class;
3622 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3624 scan = r->program + 1;
3625 cl_init(pRExC_state, &ch_class);
3626 data.start_class = &ch_class;
3627 data.last_closep = &last_close;
3629 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3630 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3632 #ifdef TRIE_STUDY_OPT
3633 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3638 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3639 = r->float_substr = r->float_utf8 = NULL;
3640 if (!(data.start_class->flags & ANYOF_EOS)
3641 && !cl_is_anything(data.start_class))
3643 const I32 n = add_data(pRExC_state, 1, "f");
3645 Newx(RExC_rx->data->data[n], 1,
3646 struct regnode_charclass_class);
3647 StructCopy(data.start_class,
3648 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3649 struct regnode_charclass_class);
3650 r->regstclass = (regnode*)RExC_rx->data->data[n];
3651 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3652 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3653 regprop(r, sv, (regnode*)data.start_class);
3654 PerlIO_printf(Perl_debug_log,
3655 "synthetic stclass \"%s\".\n",
3656 SvPVX_const(sv));});
3661 if (RExC_seen & REG_SEEN_GPOS)
3662 r->reganch |= ROPT_GPOS_SEEN;
3663 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3664 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3665 if (RExC_seen & REG_SEEN_EVAL)
3666 r->reganch |= ROPT_EVAL_SEEN;
3667 if (RExC_seen & REG_SEEN_CANY)
3668 r->reganch |= ROPT_CANY_SEEN;
3669 Newxz(r->startp, RExC_npar, I32);
3670 Newxz(r->endp, RExC_npar, I32);
3672 DEBUG_r( RX_DEBUG_on(r) );
3674 PerlIO_printf(Perl_debug_log,"Final program:\n");
3677 DEBUG_OFFSETS_r(if (r->offsets) {
3678 const U32 len = r->offsets[0];
3680 GET_RE_DEBUG_FLAGS_DECL;
3681 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3682 for (i = 1; i <= len; i++) {
3683 if (r->offsets[i*2-1] || r->offsets[i*2])
3684 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3685 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3687 PerlIO_printf(Perl_debug_log, "\n");
3693 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3694 int rem=(int)(RExC_end - RExC_parse); \
3703 if (RExC_lastparse!=RExC_parse) \
3704 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3707 iscut ? "..." : "<" \
3710 PerlIO_printf(Perl_debug_log,"%16s",""); \
3715 num=REG_NODE_NUM(RExC_emit); \
3716 if (RExC_lastnum!=num) \
3717 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3719 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3720 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3721 (int)((depth*2)), "", \
3725 RExC_lastparse=RExC_parse; \
3730 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3731 DEBUG_PARSE_MSG((funcname)); \
3732 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3735 - reg - regular expression, i.e. main body or parenthesized thing
3737 * Caller must absorb opening parenthesis.
3739 * Combining parenthesis handling with the base level of regular expression
3740 * is a trifle forced, but the need to tie the tails of the branches to what
3741 * follows makes it hard to avoid.
3743 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3745 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3747 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3751 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3752 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3755 register regnode *ret; /* Will be the head of the group. */
3756 register regnode *br;
3757 register regnode *lastbr;
3758 register regnode *ender = NULL;
3759 register I32 parno = 0;
3761 const I32 oregflags = RExC_flags;
3762 bool have_branch = 0;
3765 /* for (?g), (?gc), and (?o) warnings; warning
3766 about (?c) will warn about (?g) -- japhy */
3768 #define WASTED_O 0x01
3769 #define WASTED_G 0x02
3770 #define WASTED_C 0x04
3771 #define WASTED_GC (0x02|0x04)
3772 I32 wastedflags = 0x00;
3774 char * parse_start = RExC_parse; /* MJD */
3775 char * const oregcomp_parse = RExC_parse;
3777 GET_RE_DEBUG_FLAGS_DECL;
3778 DEBUG_PARSE("reg ");
3781 *flagp = 0; /* Tentatively. */
3784 /* Make an OPEN node, if parenthesized. */
3786 if (*RExC_parse == '?') { /* (?...) */
3787 U32 posflags = 0, negflags = 0;
3788 U32 *flagsp = &posflags;
3789 bool is_logical = 0;
3790 const char * const seqstart = RExC_parse;
3793 paren = *RExC_parse++;
3794 ret = NULL; /* For look-ahead/behind. */
3796 case '<': /* (?<...) */
3797 RExC_seen |= REG_SEEN_LOOKBEHIND;
3798 if (*RExC_parse == '!')
3800 if (*RExC_parse != '=' && *RExC_parse != '!')
3803 case '=': /* (?=...) */
3804 case '!': /* (?!...) */
3805 RExC_seen_zerolen++;
3806 case ':': /* (?:...) */
3807 case '>': /* (?>...) */
3809 case '$': /* (?$...) */
3810 case '@': /* (?@...) */
3811 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3813 case '#': /* (?#...) */
3814 while (*RExC_parse && *RExC_parse != ')')
3816 if (*RExC_parse != ')')
3817 FAIL("Sequence (?#... not terminated");
3818 nextchar(pRExC_state);
3821 case 'p': /* (?p...) */
3822 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3823 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3825 case '?': /* (??...) */
3827 if (*RExC_parse != '{')
3829 paren = *RExC_parse++;
3831 case '{': /* (?{...}) */
3833 I32 count = 1, n = 0;
3835 char *s = RExC_parse;
3837 RExC_seen_zerolen++;
3838 RExC_seen |= REG_SEEN_EVAL;
3839 while (count && (c = *RExC_parse)) {
3850 if (*RExC_parse != ')') {
3852 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3856 OP_4tree *sop, *rop;
3857 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3860 Perl_save_re_context(aTHX);
3861 rop = sv_compile_2op(sv, &sop, "re", &pad);
3862 sop->op_private |= OPpREFCOUNTED;
3863 /* re_dup will OpREFCNT_inc */
3864 OpREFCNT_set(sop, 1);
3867 n = add_data(pRExC_state, 3, "nop");
3868 RExC_rx->data->data[n] = (void*)rop;
3869 RExC_rx->data->data[n+1] = (void*)sop;
3870 RExC_rx->data->data[n+2] = (void*)pad;
3873 else { /* First pass */
3874 if (PL_reginterp_cnt < ++RExC_seen_evals
3876 /* No compiled RE interpolated, has runtime
3877 components ===> unsafe. */
3878 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3879 if (PL_tainting && PL_tainted)
3880 FAIL("Eval-group in insecure regular expression");
3881 #if PERL_VERSION > 8
3882 if (IN_PERL_COMPILETIME)
3887 nextchar(pRExC_state);
3889 ret = reg_node(pRExC_state, LOGICAL);
3892 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3893 /* deal with the length of this later - MJD */
3896 ret = reganode(pRExC_state, EVAL, n);
3897 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3898 Set_Node_Offset(ret, parse_start);
3901 case '(': /* (?(?{...})...) and (?(?=...)...) */
3903 if (RExC_parse[0] == '?') { /* (?(?...)) */
3904 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3905 || RExC_parse[1] == '<'
3906 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3909 ret = reg_node(pRExC_state, LOGICAL);
3912 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3916 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3919 parno = atoi(RExC_parse++);
3921 while (isDIGIT(*RExC_parse))
3923 ret = reganode(pRExC_state, GROUPP, parno);
3925 if ((c = *nextchar(pRExC_state)) != ')')
3926 vFAIL("Switch condition not recognized");
3928 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3929 br = regbranch(pRExC_state, &flags, 1,depth+1);
3931 br = reganode(pRExC_state, LONGJMP, 0);
3933 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3934 c = *nextchar(pRExC_state);
3938 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3939 regbranch(pRExC_state, &flags, 1,depth+1);
3940 REGTAIL(pRExC_state, ret, lastbr);
3943 c = *nextchar(pRExC_state);
3948 vFAIL("Switch (?(condition)... contains too many branches");
3949 ender = reg_node(pRExC_state, TAIL);
3950 REGTAIL(pRExC_state, br, ender);
3952 REGTAIL(pRExC_state, lastbr, ender);
3953 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3956 REGTAIL(pRExC_state, ret, ender);
3960 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3964 RExC_parse--; /* for vFAIL to print correctly */
3965 vFAIL("Sequence (? incomplete");
3969 parse_flags: /* (?i) */
3970 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3971 /* (?g), (?gc) and (?o) are useless here
3972 and must be globally applied -- japhy */
3974 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3975 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3976 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3977 if (! (wastedflags & wflagbit) ) {
3978 wastedflags |= wflagbit;
3981 "Useless (%s%c) - %suse /%c modifier",
3982 flagsp == &negflags ? "?-" : "?",
3984 flagsp == &negflags ? "don't " : "",
3990 else if (*RExC_parse == 'c') {
3991 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3992 if (! (wastedflags & WASTED_C) ) {
3993 wastedflags |= WASTED_GC;
3996 "Useless (%sc) - %suse /gc modifier",
3997 flagsp == &negflags ? "?-" : "?",
3998 flagsp == &negflags ? "don't " : ""
4003 else { pmflag(flagsp, *RExC_parse); }
4007 if (*RExC_parse == '-') {
4009 wastedflags = 0; /* reset so (?g-c) warns twice */
4013 RExC_flags |= posflags;
4014 RExC_flags &= ~negflags;
4015 if (*RExC_parse == ':') {
4021 if (*RExC_parse != ')') {
4023 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4025 nextchar(pRExC_state);
4033 ret = reganode(pRExC_state, OPEN, parno);
4034 Set_Node_Length(ret, 1); /* MJD */
4035 Set_Node_Offset(ret, RExC_parse); /* MJD */
4042 /* Pick up the branches, linking them together. */
4043 parse_start = RExC_parse; /* MJD */
4044 br = regbranch(pRExC_state, &flags, 1,depth+1);
4045 /* branch_len = (paren != 0); */
4049 if (*RExC_parse == '|') {
4050 if (!SIZE_ONLY && RExC_extralen) {
4051 reginsert(pRExC_state, BRANCHJ, br);
4054 reginsert(pRExC_state, BRANCH, br);
4055 Set_Node_Length(br, paren != 0);
4056 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4060 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4062 else if (paren == ':') {
4063 *flagp |= flags&SIMPLE;
4065 if (is_open) { /* Starts with OPEN. */
4066 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4068 else if (paren != '?') /* Not Conditional */
4070 *flagp |= flags & (SPSTART | HASWIDTH);
4072 while (*RExC_parse == '|') {
4073 if (!SIZE_ONLY && RExC_extralen) {
4074 ender = reganode(pRExC_state, LONGJMP,0);
4075 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4078 RExC_extralen += 2; /* Account for LONGJMP. */
4079 nextchar(pRExC_state);
4080 br = regbranch(pRExC_state, &flags, 0, depth+1);
4084 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4088 *flagp |= flags&SPSTART;
4091 if (have_branch || paren != ':') {
4092 /* Make a closing node, and hook it on the end. */
4095 ender = reg_node(pRExC_state, TAIL);
4098 ender = reganode(pRExC_state, CLOSE, parno);
4099 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4100 Set_Node_Length(ender,1); /* MJD */
4106 *flagp &= ~HASWIDTH;
4109 ender = reg_node(pRExC_state, SUCCEED);
4112 ender = reg_node(pRExC_state, END);
4115 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4117 if (have_branch && !SIZE_ONLY) {
4118 /* Hook the tails of the branches to the closing node. */
4119 for (br = ret; br; br = regnext(br)) {
4120 const U8 op = PL_regkind[OP(br)];
4122 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4124 else if (op == BRANCHJ) {
4125 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4133 static const char parens[] = "=!<,>";
4135 if (paren && (p = strchr(parens, paren))) {
4136 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4137 int flag = (p - parens) > 1;
4140 node = SUSPEND, flag = 0;
4141 reginsert(pRExC_state, node,ret);
4142 Set_Node_Cur_Length(ret);
4143 Set_Node_Offset(ret, parse_start + 1);
4145 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4149 /* Check for proper termination. */
4151 RExC_flags = oregflags;
4152 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4153 RExC_parse = oregcomp_parse;
4154 vFAIL("Unmatched (");
4157 else if (!paren && RExC_parse < RExC_end) {
4158 if (*RExC_parse == ')') {
4160 vFAIL("Unmatched )");
4163 FAIL("Junk on end of regexp"); /* "Can't happen". */
4171 - regbranch - one alternative of an | operator
4173 * Implements the concatenation operator.
4176 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4179 register regnode *ret;
4180 register regnode *chain = NULL;
4181 register regnode *latest;
4182 I32 flags = 0, c = 0;
4183 GET_RE_DEBUG_FLAGS_DECL;
4184 DEBUG_PARSE("brnc");
4188 if (!SIZE_ONLY && RExC_extralen)
4189 ret = reganode(pRExC_state, BRANCHJ,0);
4191 ret = reg_node(pRExC_state, BRANCH);
4192 Set_Node_Length(ret, 1);
4196 if (!first && SIZE_ONLY)
4197 RExC_extralen += 1; /* BRANCHJ */
4199 *flagp = WORST; /* Tentatively. */
4202 nextchar(pRExC_state);
4203 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4205 latest = regpiece(pRExC_state, &flags,depth+1);
4206 if (latest == NULL) {
4207 if (flags & TRYAGAIN)
4211 else if (ret == NULL)
4213 *flagp |= flags&HASWIDTH;
4214 if (chain == NULL) /* First piece. */
4215 *flagp |= flags&SPSTART;
4218 REGTAIL(pRExC_state, chain, latest);
4223 if (chain == NULL) { /* Loop ran zero times. */
4224 chain = reg_node(pRExC_state, NOTHING);
4229 *flagp |= flags&SIMPLE;
4236 - regpiece - something followed by possible [*+?]
4238 * Note that the branching code sequences used for ? and the general cases
4239 * of * and + are somewhat optimized: they use the same NOTHING node as
4240 * both the endmarker for their branch list and the body of the last branch.
4241 * It might seem that this node could be dispensed with entirely, but the
4242 * endmarker role is not redundant.
4245 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4248 register regnode *ret;
4250 register char *next;
4252 const char * const origparse = RExC_parse;
4254 I32 max = REG_INFTY;
4256 GET_RE_DEBUG_FLAGS_DECL;
4257 DEBUG_PARSE("piec");
4259 ret = regatom(pRExC_state, &flags,depth+1);
4261 if (flags & TRYAGAIN)
4268 if (op == '{' && regcurly(RExC_parse)) {
4269 const char *maxpos = NULL;
4270 parse_start = RExC_parse; /* MJD */
4271 next = RExC_parse + 1;
4272 while (isDIGIT(*next) || *next == ',') {
4281 if (*next == '}') { /* got one */
4285 min = atoi(RExC_parse);
4289 maxpos = RExC_parse;
4291 if (!max && *maxpos != '0')
4292 max = REG_INFTY; /* meaning "infinity" */
4293 else if (max >= REG_INFTY)
4294 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4296 nextchar(pRExC_state);
4299 if ((flags&SIMPLE)) {
4300 RExC_naughty += 2 + RExC_naughty / 2;
4301 reginsert(pRExC_state, CURLY, ret);
4302 Set_Node_Offset(ret, parse_start+1); /* MJD */
4303 Set_Node_Cur_Length(ret);
4306 regnode * const w = reg_node(pRExC_state, WHILEM);
4309 REGTAIL(pRExC_state, ret, w);
4310 if (!SIZE_ONLY && RExC_extralen) {
4311 reginsert(pRExC_state, LONGJMP,ret);
4312 reginsert(pRExC_state, NOTHING,ret);
4313 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4315 reginsert(pRExC_state, CURLYX,ret);
4317 Set_Node_Offset(ret, parse_start+1);
4318 Set_Node_Length(ret,
4319 op == '{' ? (RExC_parse - parse_start) : 1);
4321 if (!SIZE_ONLY && RExC_extralen)
4322 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4323 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4325 RExC_whilem_seen++, RExC_extralen += 3;
4326 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4334 if (max && max < min)
4335 vFAIL("Can't do {n,m} with n > m");
4337 ARG1_SET(ret, (U16)min);
4338 ARG2_SET(ret, (U16)max);
4350 #if 0 /* Now runtime fix should be reliable. */
4352 /* if this is reinstated, don't forget to put this back into perldiag:
4354 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4356 (F) The part of the regexp subject to either the * or + quantifier
4357 could match an empty string. The {#} shows in the regular
4358 expression about where the problem was discovered.
4362 if (!(flags&HASWIDTH) && op != '?')
4363 vFAIL("Regexp *+ operand could be empty");
4366 parse_start = RExC_parse;
4367 nextchar(pRExC_state);
4369 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4371 if (op == '*' && (flags&SIMPLE)) {
4372 reginsert(pRExC_state, STAR, ret);
4376 else if (op == '*') {
4380 else if (op == '+' && (flags&SIMPLE)) {
4381 reginsert(pRExC_state, PLUS, ret);
4385 else if (op == '+') {
4389 else if (op == '?') {
4394 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4396 "%.*s matches null string many times",
4397 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4401 if (*RExC_parse == '?') {
4402 nextchar(pRExC_state);
4403 reginsert(pRExC_state, MINMOD, ret);
4404 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4406 if (ISMULT2(RExC_parse)) {
4408 vFAIL("Nested quantifiers");
4415 - regatom - the lowest level
4417 * Optimization: gobbles an entire sequence of ordinary characters so that
4418 * it can turn them into a single node, which is smaller to store and
4419 * faster to run. Backslashed characters are exceptions, each becoming a
4420 * separate node; the code is simpler that way and it's not worth fixing.
4422 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4423 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4426 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4429 register regnode *ret = NULL;
4431 char *parse_start = RExC_parse;
4432 GET_RE_DEBUG_FLAGS_DECL;
4433 DEBUG_PARSE("atom");
4434 *flagp = WORST; /* Tentatively. */
4437 switch (*RExC_parse) {
4439 RExC_seen_zerolen++;
4440 nextchar(pRExC_state);
4441 if (RExC_flags & PMf_MULTILINE)
4442 ret = reg_node(pRExC_state, MBOL);
4443 else if (RExC_flags & PMf_SINGLELINE)
4444 ret = reg_node(pRExC_state, SBOL);
4446 ret = reg_node(pRExC_state, BOL);
4447 Set_Node_Length(ret, 1); /* MJD */
4450 nextchar(pRExC_state);
4452 RExC_seen_zerolen++;
4453 if (RExC_flags & PMf_MULTILINE)
4454 ret = reg_node(pRExC_state, MEOL);
4455 else if (RExC_flags & PMf_SINGLELINE)
4456 ret = reg_node(pRExC_state, SEOL);
4458 ret = reg_node(pRExC_state, EOL);
4459 Set_Node_Length(ret, 1); /* MJD */
4462 nextchar(pRExC_state);
4463 if (RExC_flags & PMf_SINGLELINE)
4464 ret = reg_node(pRExC_state, SANY);
4466 ret = reg_node(pRExC_state, REG_ANY);
4467 *flagp |= HASWIDTH|SIMPLE;
4469 Set_Node_Length(ret, 1); /* MJD */
4473 char * const oregcomp_parse = ++RExC_parse;
4474 ret = regclass(pRExC_state,depth+1);
4475 if (*RExC_parse != ']') {
4476 RExC_parse = oregcomp_parse;
4477 vFAIL("Unmatched [");
4479 nextchar(pRExC_state);
4480 *flagp |= HASWIDTH|SIMPLE;
4481 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4485 nextchar(pRExC_state);
4486 ret = reg(pRExC_state, 1, &flags,depth+1);
4488 if (flags & TRYAGAIN) {
4489 if (RExC_parse == RExC_end) {
4490 /* Make parent create an empty node if needed. */
4498 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4502 if (flags & TRYAGAIN) {
4506 vFAIL("Internal urp");
4507 /* Supposed to be caught earlier. */
4510 if (!regcurly(RExC_parse)) {
4519 vFAIL("Quantifier follows nothing");
4522 switch (*++RExC_parse) {
4524 RExC_seen_zerolen++;
4525 ret = reg_node(pRExC_state, SBOL);
4527 nextchar(pRExC_state);
4528 Set_Node_Length(ret, 2); /* MJD */
4531 ret = reg_node(pRExC_state, GPOS);
4532 RExC_seen |= REG_SEEN_GPOS;
4534 nextchar(pRExC_state);
4535 Set_Node_Length(ret, 2); /* MJD */
4538 ret = reg_node(pRExC_state, SEOL);
4540 RExC_seen_zerolen++; /* Do not optimize RE away */
4541 nextchar(pRExC_state);
4544 ret = reg_node(pRExC_state, EOS);
4546 RExC_seen_zerolen++; /* Do not optimize RE away */
4547 nextchar(pRExC_state);
4548 Set_Node_Length(ret, 2); /* MJD */
4551 ret = reg_node(pRExC_state, CANY);
4552 RExC_seen |= REG_SEEN_CANY;
4553 *flagp |= HASWIDTH|SIMPLE;
4554 nextchar(pRExC_state);
4555 Set_Node_Length(ret, 2); /* MJD */
4558 ret = reg_node(pRExC_state, CLUMP);
4560 nextchar(pRExC_state);
4561 Set_Node_Length(ret, 2); /* MJD */
4564 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4565 *flagp |= HASWIDTH|SIMPLE;
4566 nextchar(pRExC_state);
4567 Set_Node_Length(ret, 2); /* MJD */
4570 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4571 *flagp |= HASWIDTH|SIMPLE;
4572 nextchar(pRExC_state);
4573 Set_Node_Length(ret, 2); /* MJD */
4576 RExC_seen_zerolen++;
4577 RExC_seen |= REG_SEEN_LOOKBEHIND;
4578 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4580 nextchar(pRExC_state);
4581 Set_Node_Length(ret, 2); /* MJD */
4584 RExC_seen_zerolen++;
4585 RExC_seen |= REG_SEEN_LOOKBEHIND;
4586 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4588 nextchar(pRExC_state);
4589 Set_Node_Length(ret, 2); /* MJD */
4592 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4593 *flagp |= HASWIDTH|SIMPLE;
4594 nextchar(pRExC_state);
4595 Set_Node_Length(ret, 2); /* MJD */
4598 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4599 *flagp |= HASWIDTH|SIMPLE;
4600 nextchar(pRExC_state);
4601 Set_Node_Length(ret, 2); /* MJD */
4604 ret = reg_node(pRExC_state, DIGIT);
4605 *flagp |= HASWIDTH|SIMPLE;
4606 nextchar(pRExC_state);
4607 Set_Node_Length(ret, 2); /* MJD */
4610 ret = reg_node(pRExC_state, NDIGIT);
4611 *flagp |= HASWIDTH|SIMPLE;
4612 nextchar(pRExC_state);
4613 Set_Node_Length(ret, 2); /* MJD */
4618 char* const oldregxend = RExC_end;
4619 char* parse_start = RExC_parse - 2;
4621 if (RExC_parse[1] == '{') {
4622 /* a lovely hack--pretend we saw [\pX] instead */
4623 RExC_end = strchr(RExC_parse, '}');
4625 const U8 c = (U8)*RExC_parse;
4627 RExC_end = oldregxend;
4628 vFAIL2("Missing right brace on \\%c{}", c);
4633 RExC_end = RExC_parse + 2;
4634 if (RExC_end > oldregxend)
4635 RExC_end = oldregxend;
4639 ret = regclass(pRExC_state,depth+1);
4641 RExC_end = oldregxend;
4644 Set_Node_Offset(ret, parse_start + 2);
4645 Set_Node_Cur_Length(ret);
4646 nextchar(pRExC_state);
4647 *flagp |= HASWIDTH|SIMPLE;
4660 case '1': case '2': case '3': case '4':
4661 case '5': case '6': case '7': case '8': case '9':
4663 const I32 num = atoi(RExC_parse);
4665 if (num > 9 && num >= RExC_npar)
4668 char * const parse_start = RExC_parse - 1; /* MJD */
4669 while (isDIGIT(*RExC_parse))
4672 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4673 vFAIL("Reference to nonexistent group");
4675 ret = reganode(pRExC_state,
4676 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4680 /* override incorrect value set in reganode MJD */
4681 Set_Node_Offset(ret, parse_start+1);
4682 Set_Node_Cur_Length(ret); /* MJD */
4684 nextchar(pRExC_state);
4689 if (RExC_parse >= RExC_end)
4690 FAIL("Trailing \\");
4693 /* Do not generate "unrecognized" warnings here, we fall
4694 back into the quick-grab loop below */
4701 if (RExC_flags & PMf_EXTENDED) {
4702 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4704 if (RExC_parse < RExC_end)
4710 register STRLEN len;
4715 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4717 parse_start = RExC_parse - 1;
4723 ret = reg_node(pRExC_state,
4724 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4726 for (len = 0, p = RExC_parse - 1;
4727 len < 127 && p < RExC_end;
4730 char * const oldp = p;
4732 if (RExC_flags & PMf_EXTENDED)
4733 p = regwhite(p, RExC_end);
4780 ender = ASCII_TO_NATIVE('\033');
4784 ender = ASCII_TO_NATIVE('\007');
4789 char* const e = strchr(p, '}');
4793 vFAIL("Missing right brace on \\x{}");
4796 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4797 | PERL_SCAN_DISALLOW_PREFIX;
4798 STRLEN numlen = e - p - 1;
4799 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4806 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4808 ender = grok_hex(p, &numlen, &flags, NULL);
4814 ender = UCHARAT(p++);
4815 ender = toCTRL(ender);
4817 case '0': case '1': case '2': case '3':case '4':
4818 case '5': case '6': case '7': case '8':case '9':
4820 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4823 ender = grok_oct(p, &numlen, &flags, NULL);
4833 FAIL("Trailing \\");
4836 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4837 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4838 goto normal_default;
4843 if (UTF8_IS_START(*p) && UTF) {
4845 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4846 &numlen, UTF8_ALLOW_DEFAULT);
4853 if (RExC_flags & PMf_EXTENDED)
4854 p = regwhite(p, RExC_end);
4856 /* Prime the casefolded buffer. */
4857 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4859 if (ISMULT2(p)) { /* Back off on ?+*. */
4864 /* Emit all the Unicode characters. */
4866 for (foldbuf = tmpbuf;
4868 foldlen -= numlen) {
4869 ender = utf8_to_uvchr(foldbuf, &numlen);
4871 const STRLEN unilen = reguni(pRExC_state, ender, s);
4874 /* In EBCDIC the numlen
4875 * and unilen can differ. */
4877 if (numlen >= foldlen)
4881 break; /* "Can't happen." */
4885 const STRLEN unilen = reguni(pRExC_state, ender, s);
4894 REGC((char)ender, s++);
4900 /* Emit all the Unicode characters. */
4902 for (foldbuf = tmpbuf;
4904 foldlen -= numlen) {
4905 ender = utf8_to_uvchr(foldbuf, &numlen);
4907 const STRLEN unilen = reguni(pRExC_state, ender, s);
4910 /* In EBCDIC the numlen
4911 * and unilen can differ. */
4913 if (numlen >= foldlen)
4921 const STRLEN unilen = reguni(pRExC_state, ender, s);
4930 REGC((char)ender, s++);
4934 Set_Node_Cur_Length(ret); /* MJD */
4935 nextchar(pRExC_state);
4937 /* len is STRLEN which is unsigned, need to copy to signed */
4940 vFAIL("Internal disaster");
4944 if (len == 1 && UNI_IS_INVARIANT(ender))
4948 RExC_size += STR_SZ(len);
4951 RExC_emit += STR_SZ(len);
4957 /* If the encoding pragma is in effect recode the text of
4958 * any EXACT-kind nodes. */
4959 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4960 const STRLEN oldlen = STR_LEN(ret);
4961 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4965 if (sv_utf8_downgrade(sv, TRUE)) {
4966 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4967 const STRLEN newlen = SvCUR(sv);
4972 GET_RE_DEBUG_FLAGS_DECL;
4973 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4974 (int)oldlen, STRING(ret),
4976 Copy(s, STRING(ret), newlen, char);
4977 STR_LEN(ret) += newlen - oldlen;
4978 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4980 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4988 S_regwhite(char *p, const char *e)
4993 else if (*p == '#') {
4996 } while (p < e && *p != '\n');
5004 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5005 Character classes ([:foo:]) can also be negated ([:^foo:]).
5006 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5007 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
5008 but trigger failures because they are currently unimplemented. */
5010 #define POSIXCC_DONE(c) ((c) == ':')
5011 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5012 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5015 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5018 I32 namedclass = OOB_NAMEDCLASS;
5020 if (value == '[' && RExC_parse + 1 < RExC_end &&
5021 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5022 POSIXCC(UCHARAT(RExC_parse))) {
5023 const char c = UCHARAT(RExC_parse);
5024 char* const s = RExC_parse++;
5026 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5028 if (RExC_parse == RExC_end)
5029 /* Grandfather lone [:, [=, [. */
5032 const char* const t = RExC_parse++; /* skip over the c */
5035 if (UCHARAT(RExC_parse) == ']') {
5036 const char *posixcc = s + 1;
5037 RExC_parse++; /* skip over the ending ] */
5040 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5041 const I32 skip = t - posixcc;
5043 /* Initially switch on the length of the name. */
5046 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5047 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5050 /* Names all of length 5. */
5051 /* alnum alpha ascii blank cntrl digit graph lower
5052 print punct space upper */
5053 /* Offset 4 gives the best switch position. */
5054 switch (posixcc[4]) {
5056 if (memEQ(posixcc, "alph", 4)) /* alpha */
5057 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5060 if (memEQ(posixcc, "spac", 4)) /* space */
5061 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5064 if (memEQ(posixcc, "grap", 4)) /* graph */
5065 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5068 if (memEQ(posixcc, "asci", 4)) /* ascii */
5069 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5072 if (memEQ(posixcc, "blan", 4)) /* blank */
5073 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5076 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5077 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5080 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5081 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5084 if (memEQ(posixcc, "lowe", 4)) /* lower */
5085 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5086 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5087 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5090 if (memEQ(posixcc, "digi", 4)) /* digit */
5091 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5092 else if (memEQ(posixcc, "prin", 4)) /* print */
5093 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5094 else if (memEQ(posixcc, "punc", 4)) /* punct */
5095 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5100 if (memEQ(posixcc, "xdigit", 6))
5101 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5105 if (namedclass == OOB_NAMEDCLASS)
5106 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5108 assert (posixcc[skip] == ':');
5109 assert (posixcc[skip+1] == ']');
5110 } else if (!SIZE_ONLY) {
5111 /* [[=foo=]] and [[.foo.]] are still future. */
5113 /* adjust RExC_parse so the warning shows after
5115 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5117 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5120 /* Maternal grandfather:
5121 * "[:" ending in ":" but not in ":]" */
5131 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5134 if (POSIXCC(UCHARAT(RExC_parse))) {
5135 const char *s = RExC_parse;
5136 const char c = *s++;
5140 if (*s && c == *s && s[1] == ']') {
5141 if (ckWARN(WARN_REGEXP))
5143 "POSIX syntax [%c %c] belongs inside character classes",
5146 /* [[=foo=]] and [[.foo.]] are still future. */
5147 if (POSIXCC_NOTYET(c)) {
5148 /* adjust RExC_parse so the error shows after
5150 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5152 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5160 parse a class specification and produce either an ANYOF node that
5161 matches the pattern. If the pattern matches a single char only and
5162 that char is < 256 then we produce an EXACT node instead.
5165 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5169 register UV nextvalue;
5170 register IV prevvalue = OOB_UNICODE;
5171 register IV range = 0;
5172 register regnode *ret;
5175 char *rangebegin = NULL;
5176 bool need_class = 0;
5179 bool optimize_invert = TRUE;
5180 AV* unicode_alternate = NULL;
5182 UV literal_endpoint = 0;
5184 UV stored = 0; /* number of chars stored in the class */
5186 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5187 case we need to change the emitted regop to an EXACT. */
5188 const char * orig_parse = RExC_parse;
5189 GET_RE_DEBUG_FLAGS_DECL;
5191 PERL_UNUSED_ARG(depth);
5194 DEBUG_PARSE("clas");
5196 /* Assume we are going to generate an ANYOF node. */
5197 ret = reganode(pRExC_state, ANYOF, 0);
5200 ANYOF_FLAGS(ret) = 0;
5202 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5206 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5210 RExC_size += ANYOF_SKIP;
5211 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5214 RExC_emit += ANYOF_SKIP;
5216 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5218 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5219 ANYOF_BITMAP_ZERO(ret);
5220 listsv = newSVpvs("# comment\n");
5223 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5225 if (!SIZE_ONLY && POSIXCC(nextvalue))
5226 checkposixcc(pRExC_state);
5228 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5229 if (UCHARAT(RExC_parse) == ']')
5232 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5236 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5239 rangebegin = RExC_parse;
5241 value = utf8n_to_uvchr((U8*)RExC_parse,
5242 RExC_end - RExC_parse,
5243 &numlen, UTF8_ALLOW_DEFAULT);
5244 RExC_parse += numlen;
5247 value = UCHARAT(RExC_parse++);
5249 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5250 if (value == '[' && POSIXCC(nextvalue))
5251 namedclass = regpposixcc(pRExC_state, value);
5252 else if (value == '\\') {
5254 value = utf8n_to_uvchr((U8*)RExC_parse,
5255 RExC_end - RExC_parse,
5256 &numlen, UTF8_ALLOW_DEFAULT);
5257 RExC_parse += numlen;
5260 value = UCHARAT(RExC_parse++);
5261 /* Some compilers cannot handle switching on 64-bit integer
5262 * values, therefore value cannot be an UV. Yes, this will
5263 * be a problem later if we want switch on Unicode.
5264 * A similar issue a little bit later when switching on
5265 * namedclass. --jhi */
5266 switch ((I32)value) {
5267 case 'w': namedclass = ANYOF_ALNUM; break;
5268 case 'W': namedclass = ANYOF_NALNUM; break;
5269 case 's': namedclass = ANYOF_SPACE; break;
5270 case 'S': namedclass = ANYOF_NSPACE; break;
5271 case 'd': namedclass = ANYOF_DIGIT; break;
5272 case 'D': namedclass = ANYOF_NDIGIT; break;
5277 if (RExC_parse >= RExC_end)
5278 vFAIL2("Empty \\%c{}", (U8)value);
5279 if (*RExC_parse == '{') {
5280 const U8 c = (U8)value;
5281 e = strchr(RExC_parse++, '}');
5283 vFAIL2("Missing right brace on \\%c{}", c);
5284 while (isSPACE(UCHARAT(RExC_parse)))
5286 if (e == RExC_parse)
5287 vFAIL2("Empty \\%c{}", c);
5289 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5297 if (UCHARAT(RExC_parse) == '^') {
5300 value = value == 'p' ? 'P' : 'p'; /* toggle */
5301 while (isSPACE(UCHARAT(RExC_parse))) {
5306 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5307 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5310 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5311 namedclass = ANYOF_MAX; /* no official name, but it's named */
5314 case 'n': value = '\n'; break;
5315 case 'r': value = '\r'; break;
5316 case 't': value = '\t'; break;
5317 case 'f': value = '\f'; break;
5318 case 'b': value = '\b'; break;
5319 case 'e': value = ASCII_TO_NATIVE('\033');break;
5320 case 'a': value = ASCII_TO_NATIVE('\007');break;
5322 if (*RExC_parse == '{') {
5323 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5324 | PERL_SCAN_DISALLOW_PREFIX;
5325 char * const e = strchr(RExC_parse++, '}');
5327 vFAIL("Missing right brace on \\x{}");
5329 numlen = e - RExC_parse;
5330 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5334 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5336 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5337 RExC_parse += numlen;
5341 value = UCHARAT(RExC_parse++);
5342 value = toCTRL(value);
5344 case '0': case '1': case '2': case '3': case '4':
5345 case '5': case '6': case '7': case '8': case '9':
5349 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5350 RExC_parse += numlen;
5354 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5356 "Unrecognized escape \\%c in character class passed through",
5360 } /* end of \blah */
5366 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5368 if (!SIZE_ONLY && !need_class)
5369 ANYOF_CLASS_ZERO(ret);
5373 /* a bad range like a-\d, a-[:digit:] ? */
5376 if (ckWARN(WARN_REGEXP)) {
5378 RExC_parse >= rangebegin ?
5379 RExC_parse - rangebegin : 0;
5381 "False [] range \"%*.*s\"",
5384 if (prevvalue < 256) {
5385 ANYOF_BITMAP_SET(ret, prevvalue);
5386 ANYOF_BITMAP_SET(ret, '-');
5389 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5390 Perl_sv_catpvf(aTHX_ listsv,
5391 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5395 range = 0; /* this was not a true range */
5399 const char *what = NULL;
5402 if (namedclass > OOB_NAMEDCLASS)
5403 optimize_invert = FALSE;
5404 /* Possible truncation here but in some 64-bit environments
5405 * the compiler gets heartburn about switch on 64-bit values.
5406 * A similar issue a little earlier when switching on value.
5408 switch ((I32)namedclass) {
5411 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5413 for (value = 0; value < 256; value++)
5415 ANYOF_BITMAP_SET(ret, value);
5422 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5424 for (value = 0; value < 256; value++)
5425 if (!isALNUM(value))
5426 ANYOF_BITMAP_SET(ret, value);
5433 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5435 for (value = 0; value < 256; value++)
5436 if (isALNUMC(value))
5437 ANYOF_BITMAP_SET(ret, value);
5444 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5446 for (value = 0; value < 256; value++)
5447 if (!isALNUMC(value))
5448 ANYOF_BITMAP_SET(ret, value);
5455 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5457 for (value = 0; value < 256; value++)
5459 ANYOF_BITMAP_SET(ret, value);
5466 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5468 for (value = 0; value < 256; value++)
5469 if (!isALPHA(value))
5470 ANYOF_BITMAP_SET(ret, value);
5477 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5480 for (value = 0; value < 128; value++)
5481 ANYOF_BITMAP_SET(ret, value);
5483 for (value = 0; value < 256; value++) {
5485 ANYOF_BITMAP_SET(ret, value);
5494 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5497 for (value = 128; value < 256; value++)
5498 ANYOF_BITMAP_SET(ret, value);
5500 for (value = 0; value < 256; value++) {
5501 if (!isASCII(value))
5502 ANYOF_BITMAP_SET(ret, value);
5511 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5513 for (value = 0; value < 256; value++)
5515 ANYOF_BITMAP_SET(ret, value);
5522 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5524 for (value = 0; value < 256; value++)
5525 if (!isBLANK(value))
5526 ANYOF_BITMAP_SET(ret, value);
5533 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5535 for (value = 0; value < 256; value++)
5537 ANYOF_BITMAP_SET(ret, value);
5544 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5546 for (value = 0; value < 256; value++)
5547 if (!isCNTRL(value))
5548 ANYOF_BITMAP_SET(ret, value);
5555 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5557 /* consecutive digits assumed */
5558 for (value = '0'; value <= '9'; value++)
5559 ANYOF_BITMAP_SET(ret, value);
5566 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5568 /* consecutive digits assumed */
5569 for (value = 0; value < '0'; value++)
5570 ANYOF_BITMAP_SET(ret, value);
5571 for (value = '9' + 1; value < 256; value++)
5572 ANYOF_BITMAP_SET(ret, value);
5579 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5581 for (value = 0; value < 256; value++)
5583 ANYOF_BITMAP_SET(ret, value);
5590 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5592 for (value = 0; value < 256; value++)
5593 if (!isGRAPH(value))
5594 ANYOF_BITMAP_SET(ret, value);
5601 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5603 for (value = 0; value < 256; value++)
5605 ANYOF_BITMAP_SET(ret, value);
5612 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5614 for (value = 0; value < 256; value++)
5615 if (!isLOWER(value))
5616 ANYOF_BITMAP_SET(ret, value);
5623 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5625 for (value = 0; value < 256; value++)
5627 ANYOF_BITMAP_SET(ret, value);
5634 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5636 for (value = 0; value < 256; value++)
5637 if (!isPRINT(value))
5638 ANYOF_BITMAP_SET(ret, value);
5645 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5647 for (value = 0; value < 256; value++)
5648 if (isPSXSPC(value))
5649 ANYOF_BITMAP_SET(ret, value);
5656 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5658 for (value = 0; value < 256; value++)
5659 if (!isPSXSPC(value))
5660 ANYOF_BITMAP_SET(ret, value);
5667 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5669 for (value = 0; value < 256; value++)
5671 ANYOF_BITMAP_SET(ret, value);
5678 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5680 for (value = 0; value < 256; value++)
5681 if (!isPUNCT(value))
5682 ANYOF_BITMAP_SET(ret, value);
5689 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5691 for (value = 0; value < 256; value++)
5693 ANYOF_BITMAP_SET(ret, value);
5700 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5702 for (value = 0; value < 256; value++)
5703 if (!isSPACE(value))
5704 ANYOF_BITMAP_SET(ret, value);
5711 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5713 for (value = 0; value < 256; value++)
5715 ANYOF_BITMAP_SET(ret, value);
5722 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5724 for (value = 0; value < 256; value++)
5725 if (!isUPPER(value))
5726 ANYOF_BITMAP_SET(ret, value);
5733 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5735 for (value = 0; value < 256; value++)
5736 if (isXDIGIT(value))
5737 ANYOF_BITMAP_SET(ret, value);
5744 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5746 for (value = 0; value < 256; value++)
5747 if (!isXDIGIT(value))
5748 ANYOF_BITMAP_SET(ret, value);
5754 /* this is to handle \p and \P */
5757 vFAIL("Invalid [::] class");
5761 /* Strings such as "+utf8::isWord\n" */
5762 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5765 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5768 } /* end of namedclass \blah */
5771 if (prevvalue > (IV)value) /* b-a */ {
5772 const int w = RExC_parse - rangebegin;
5773 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5774 range = 0; /* not a valid range */
5778 prevvalue = value; /* save the beginning of the range */
5779 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5780 RExC_parse[1] != ']') {
5783 /* a bad range like \w-, [:word:]- ? */
5784 if (namedclass > OOB_NAMEDCLASS) {
5785 if (ckWARN(WARN_REGEXP)) {
5787 RExC_parse >= rangebegin ?
5788 RExC_parse - rangebegin : 0;
5790 "False [] range \"%*.*s\"",
5794 ANYOF_BITMAP_SET(ret, '-');
5796 range = 1; /* yeah, it's a range! */
5797 continue; /* but do it the next time */
5801 /* now is the next time */
5802 /*stored += (value - prevvalue + 1);*/
5804 if (prevvalue < 256) {
5805 const IV ceilvalue = value < 256 ? value : 255;
5808 /* In EBCDIC [\x89-\x91] should include
5809 * the \x8e but [i-j] should not. */
5810 if (literal_endpoint == 2 &&
5811 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5812 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5814 if (isLOWER(prevvalue)) {
5815 for (i = prevvalue; i <= ceilvalue; i++)
5817 ANYOF_BITMAP_SET(ret, i);
5819 for (i = prevvalue; i <= ceilvalue; i++)
5821 ANYOF_BITMAP_SET(ret, i);
5826 for (i = prevvalue; i <= ceilvalue; i++) {
5827 if (!ANYOF_BITMAP_TEST(ret,i)) {
5829 ANYOF_BITMAP_SET(ret, i);
5833 if (value > 255 || UTF) {
5834 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5835 const UV natvalue = NATIVE_TO_UNI(value);
5836 stored+=2; /* can't optimize this class */
5837 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5838 if (prevnatvalue < natvalue) { /* what about > ? */
5839 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5840 prevnatvalue, natvalue);
5842 else if (prevnatvalue == natvalue) {
5843 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5845 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5847 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5849 /* If folding and foldable and a single
5850 * character, insert also the folded version
5851 * to the charclass. */
5853 if (foldlen == (STRLEN)UNISKIP(f))
5854 Perl_sv_catpvf(aTHX_ listsv,
5857 /* Any multicharacter foldings
5858 * require the following transform:
5859 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5860 * where E folds into "pq" and F folds
5861 * into "rst", all other characters
5862 * fold to single characters. We save
5863 * away these multicharacter foldings,
5864 * to be later saved as part of the
5865 * additional "s" data. */
5868 if (!unicode_alternate)
5869 unicode_alternate = newAV();
5870 sv = newSVpvn((char*)foldbuf, foldlen);
5872 av_push(unicode_alternate, sv);
5876 /* If folding and the value is one of the Greek
5877 * sigmas insert a few more sigmas to make the
5878 * folding rules of the sigmas to work right.
5879 * Note that not all the possible combinations
5880 * are handled here: some of them are handled
5881 * by the standard folding rules, and some of
5882 * them (literal or EXACTF cases) are handled
5883 * during runtime in regexec.c:S_find_byclass(). */
5884 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5885 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5886 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5887 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5888 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5890 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5891 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5892 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5897 literal_endpoint = 0;
5901 range = 0; /* this range (if it was one) is done now */
5905 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5907 RExC_size += ANYOF_CLASS_ADD_SKIP;
5909 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5915 /****** !SIZE_ONLY AFTER HERE *********/
5917 if( stored == 1 && value < 256
5918 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5920 /* optimize single char class to an EXACT node
5921 but *only* when its not a UTF/high char */
5922 const char * cur_parse= RExC_parse;
5923 RExC_emit = (regnode *)orig_emit;
5924 RExC_parse = (char *)orig_parse;
5925 ret = reg_node(pRExC_state,
5926 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5927 RExC_parse = (char *)cur_parse;
5928 *STRING(ret)= (char)value;
5930 RExC_emit += STR_SZ(1);
5933 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5934 if ( /* If the only flag is folding (plus possibly inversion). */
5935 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5937 for (value = 0; value < 256; ++value) {
5938 if (ANYOF_BITMAP_TEST(ret, value)) {
5939 UV fold = PL_fold[value];
5942 ANYOF_BITMAP_SET(ret, fold);
5945 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5948 /* optimize inverted simple patterns (e.g. [^a-z]) */
5949 if (optimize_invert &&
5950 /* If the only flag is inversion. */
5951 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5952 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5953 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5954 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5957 AV * const av = newAV();
5959 /* The 0th element stores the character class description
5960 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5961 * to initialize the appropriate swash (which gets stored in
5962 * the 1st element), and also useful for dumping the regnode.
5963 * The 2nd element stores the multicharacter foldings,
5964 * used later (regexec.c:S_reginclass()). */
5965 av_store(av, 0, listsv);
5966 av_store(av, 1, NULL);
5967 av_store(av, 2, (SV*)unicode_alternate);
5968 rv = newRV_noinc((SV*)av);
5969 n = add_data(pRExC_state, 1, "s");
5970 RExC_rx->data->data[n] = (void*)rv;
5977 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5979 char* const retval = RExC_parse++;
5982 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5983 RExC_parse[2] == '#') {
5984 while (*RExC_parse != ')') {
5985 if (RExC_parse == RExC_end)
5986 FAIL("Sequence (?#... not terminated");
5992 if (RExC_flags & PMf_EXTENDED) {
5993 if (isSPACE(*RExC_parse)) {
5997 else if (*RExC_parse == '#') {
5998 while (RExC_parse < RExC_end)
5999 if (*RExC_parse++ == '\n') break;
6008 - reg_node - emit a node
6010 STATIC regnode * /* Location. */
6011 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
6014 register regnode *ptr;
6015 regnode * const ret = RExC_emit;
6016 GET_RE_DEBUG_FLAGS_DECL;
6019 SIZE_ALIGN(RExC_size);
6023 NODE_ALIGN_FILL(ret);
6025 FILL_ADVANCE_NODE(ptr, op);
6026 if (RExC_offsets) { /* MJD */
6027 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6028 "reg_node", __LINE__,
6030 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6031 ? "Overwriting end of array!\n" : "OK",
6032 (UV)(RExC_emit - RExC_emit_start),
6033 (UV)(RExC_parse - RExC_start),
6034 (UV)RExC_offsets[0]));
6035 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6044 - reganode - emit a node with an argument
6046 STATIC regnode * /* Location. */
6047 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6050 register regnode *ptr;
6051 regnode * const ret = RExC_emit;
6052 GET_RE_DEBUG_FLAGS_DECL;
6055 SIZE_ALIGN(RExC_size);
6060 NODE_ALIGN_FILL(ret);
6062 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6063 if (RExC_offsets) { /* MJD */
6064 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6068 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6069 "Overwriting end of array!\n" : "OK",
6070 (UV)(RExC_emit - RExC_emit_start),
6071 (UV)(RExC_parse - RExC_start),
6072 (UV)RExC_offsets[0]));
6073 Set_Cur_Node_Offset;
6082 - reguni - emit (if appropriate) a Unicode character
6085 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6088 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6092 - reginsert - insert an operator in front of already-emitted operand
6094 * Means relocating the operand.
6097 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6100 register regnode *src;
6101 register regnode *dst;
6102 register regnode *place;
6103 const int offset = regarglen[(U8)op];
6104 GET_RE_DEBUG_FLAGS_DECL;
6105 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6108 RExC_size += NODE_STEP_REGNODE + offset;
6113 RExC_emit += NODE_STEP_REGNODE + offset;
6115 while (src > opnd) {
6116 StructCopy(--src, --dst, regnode);
6117 if (RExC_offsets) { /* MJD 20010112 */
6118 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6122 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6123 ? "Overwriting end of array!\n" : "OK",
6124 (UV)(src - RExC_emit_start),
6125 (UV)(dst - RExC_emit_start),
6126 (UV)RExC_offsets[0]));
6127 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6128 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6133 place = opnd; /* Op node, where operand used to be. */
6134 if (RExC_offsets) { /* MJD */
6135 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6139 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6140 ? "Overwriting end of array!\n" : "OK",
6141 (UV)(place - RExC_emit_start),
6142 (UV)(RExC_parse - RExC_start),
6144 Set_Node_Offset(place, RExC_parse);
6145 Set_Node_Length(place, 1);
6147 src = NEXTOPER(place);
6148 FILL_ADVANCE_NODE(place, op);
6149 Zero(src, offset, regnode);
6153 - regtail - set the next-pointer at the end of a node chain of p to val.
6154 - SEE ALSO: regtail_study
6156 /* TODO: All three parms should be const */
6158 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6161 register regnode *scan;
6162 GET_RE_DEBUG_FLAGS_DECL;
6164 PERL_UNUSED_ARG(depth);
6170 /* Find last node. */
6173 regnode * const temp = regnext(scan);
6175 SV * const mysv=sv_newmortal();
6176 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6177 regprop(RExC_rx, mysv, scan);
6178 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6179 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6186 if (reg_off_by_arg[OP(scan)]) {
6187 ARG_SET(scan, val - scan);
6190 NEXT_OFF(scan) = val - scan;
6196 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6197 - Look for optimizable sequences at the same time.
6198 - currently only looks for EXACT chains.
6200 This is expermental code. The idea is to use this routine to perform
6201 in place optimizations on branches and groups as they are constructed,
6202 with the long term intention of removing optimization from study_chunk so
6203 that it is purely analytical.
6205 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6206 to control which is which.
6209 /* TODO: All four parms should be const */
6212 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6215 register regnode *scan;
6217 #ifdef EXPERIMENTAL_INPLACESCAN
6221 GET_RE_DEBUG_FLAGS_DECL;
6227 /* Find last node. */
6231 regnode * const temp = regnext(scan);
6232 #ifdef EXPERIMENTAL_INPLACESCAN
6233 if (PL_regkind[OP(scan)] == EXACT)
6234 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6242 if( exact == PSEUDO )
6244 else if ( exact != OP(scan) )
6253 SV * const mysv=sv_newmortal();
6254 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6255 regprop(RExC_rx, mysv, scan);
6256 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6257 SvPV_nolen_const(mysv),
6259 REG_NODE_NUM(scan));
6266 SV * const mysv_val=sv_newmortal();
6267 DEBUG_PARSE_MSG("");
6268 regprop(RExC_rx, mysv_val, val);
6269 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6270 SvPV_nolen_const(mysv_val),
6275 if (reg_off_by_arg[OP(scan)]) {
6276 ARG_SET(scan, val - scan);
6279 NEXT_OFF(scan) = val - scan;
6287 - regcurly - a little FSA that accepts {\d+,?\d*}
6290 S_regcurly(register const char *s)
6309 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6312 Perl_regdump(pTHX_ const regexp *r)
6316 SV * const sv = sv_newmortal();
6318 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6320 /* Header fields of interest. */
6321 if (r->anchored_substr)
6322 PerlIO_printf(Perl_debug_log,
6323 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
6325 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
6326 SvPVX_const(r->anchored_substr),
6328 SvTAIL(r->anchored_substr) ? "$" : "",
6329 (IV)r->anchored_offset);
6330 else if (r->anchored_utf8)
6331 PerlIO_printf(Perl_debug_log,
6332 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
6334 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
6335 SvPVX_const(r->anchored_utf8),
6337 SvTAIL(r->anchored_utf8) ? "$" : "",
6338 (IV)r->anchored_offset);
6339 if (r->float_substr)
6340 PerlIO_printf(Perl_debug_log,
6341 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6343 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6344 SvPVX_const(r->float_substr),
6346 SvTAIL(r->float_substr) ? "$" : "",
6347 (IV)r->float_min_offset, (UV)r->float_max_offset);
6348 else if (r->float_utf8)
6349 PerlIO_printf(Perl_debug_log,
6350 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6352 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6353 SvPVX_const(r->float_utf8),
6355 SvTAIL(r->float_utf8) ? "$" : "",
6356 (IV)r->float_min_offset, (UV)r->float_max_offset);
6357 if (r->check_substr || r->check_utf8)
6358 PerlIO_printf(Perl_debug_log,
6359 r->check_substr == r->float_substr
6360 && r->check_utf8 == r->float_utf8
6361 ? "(checking floating" : "(checking anchored");
6362 if (r->reganch & ROPT_NOSCAN)
6363 PerlIO_printf(Perl_debug_log, " noscan");
6364 if (r->reganch & ROPT_CHECK_ALL)
6365 PerlIO_printf(Perl_debug_log, " isall");
6366 if (r->check_substr || r->check_utf8)
6367 PerlIO_printf(Perl_debug_log, ") ");
6369 if (r->regstclass) {
6370 regprop(r, sv, r->regstclass);
6371 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6373 if (r->reganch & ROPT_ANCH) {
6374 PerlIO_printf(Perl_debug_log, "anchored");
6375 if (r->reganch & ROPT_ANCH_BOL)
6376 PerlIO_printf(Perl_debug_log, "(BOL)");
6377 if (r->reganch & ROPT_ANCH_MBOL)
6378 PerlIO_printf(Perl_debug_log, "(MBOL)");
6379 if (r->reganch & ROPT_ANCH_SBOL)
6380 PerlIO_printf(Perl_debug_log, "(SBOL)");
6381 if (r->reganch & ROPT_ANCH_GPOS)
6382 PerlIO_printf(Perl_debug_log, "(GPOS)");
6383 PerlIO_putc(Perl_debug_log, ' ');
6385 if (r->reganch & ROPT_GPOS_SEEN)
6386 PerlIO_printf(Perl_debug_log, "GPOS ");
6387 if (r->reganch & ROPT_SKIP)
6388 PerlIO_printf(Perl_debug_log, "plus ");
6389 if (r->reganch & ROPT_IMPLICIT)
6390 PerlIO_printf(Perl_debug_log, "implicit ");
6391 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6392 if (r->reganch & ROPT_EVAL_SEEN)
6393 PerlIO_printf(Perl_debug_log, "with eval ");
6394 PerlIO_printf(Perl_debug_log, "\n");
6396 PERL_UNUSED_CONTEXT;
6398 #endif /* DEBUGGING */
6402 - regprop - printable representation of opcode
6405 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6411 sv_setpvn(sv, "", 0);
6412 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6413 /* It would be nice to FAIL() here, but this may be called from
6414 regexec.c, and it would be hard to supply pRExC_state. */
6415 Perl_croak(aTHX_ "Corrupted regexp opcode");
6416 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6418 k = PL_regkind[OP(o)];
6421 SV * const dsv = sv_2mortal(newSVpvs(""));
6422 /* Using is_utf8_string() is a crude hack but it may
6423 * be the best for now since we have no flag "this EXACTish
6424 * node was UTF-8" --jhi */
6425 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6426 RE_PV_DISPLAY_DECL(s, len, do_utf8, dsv, STRING(o), STR_LEN(o), 60);
6428 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6432 } else if (k == TRIE) {
6433 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6434 /* print the details of the trie in dumpuntil instead, as
6435 * prog->data isn't available here */
6436 } else if (k == CURLY) {
6437 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6438 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6439 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6441 else if (k == WHILEM && o->flags) /* Ordinal/of */
6442 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6443 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6444 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6445 else if (k == LOGICAL)
6446 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6447 else if (k == ANYOF) {
6448 int i, rangestart = -1;
6449 const U8 flags = ANYOF_FLAGS(o);
6451 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6452 static const char * const anyofs[] = {
6485 if (flags & ANYOF_LOCALE)
6486 sv_catpvs(sv, "{loc}");
6487 if (flags & ANYOF_FOLD)
6488 sv_catpvs(sv, "{i}");
6489 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6490 if (flags & ANYOF_INVERT)
6492 for (i = 0; i <= 256; i++) {
6493 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6494 if (rangestart == -1)
6496 } else if (rangestart != -1) {
6497 if (i <= rangestart + 3)
6498 for (; rangestart < i; rangestart++)
6499 put_byte(sv, rangestart);
6501 put_byte(sv, rangestart);
6503 put_byte(sv, i - 1);
6509 if (o->flags & ANYOF_CLASS)
6510 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6511 if (ANYOF_CLASS_TEST(o,i))
6512 sv_catpv(sv, anyofs[i]);
6514 if (flags & ANYOF_UNICODE)
6515 sv_catpvs(sv, "{unicode}");
6516 else if (flags & ANYOF_UNICODE_ALL)
6517 sv_catpvs(sv, "{unicode_all}");
6521 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6525 U8 s[UTF8_MAXBYTES_CASE+1];
6527 for (i = 0; i <= 256; i++) { /* just the first 256 */
6528 uvchr_to_utf8(s, i);
6530 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6531 if (rangestart == -1)
6533 } else if (rangestart != -1) {
6534 if (i <= rangestart + 3)
6535 for (; rangestart < i; rangestart++) {
6536 const U8 * const e = uvchr_to_utf8(s,rangestart);
6538 for(p = s; p < e; p++)
6542 const U8 *e = uvchr_to_utf8(s,rangestart);
6544 for (p = s; p < e; p++)
6547 e = uvchr_to_utf8(s, i-1);
6548 for (p = s; p < e; p++)
6555 sv_catpvs(sv, "..."); /* et cetera */
6559 char *s = savesvpv(lv);
6560 char * const origs = s;
6562 while (*s && *s != '\n')
6566 const char * const t = ++s;
6584 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6586 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6587 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6589 PERL_UNUSED_CONTEXT;
6590 PERL_UNUSED_ARG(sv);
6592 PERL_UNUSED_ARG(prog);
6593 #endif /* DEBUGGING */
6597 Perl_re_intuit_string(pTHX_ regexp *prog)
6598 { /* Assume that RE_INTUIT is set */
6600 GET_RE_DEBUG_FLAGS_DECL;
6601 PERL_UNUSED_CONTEXT;
6605 const char * const s = SvPV_nolen_const(prog->check_substr
6606 ? prog->check_substr : prog->check_utf8);
6608 if (!PL_colorset) reginitcolors();
6609 PerlIO_printf(Perl_debug_log,
6610 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6612 prog->check_substr ? "" : "utf8 ",
6613 PL_colors[5],PL_colors[0],
6616 (strlen(s) > 60 ? "..." : ""));
6619 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6623 Perl_pregfree(pTHX_ struct regexp *r)
6629 GET_RE_DEBUG_FLAGS_DECL;
6631 if (!r || (--r->refcnt > 0))
6633 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
6634 RE_PV_DISPLAY_DECL(s, len, (r->reganch & ROPT_UTF8),
6635 PERL_DEBUG_PAD_ZERO(0), r->precomp, r->prelen, 60);
6639 PerlIO_printf(Perl_debug_log,
6640 "%sFreeing REx:%s %s%*.*s%s%s\n",
6641 PL_colors[4],PL_colors[5],PL_colors[0],
6644 len > 60 ? "..." : "");
6647 /* gcov results gave these as non-null 100% of the time, so there's no
6648 optimisation in checking them before calling Safefree */
6649 Safefree(r->precomp);
6650 Safefree(r->offsets); /* 20010421 MJD */
6651 RX_MATCH_COPY_FREE(r);
6652 #ifdef PERL_OLD_COPY_ON_WRITE
6654 SvREFCNT_dec(r->saved_copy);
6657 if (r->anchored_substr)
6658 SvREFCNT_dec(r->anchored_substr);
6659 if (r->anchored_utf8)
6660 SvREFCNT_dec(r->anchored_utf8);
6661 if (r->float_substr)
6662 SvREFCNT_dec(r->float_substr);
6664 SvREFCNT_dec(r->float_utf8);
6665 Safefree(r->substrs);
6668 int n = r->data->count;
6669 PAD* new_comppad = NULL;
6674 /* If you add a ->what type here, update the comment in regcomp.h */
6675 switch (r->data->what[n]) {
6677 SvREFCNT_dec((SV*)r->data->data[n]);
6680 Safefree(r->data->data[n]);
6683 new_comppad = (AV*)r->data->data[n];
6686 if (new_comppad == NULL)
6687 Perl_croak(aTHX_ "panic: pregfree comppad");
6688 PAD_SAVE_LOCAL(old_comppad,
6689 /* Watch out for global destruction's random ordering. */
6690 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6693 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6696 op_free((OP_4tree*)r->data->data[n]);
6698 PAD_RESTORE_LOCAL(old_comppad);
6699 SvREFCNT_dec((SV*)new_comppad);
6705 { /* Aho Corasick add-on structure for a trie node.
6706 Used in stclass optimization only */
6708 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6710 refcount = --aho->refcount;
6713 Safefree(aho->states);
6714 Safefree(aho->fail);
6715 aho->trie=NULL; /* not necessary to free this as it is
6716 handled by the 't' case */
6717 Safefree(r->data->data[n]); /* do this last!!!! */
6718 Safefree(r->regstclass);
6724 /* trie structure. */
6726 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6728 refcount = --trie->refcount;
6731 Safefree(trie->charmap);
6732 if (trie->widecharmap)
6733 SvREFCNT_dec((SV*)trie->widecharmap);
6734 Safefree(trie->states);
6735 Safefree(trie->trans);
6737 Safefree(trie->bitmap);
6739 Safefree(trie->wordlen);
6743 SvREFCNT_dec((SV*)trie->words);
6744 if (trie->revcharmap)
6745 SvREFCNT_dec((SV*)trie->revcharmap);
6748 Safefree(r->data->data[n]); /* do this last!!!! */
6753 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6756 Safefree(r->data->what);
6759 Safefree(r->startp);
6764 #ifndef PERL_IN_XSUB_RE
6766 - regnext - dig the "next" pointer out of a node
6769 Perl_regnext(pTHX_ register regnode *p)
6772 register I32 offset;
6774 if (p == &PL_regdummy)
6777 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6786 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6789 STRLEN l1 = strlen(pat1);
6790 STRLEN l2 = strlen(pat2);
6793 const char *message;
6799 Copy(pat1, buf, l1 , char);
6800 Copy(pat2, buf + l1, l2 , char);
6801 buf[l1 + l2] = '\n';
6802 buf[l1 + l2 + 1] = '\0';
6804 /* ANSI variant takes additional second argument */
6805 va_start(args, pat2);
6809 msv = vmess(buf, &args);
6811 message = SvPV_const(msv,l1);
6814 Copy(message, buf, l1 , char);
6815 buf[l1-1] = '\0'; /* Overwrite \n */
6816 Perl_croak(aTHX_ "%s", buf);
6819 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6821 #ifndef PERL_IN_XSUB_RE
6823 Perl_save_re_context(pTHX)
6827 struct re_save_state *state;
6829 SAVEVPTR(PL_curcop);
6830 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6832 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6833 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6834 SSPUSHINT(SAVEt_RE_STATE);
6836 Copy(&PL_reg_state, state, 1, struct re_save_state);
6838 PL_reg_start_tmp = 0;
6839 PL_reg_start_tmpl = 0;
6840 PL_reg_oldsaved = NULL;
6841 PL_reg_oldsavedlen = 0;
6843 PL_reg_leftiter = 0;
6844 PL_reg_poscache = NULL;
6845 PL_reg_poscache_size = 0;
6846 #ifdef PERL_OLD_COPY_ON_WRITE
6850 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6852 const REGEXP * const rx = PM_GETRE(PL_curpm);
6855 for (i = 1; i <= rx->nparens; i++) {
6856 char digits[TYPE_CHARS(long)];
6857 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6858 GV *const *const gvp
6859 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6862 GV * const gv = *gvp;
6863 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6873 clear_re(pTHX_ void *r)
6876 ReREFCNT_dec((regexp *)r);
6882 S_put_byte(pTHX_ SV *sv, int c)
6884 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6885 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6886 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6887 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6889 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6892 #define CLEAR_OPTSTART \
6893 if (optstart) STMT_START { \
6894 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6898 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6900 STATIC const regnode *
6901 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6902 const regnode *last, SV* sv, I32 l)
6905 register U8 op = EXACT; /* Arbitrary non-END op. */
6906 register const regnode *next;
6907 const regnode *optstart= NULL;
6908 GET_RE_DEBUG_FLAGS_DECL;
6910 while (op != END && (!last || node < last)) {
6911 /* While that wasn't END last time... */
6917 next = regnext((regnode *)node);
6920 if (OP(node) == OPTIMIZED) {
6921 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
6928 regprop(r, sv, node);
6929 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6930 (int)(2*l + 1), "", SvPVX_const(sv));
6932 if (OP(node) != OPTIMIZED) {
6933 if (next == NULL) /* Next ptr. */
6934 PerlIO_printf(Perl_debug_log, "(0)");
6936 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6937 (void)PerlIO_putc(Perl_debug_log, '\n');
6941 if (PL_regkind[(U8)op] == BRANCHJ) {
6944 register const regnode *nnode = (OP(next) == LONGJMP
6945 ? regnext((regnode *)next)
6947 if (last && nnode > last)
6949 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6952 else if (PL_regkind[(U8)op] == BRANCH) {
6954 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6956 else if ( PL_regkind[(U8)op] == TRIE ) {
6957 const I32 n = ARG(node);
6958 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6959 const I32 arry_len = av_len(trie->words)+1;
6961 PerlIO_printf(Perl_debug_log,
6962 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
6966 TRIE_WORDCOUNT(trie),
6967 (int)TRIE_CHARCOUNT(trie),
6968 trie->uniquecharcount,
6969 (IV)TRIE_LASTSTATE(trie)-1,
6976 sv_setpvn(sv, "", 0);
6977 for (i = 0; i <= 256; i++) {
6978 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6979 if (rangestart == -1)
6981 } else if (rangestart != -1) {
6982 if (i <= rangestart + 3)
6983 for (; rangestart < i; rangestart++)
6984 put_byte(sv, rangestart);
6986 put_byte(sv, rangestart);
6988 put_byte(sv, i - 1);
6993 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
6995 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
6997 for (word_idx=0; word_idx < arry_len; word_idx++) {
6998 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
7000 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
7003 SvPV_nolen_const(*elem_ptr),
7009 node = NEXTOPER(node);
7010 node += regarglen[(U8)op];
7013 else if ( op == CURLY) { /* "next" might be very big: optimizer */
7014 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7015 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7017 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7019 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7022 else if ( op == PLUS || op == STAR) {
7023 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
7025 else if (op == ANYOF) {
7026 /* arglen 1 + class block */
7027 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7028 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7029 node = NEXTOPER(node);
7031 else if (PL_regkind[(U8)op] == EXACT) {
7032 /* Literal string, where present. */
7033 node += NODE_SZ_STR(node) - 1;
7034 node = NEXTOPER(node);
7037 node = NEXTOPER(node);
7038 node += regarglen[(U8)op];
7040 if (op == CURLYX || op == OPEN)
7042 else if (op == WHILEM)
7049 #endif /* DEBUGGING */
7053 * c-indentation-style: bsd
7055 * indent-tabs-mode: t
7058 * ex: set ts=8 sts=4 sw=4 noet: