5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 if (UTF) SvUTF8_on(tmp); \
758 av_push( TRIE_REVCHARMAP(trie), tmp ); \
761 #define TRIE_READ_CHAR STMT_START { \
765 if ( foldlen > 0 ) { \
766 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
772 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
773 foldlen -= UNISKIP( uvc ); \
774 scan = foldbuf + UNISKIP( uvc ); \
777 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
786 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
787 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
788 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
789 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
791 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
792 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
793 TRIE_LIST_LEN( state ) *= 2; \
794 Renew( trie->states[ state ].trans.list, \
795 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
798 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
799 TRIE_LIST_CUR( state )++; \
802 #define TRIE_LIST_NEW(state) STMT_START { \
803 Newxz( trie->states[ state ].trans.list, \
804 4, reg_trie_trans_le ); \
805 TRIE_LIST_CUR( state ) = 1; \
806 TRIE_LIST_LEN( state ) = 4; \
809 #define TRIE_HANDLE_WORD(state) STMT_START { \
810 if ( !trie->states[ state ].wordnum ) { \
811 /* we haven't inserted this word into the structure yet. */ \
813 trie->wordlen[ curword ] = wordlen; \
814 trie->states[ state ].wordnum = ++curword; \
816 /* store the word for dumping */ \
818 if (OP(noper) != NOTHING) \
819 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
821 tmp = newSVpvn( "", 0 ); \
822 if ( UTF ) SvUTF8_on( tmp ); \
823 av_push( trie->words, tmp ); \
826 NOOP; /* It's a dupe. So ignore it. */ \
833 dump_trie_interim_list(trie,next_alloc)
834 dump_trie_interim_table(trie,next_alloc)
836 These routines dump out a trie in a somewhat readable format.
837 The _interim_ variants are used for debugging the interim
838 tables that are used to generate the final compressed
839 representation which is what dump_trie expects.
841 Part of the reason for their existance is to provide a form
842 of documentation as to how the different representations function.
848 Dumps the final compressed table form of the trie to Perl_debug_log.
849 Used for debugging make_trie().
853 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
856 SV *sv=sv_newmortal();
857 int colwidth= trie->widecharmap ? 6 : 4;
858 GET_RE_DEBUG_FLAGS_DECL;
861 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
862 (int)depth * 2 + 2,"",
863 "Match","Base","Ofs" );
865 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
866 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
868 PerlIO_printf( Perl_debug_log, "%*s",
870 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
871 PL_colors[0], PL_colors[1],
872 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
873 PERL_PV_ESCAPE_FIRSTCHAR
878 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
879 (int)depth * 2 + 2,"");
881 for( state = 0 ; state < trie->uniquecharcount ; state++ )
882 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
883 PerlIO_printf( Perl_debug_log, "\n");
885 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
886 const U32 base = trie->states[ state ].trans.base;
888 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
890 if ( trie->states[ state ].wordnum ) {
891 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
893 PerlIO_printf( Perl_debug_log, "%6s", "" );
896 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
901 while( ( base + ofs < trie->uniquecharcount ) ||
902 ( base + ofs - trie->uniquecharcount < trie->lasttrans
903 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
906 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
908 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
909 if ( ( base + ofs >= trie->uniquecharcount ) &&
910 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
911 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
913 PerlIO_printf( Perl_debug_log, "%*"UVXf,
915 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
917 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
921 PerlIO_printf( Perl_debug_log, "]");
924 PerlIO_printf( Perl_debug_log, "\n" );
928 dump_trie_interim_list(trie,next_alloc)
929 Dumps a fully constructed but uncompressed trie in list form.
930 List tries normally only are used for construction when the number of
931 possible chars (trie->uniquecharcount) is very high.
932 Used for debugging make_trie().
935 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
938 SV *sv=sv_newmortal();
939 int colwidth= trie->widecharmap ? 6 : 4;
940 GET_RE_DEBUG_FLAGS_DECL;
941 /* print out the table precompression. */
942 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
943 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
944 "------:-----+-----------------\n" );
946 for( state=1 ; state < next_alloc ; state ++ ) {
949 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
950 (int)depth * 2 + 2,"", (UV)state );
951 if ( ! trie->states[ state ].wordnum ) {
952 PerlIO_printf( Perl_debug_log, "%5s| ","");
954 PerlIO_printf( Perl_debug_log, "W%4x| ",
955 trie->states[ state ].wordnum
958 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
959 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
961 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
963 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
964 PL_colors[0], PL_colors[1],
965 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
966 PERL_PV_ESCAPE_FIRSTCHAR
968 TRIE_LIST_ITEM(state,charid).forid,
969 (UV)TRIE_LIST_ITEM(state,charid).newstate
973 PerlIO_printf( Perl_debug_log, "\n");
978 dump_trie_interim_table(trie,next_alloc)
979 Dumps a fully constructed but uncompressed trie in table form.
980 This is the normal DFA style state transition table, with a few
981 twists to facilitate compression later.
982 Used for debugging make_trie().
985 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
989 SV *sv=sv_newmortal();
990 int colwidth= trie->widecharmap ? 6 : 4;
991 GET_RE_DEBUG_FLAGS_DECL;
994 print out the table precompression so that we can do a visual check
995 that they are identical.
998 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1000 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1001 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
1003 PerlIO_printf( Perl_debug_log, "%*s",
1005 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1006 PL_colors[0], PL_colors[1],
1007 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1008 PERL_PV_ESCAPE_FIRSTCHAR
1014 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1016 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1017 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1020 PerlIO_printf( Perl_debug_log, "\n" );
1022 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1024 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1025 (int)depth * 2 + 2,"",
1026 (UV)TRIE_NODENUM( state ) );
1028 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1029 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1031 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1033 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1035 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1036 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1038 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1039 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1046 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1047 ( ( base + charid >= ucharcount \
1048 && base + charid < ubound \
1049 && state == trie->trans[ base - ucharcount + charid ].check \
1050 && trie->trans[ base - ucharcount + charid ].next ) \
1051 ? trie->trans[ base - ucharcount + charid ].next \
1052 : ( state==1 ? special : 0 ) \
1056 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1058 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1060 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1061 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1064 We find the fail state for each state in the trie, this state is the longest proper
1065 suffix of the current states 'word' that is also a proper prefix of another word in our
1066 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1067 the DFA not to have to restart after its tried and failed a word at a given point, it
1068 simply continues as though it had been matching the other word in the first place.
1070 'abcdgu'=~/abcdefg|cdgu/
1071 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1072 fail, which would bring use to the state representing 'd' in the second word where we would
1073 try 'g' and succeed, prodceding to match 'cdgu'.
1075 /* add a fail transition */
1076 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1078 const U32 ucharcount = trie->uniquecharcount;
1079 const U32 numstates = trie->laststate;
1080 const U32 ubound = trie->lasttrans + ucharcount;
1084 U32 base = trie->states[ 1 ].trans.base;
1087 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1088 GET_RE_DEBUG_FLAGS_DECL;
1090 PERL_UNUSED_ARG(depth);
1094 ARG_SET( stclass, data_slot );
1095 Newxz( aho, 1, reg_ac_data );
1096 RExC_rx->data->data[ data_slot ] = (void*)aho;
1098 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1099 (trie->laststate+1)*sizeof(reg_trie_state));
1100 Newxz( q, numstates, U32);
1101 Newxz( aho->fail, numstates, U32 );
1104 fail[ 0 ] = fail[ 1 ] = 1;
1106 for ( charid = 0; charid < ucharcount ; charid++ ) {
1107 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1109 q[ q_write ] = newstate;
1110 /* set to point at the root */
1111 fail[ q[ q_write++ ] ]=1;
1114 while ( q_read < q_write) {
1115 const U32 cur = q[ q_read++ % numstates ];
1116 base = trie->states[ cur ].trans.base;
1118 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1119 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1121 U32 fail_state = cur;
1124 fail_state = fail[ fail_state ];
1125 fail_base = aho->states[ fail_state ].trans.base;
1126 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1128 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1129 fail[ ch_state ] = fail_state;
1130 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1132 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1134 q[ q_write++ % numstates] = ch_state;
1139 DEBUG_TRIE_COMPILE_MORE_r({
1140 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1141 for( q_read=2; q_read<numstates; q_read++ ) {
1142 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1144 PerlIO_printf(Perl_debug_log, "\n");
1147 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1153 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1156 /* first pass, loop through and scan words */
1157 reg_trie_data *trie;
1159 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1164 /* we just use folder as a flag in utf8 */
1165 const U8 * const folder = ( flags == EXACTF
1167 : ( flags == EXACTFL
1173 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1174 SV *re_trie_maxbuff;
1176 /* these are only used during construction but are useful during
1177 * debugging so we store them in the struct when debugging.
1178 * Wordcount is actually superfluous in debugging as we have
1179 * (AV*)trie->words to use for it, but that's not available when
1180 * not debugging... We could make the macro use the AV during
1181 * debugging though...
1183 U16 trie_wordcount=0;
1184 STRLEN trie_charcount=0;
1185 /*U32 trie_laststate=0;*/
1186 AV *trie_revcharmap;
1188 GET_RE_DEBUG_FLAGS_DECL;
1190 PERL_UNUSED_ARG(depth);
1193 Newxz( trie, 1, reg_trie_data );
1195 trie->startstate = 1;
1196 RExC_rx->data->data[ data_slot ] = (void*)trie;
1197 Newxz( trie->charmap, 256, U16 );
1198 if (!(UTF && folder))
1199 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1201 trie->words = newAV();
1203 TRIE_REVCHARMAP(trie) = newAV();
1205 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1206 if (!SvIOK(re_trie_maxbuff)) {
1207 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1210 PerlIO_printf( Perl_debug_log,
1211 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1212 (int)depth * 2 + 2, "",
1213 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1214 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1216 /* -- First loop and Setup --
1218 We first traverse the branches and scan each word to determine if it
1219 contains widechars, and how many unique chars there are, this is
1220 important as we have to build a table with at least as many columns as we
1223 We use an array of integers to represent the character codes 0..255
1224 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1225 native representation of the character value as the key and IV's for the
1228 *TODO* If we keep track of how many times each character is used we can
1229 remap the columns so that the table compression later on is more
1230 efficient in terms of memory by ensuring most common value is in the
1231 middle and the least common are on the outside. IMO this would be better
1232 than a most to least common mapping as theres a decent chance the most
1233 common letter will share a node with the least common, meaning the node
1234 will not be compressable. With a middle is most common approach the worst
1235 case is when we have the least common nodes twice.
1239 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1240 regnode * const noper = NEXTOPER( cur );
1241 const U8 *uc = (U8*)STRING( noper );
1242 const U8 * const e = uc + STR_LEN( noper );
1244 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1245 const U8 *scan = (U8*)NULL;
1246 U32 wordlen = 0; /* required init */
1249 TRIE_WORDCOUNT(trie)++;
1250 if (OP(noper) == NOTHING) {
1255 TRIE_BITMAP_SET(trie,*uc);
1256 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1258 for ( ; uc < e ; uc += len ) {
1259 TRIE_CHARCOUNT(trie)++;
1263 if ( !trie->charmap[ uvc ] ) {
1264 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1266 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1271 if ( !trie->widecharmap )
1272 trie->widecharmap = newHV();
1274 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1277 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1279 if ( !SvTRUE( *svpp ) ) {
1280 sv_setiv( *svpp, ++trie->uniquecharcount );
1285 if( cur == first ) {
1288 } else if (chars < trie->minlen) {
1290 } else if (chars > trie->maxlen) {
1294 } /* end first pass */
1295 DEBUG_TRIE_COMPILE_r(
1296 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1297 (int)depth * 2 + 2,"",
1298 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1299 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1300 (int)trie->minlen, (int)trie->maxlen )
1302 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1305 We now know what we are dealing with in terms of unique chars and
1306 string sizes so we can calculate how much memory a naive
1307 representation using a flat table will take. If it's over a reasonable
1308 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1309 conservative but potentially much slower representation using an array
1312 At the end we convert both representations into the same compressed
1313 form that will be used in regexec.c for matching with. The latter
1314 is a form that cannot be used to construct with but has memory
1315 properties similar to the list form and access properties similar
1316 to the table form making it both suitable for fast searches and
1317 small enough that its feasable to store for the duration of a program.
1319 See the comment in the code where the compressed table is produced
1320 inplace from the flat tabe representation for an explanation of how
1321 the compression works.
1326 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1328 Second Pass -- Array Of Lists Representation
1330 Each state will be represented by a list of charid:state records
1331 (reg_trie_trans_le) the first such element holds the CUR and LEN
1332 points of the allocated array. (See defines above).
1334 We build the initial structure using the lists, and then convert
1335 it into the compressed table form which allows faster lookups
1336 (but cant be modified once converted).
1339 STRLEN transcount = 1;
1341 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1345 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1347 regnode * const noper = NEXTOPER( cur );
1348 U8 *uc = (U8*)STRING( noper );
1349 const U8 * const e = uc + STR_LEN( noper );
1350 U32 state = 1; /* required init */
1351 U16 charid = 0; /* sanity init */
1352 U8 *scan = (U8*)NULL; /* sanity init */
1353 STRLEN foldlen = 0; /* required init */
1354 U32 wordlen = 0; /* required init */
1355 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1357 if (OP(noper) != NOTHING) {
1358 for ( ; uc < e ; uc += len ) {
1363 charid = trie->charmap[ uvc ];
1365 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1369 charid=(U16)SvIV( *svpp );
1378 if ( !trie->states[ state ].trans.list ) {
1379 TRIE_LIST_NEW( state );
1381 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1382 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1383 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1388 newstate = next_alloc++;
1389 TRIE_LIST_PUSH( state, charid, newstate );
1394 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1396 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1399 TRIE_HANDLE_WORD(state);
1401 } /* end second pass */
1403 TRIE_LASTSTATE(trie) = next_alloc;
1404 Renew( trie->states, next_alloc, reg_trie_state );
1406 /* and now dump it out before we compress it */
1407 DEBUG_TRIE_COMPILE_MORE_r(
1408 dump_trie_interim_list(trie,next_alloc,depth+1)
1411 Newxz( trie->trans, transcount ,reg_trie_trans );
1418 for( state=1 ; state < next_alloc ; state ++ ) {
1422 DEBUG_TRIE_COMPILE_MORE_r(
1423 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1427 if (trie->states[state].trans.list) {
1428 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1432 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1433 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1434 if ( forid < minid ) {
1436 } else if ( forid > maxid ) {
1440 if ( transcount < tp + maxid - minid + 1) {
1442 Renew( trie->trans, transcount, reg_trie_trans );
1443 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1445 base = trie->uniquecharcount + tp - minid;
1446 if ( maxid == minid ) {
1448 for ( ; zp < tp ; zp++ ) {
1449 if ( ! trie->trans[ zp ].next ) {
1450 base = trie->uniquecharcount + zp - minid;
1451 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1452 trie->trans[ zp ].check = state;
1458 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1459 trie->trans[ tp ].check = state;
1464 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1465 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1466 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1467 trie->trans[ tid ].check = state;
1469 tp += ( maxid - minid + 1 );
1471 Safefree(trie->states[ state ].trans.list);
1474 DEBUG_TRIE_COMPILE_MORE_r(
1475 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1478 trie->states[ state ].trans.base=base;
1480 trie->lasttrans = tp + 1;
1484 Second Pass -- Flat Table Representation.
1486 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1487 We know that we will need Charcount+1 trans at most to store the data
1488 (one row per char at worst case) So we preallocate both structures
1489 assuming worst case.
1491 We then construct the trie using only the .next slots of the entry
1494 We use the .check field of the first entry of the node temporarily to
1495 make compression both faster and easier by keeping track of how many non
1496 zero fields are in the node.
1498 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1501 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1502 number representing the first entry of the node, and state as a
1503 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1504 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1505 are 2 entrys per node. eg:
1513 The table is internally in the right hand, idx form. However as we also
1514 have to deal with the states array which is indexed by nodenum we have to
1515 use TRIE_NODENUM() to convert.
1520 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1522 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1523 next_alloc = trie->uniquecharcount + 1;
1526 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1528 regnode * const noper = NEXTOPER( cur );
1529 const U8 *uc = (U8*)STRING( noper );
1530 const U8 * const e = uc + STR_LEN( noper );
1532 U32 state = 1; /* required init */
1534 U16 charid = 0; /* sanity init */
1535 U32 accept_state = 0; /* sanity init */
1536 U8 *scan = (U8*)NULL; /* sanity init */
1538 STRLEN foldlen = 0; /* required init */
1539 U32 wordlen = 0; /* required init */
1540 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1542 if ( OP(noper) != NOTHING ) {
1543 for ( ; uc < e ; uc += len ) {
1548 charid = trie->charmap[ uvc ];
1550 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1551 charid = svpp ? (U16)SvIV(*svpp) : 0;
1555 if ( !trie->trans[ state + charid ].next ) {
1556 trie->trans[ state + charid ].next = next_alloc;
1557 trie->trans[ state ].check++;
1558 next_alloc += trie->uniquecharcount;
1560 state = trie->trans[ state + charid ].next;
1562 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1564 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1567 accept_state = TRIE_NODENUM( state );
1568 TRIE_HANDLE_WORD(accept_state);
1570 } /* end second pass */
1572 /* and now dump it out before we compress it */
1573 DEBUG_TRIE_COMPILE_MORE_r(
1574 dump_trie_interim_table(trie,next_alloc,depth+1)
1579 * Inplace compress the table.*
1581 For sparse data sets the table constructed by the trie algorithm will
1582 be mostly 0/FAIL transitions or to put it another way mostly empty.
1583 (Note that leaf nodes will not contain any transitions.)
1585 This algorithm compresses the tables by eliminating most such
1586 transitions, at the cost of a modest bit of extra work during lookup:
1588 - Each states[] entry contains a .base field which indicates the
1589 index in the state[] array wheres its transition data is stored.
1591 - If .base is 0 there are no valid transitions from that node.
1593 - If .base is nonzero then charid is added to it to find an entry in
1596 -If trans[states[state].base+charid].check!=state then the
1597 transition is taken to be a 0/Fail transition. Thus if there are fail
1598 transitions at the front of the node then the .base offset will point
1599 somewhere inside the previous nodes data (or maybe even into a node
1600 even earlier), but the .check field determines if the transition is
1603 The following process inplace converts the table to the compressed
1604 table: We first do not compress the root node 1,and mark its all its
1605 .check pointers as 1 and set its .base pointer as 1 as well. This
1606 allows to do a DFA construction from the compressed table later, and
1607 ensures that any .base pointers we calculate later are greater than
1610 - We set 'pos' to indicate the first entry of the second node.
1612 - We then iterate over the columns of the node, finding the first and
1613 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1614 and set the .check pointers accordingly, and advance pos
1615 appropriately and repreat for the next node. Note that when we copy
1616 the next pointers we have to convert them from the original
1617 NODEIDX form to NODENUM form as the former is not valid post
1620 - If a node has no transitions used we mark its base as 0 and do not
1621 advance the pos pointer.
1623 - If a node only has one transition we use a second pointer into the
1624 structure to fill in allocated fail transitions from other states.
1625 This pointer is independent of the main pointer and scans forward
1626 looking for null transitions that are allocated to a state. When it
1627 finds one it writes the single transition into the "hole". If the
1628 pointer doesnt find one the single transition is appeneded as normal.
1630 - Once compressed we can Renew/realloc the structures to release the
1633 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1634 specifically Fig 3.47 and the associated pseudocode.
1638 const U32 laststate = TRIE_NODENUM( next_alloc );
1641 TRIE_LASTSTATE(trie) = laststate;
1643 for ( state = 1 ; state < laststate ; state++ ) {
1645 const U32 stateidx = TRIE_NODEIDX( state );
1646 const U32 o_used = trie->trans[ stateidx ].check;
1647 U32 used = trie->trans[ stateidx ].check;
1648 trie->trans[ stateidx ].check = 0;
1650 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1651 if ( flag || trie->trans[ stateidx + charid ].next ) {
1652 if ( trie->trans[ stateidx + charid ].next ) {
1654 for ( ; zp < pos ; zp++ ) {
1655 if ( ! trie->trans[ zp ].next ) {
1659 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1660 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1661 trie->trans[ zp ].check = state;
1662 if ( ++zp > pos ) pos = zp;
1669 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1671 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1672 trie->trans[ pos ].check = state;
1677 trie->lasttrans = pos + 1;
1678 Renew( trie->states, laststate + 1, reg_trie_state);
1679 DEBUG_TRIE_COMPILE_MORE_r(
1680 PerlIO_printf( Perl_debug_log,
1681 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1682 (int)depth * 2 + 2,"",
1683 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1686 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1689 } /* end table compress */
1691 /* resize the trans array to remove unused space */
1692 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1694 /* and now dump out the compressed format */
1695 DEBUG_TRIE_COMPILE_r(
1696 dump_trie(trie,depth+1)
1699 { /* Modify the program and insert the new TRIE node*/
1701 U8 nodetype =(U8)(flags & 0xFF);
1708 This means we convert either the first branch or the first Exact,
1709 depending on whether the thing following (in 'last') is a branch
1710 or not and whther first is the startbranch (ie is it a sub part of
1711 the alternation or is it the whole thing.)
1712 Assuming its a sub part we conver the EXACT otherwise we convert
1713 the whole branch sequence, including the first.
1715 /* Find the node we are going to overwrite */
1716 if ( first == startbranch && OP( last ) != BRANCH ) {
1717 /* whole branch chain */
1720 const regnode *nop = NEXTOPER( convert );
1721 mjd_offset= Node_Offset((nop));
1722 mjd_nodelen= Node_Length((nop));
1725 /* branch sub-chain */
1726 convert = NEXTOPER( first );
1727 NEXT_OFF( first ) = (U16)(last - first);
1729 mjd_offset= Node_Offset((convert));
1730 mjd_nodelen= Node_Length((convert));
1734 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1735 (int)depth * 2 + 2, "",
1736 mjd_offset,mjd_nodelen)
1739 /* But first we check to see if there is a common prefix we can
1740 split out as an EXACT and put in front of the TRIE node. */
1741 trie->startstate= 1;
1742 if ( trie->bitmap && !trie->widecharmap ) {
1745 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1746 (int)depth * 2 + 2, "",
1747 TRIE_LASTSTATE(trie))
1749 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1753 const U32 base = trie->states[ state ].trans.base;
1755 if ( trie->states[state].wordnum )
1758 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1759 if ( ( base + ofs >= trie->uniquecharcount ) &&
1760 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1761 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1763 if ( ++count > 1 ) {
1764 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1765 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1766 if ( state == 1 ) break;
1768 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1770 PerlIO_printf(Perl_debug_log,
1771 "%*sNew Start State=%"UVuf" Class: [",
1772 (int)depth * 2 + 2, "",
1775 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1776 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1778 TRIE_BITMAP_SET(trie,*ch);
1780 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1782 PerlIO_printf(Perl_debug_log, (char*)ch)
1786 TRIE_BITMAP_SET(trie,*ch);
1788 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1789 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1795 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1796 const char *ch = SvPV_nolen_const( *tmp );
1798 PerlIO_printf( Perl_debug_log,
1799 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1800 (int)depth * 2 + 2, "",
1804 OP( convert ) = nodetype;
1805 str=STRING(convert);
1814 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1820 regnode *n = convert+NODE_SZ_STR(convert);
1821 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1822 trie->startstate = state;
1823 trie->minlen -= (state - 1);
1824 trie->maxlen -= (state - 1);
1826 regnode *fix = convert;
1828 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1829 while( ++fix < n ) {
1830 Set_Node_Offset_Length(fix, 0, 0);
1836 NEXT_OFF(convert) = (U16)(tail - convert);
1840 if ( trie->maxlen ) {
1841 OP( convert ) = TRIE;
1842 NEXT_OFF( convert ) = (U16)(tail - convert);
1843 ARG_SET( convert, data_slot );
1845 /* store the type in the flags */
1846 convert->flags = nodetype;
1847 /* XXX We really should free up the resource in trie now, as we wont use them */
1849 /* needed for dumping*/
1851 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1852 regnode *opt = convert;
1853 while (++opt<optimize) {
1854 Set_Node_Offset_Length(opt,0,0);
1856 /* We now need to mark all of the space originally used by the
1857 branches as optimized away. This keeps the dumpuntil from
1858 throwing a wobbly as it doesnt use regnext() to traverse the
1860 We also "fix" the offsets
1862 while( optimize < last ) {
1863 mjd_nodelen += Node_Length((optimize));
1864 OP( optimize ) = OPTIMIZED;
1865 Set_Node_Offset_Length(optimize,0,0);
1868 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1870 } /* end node insert */
1872 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1878 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1879 * These need to be revisited when a newer toolchain becomes available.
1881 #if defined(__sparc64__) && defined(__GNUC__)
1882 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1883 # undef SPARC64_GCC_WORKAROUND
1884 # define SPARC64_GCC_WORKAROUND 1
1888 #define DEBUG_PEEP(str,scan,depth) \
1889 DEBUG_OPTIMISE_r({ \
1890 SV * const mysv=sv_newmortal(); \
1891 regnode *Next = regnext(scan); \
1892 regprop(RExC_rx, mysv, scan); \
1893 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1894 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1895 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1898 #define JOIN_EXACT(scan,min,flags) \
1899 if (PL_regkind[OP(scan)] == EXACT) \
1900 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1903 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1904 /* Merge several consecutive EXACTish nodes into one. */
1905 regnode *n = regnext(scan);
1907 regnode *next = scan + NODE_SZ_STR(scan);
1911 regnode *stop = scan;
1912 GET_RE_DEBUG_FLAGS_DECL;
1914 PERL_UNUSED_ARG(depth);
1916 #ifndef EXPERIMENTAL_INPLACESCAN
1917 PERL_UNUSED_ARG(flags);
1918 PERL_UNUSED_ARG(val);
1920 DEBUG_PEEP("join",scan,depth);
1922 /* Skip NOTHING, merge EXACT*. */
1924 ( PL_regkind[OP(n)] == NOTHING ||
1925 (stringok && (OP(n) == OP(scan))))
1927 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1929 if (OP(n) == TAIL || n > next)
1931 if (PL_regkind[OP(n)] == NOTHING) {
1932 DEBUG_PEEP("skip:",n,depth);
1933 NEXT_OFF(scan) += NEXT_OFF(n);
1934 next = n + NODE_STEP_REGNODE;
1941 else if (stringok) {
1942 const int oldl = STR_LEN(scan);
1943 regnode * const nnext = regnext(n);
1945 DEBUG_PEEP("merg",n,depth);
1948 if (oldl + STR_LEN(n) > U8_MAX)
1950 NEXT_OFF(scan) += NEXT_OFF(n);
1951 STR_LEN(scan) += STR_LEN(n);
1952 next = n + NODE_SZ_STR(n);
1953 /* Now we can overwrite *n : */
1954 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1962 #ifdef EXPERIMENTAL_INPLACESCAN
1963 if (flags && !NEXT_OFF(n)) {
1964 DEBUG_PEEP("atch", val, depth);
1965 if (reg_off_by_arg[OP(n)]) {
1966 ARG_SET(n, val - n);
1969 NEXT_OFF(n) = val - n;
1976 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1978 Two problematic code points in Unicode casefolding of EXACT nodes:
1980 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1981 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1987 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1988 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1990 This means that in case-insensitive matching (or "loose matching",
1991 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1992 length of the above casefolded versions) can match a target string
1993 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1994 This would rather mess up the minimum length computation.
1996 What we'll do is to look for the tail four bytes, and then peek
1997 at the preceding two bytes to see whether we need to decrease
1998 the minimum length by four (six minus two).
2000 Thanks to the design of UTF-8, there cannot be false matches:
2001 A sequence of valid UTF-8 bytes cannot be a subsequence of
2002 another valid sequence of UTF-8 bytes.
2005 char * const s0 = STRING(scan), *s, *t;
2006 char * const s1 = s0 + STR_LEN(scan) - 1;
2007 char * const s2 = s1 - 4;
2008 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2009 const char t0[] = "\xaf\x49\xaf\x42";
2011 const char t0[] = "\xcc\x88\xcc\x81";
2013 const char * const t1 = t0 + 3;
2016 s < s2 && (t = ninstr(s, s1, t0, t1));
2019 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2020 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2022 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2023 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2031 n = scan + NODE_SZ_STR(scan);
2033 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2040 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2044 /* REx optimizer. Converts nodes into quickier variants "in place".
2045 Finds fixed substrings. */
2047 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2048 to the position after last scanned or to NULL. */
2053 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2054 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2055 /* scanp: Start here (read-write). */
2056 /* deltap: Write maxlen-minlen here. */
2057 /* last: Stop before this one. */
2060 I32 min = 0, pars = 0, code;
2061 regnode *scan = *scanp, *next;
2063 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2064 int is_inf_internal = 0; /* The studied chunk is infinite */
2065 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2066 scan_data_t data_fake;
2067 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2068 SV *re_trie_maxbuff = NULL;
2070 GET_RE_DEBUG_FLAGS_DECL;
2072 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2075 while (scan && OP(scan) != END && scan < last) {
2076 /* Peephole optimizer: */
2077 DEBUG_PEEP("Peep",scan,depth);
2079 JOIN_EXACT(scan,&min,0);
2081 /* Follow the next-chain of the current node and optimize
2082 away all the NOTHINGs from it. */
2083 if (OP(scan) != CURLYX) {
2084 const int max = (reg_off_by_arg[OP(scan)]
2086 /* I32 may be smaller than U16 on CRAYs! */
2087 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2088 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2092 /* Skip NOTHING and LONGJMP. */
2093 while ((n = regnext(n))
2094 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2095 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2096 && off + noff < max)
2098 if (reg_off_by_arg[OP(scan)])
2101 NEXT_OFF(scan) = off;
2106 /* The principal pseudo-switch. Cannot be a switch, since we
2107 look into several different things. */
2108 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2109 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2110 next = regnext(scan);
2112 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2114 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2115 I32 max1 = 0, min1 = I32_MAX, num = 0;
2116 struct regnode_charclass_class accum;
2117 regnode * const startbranch=scan;
2119 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2120 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2121 if (flags & SCF_DO_STCLASS)
2122 cl_init_zero(pRExC_state, &accum);
2124 while (OP(scan) == code) {
2125 I32 deltanext, minnext, f = 0, fake;
2126 struct regnode_charclass_class this_class;
2129 data_fake.flags = 0;
2131 data_fake.whilem_c = data->whilem_c;
2132 data_fake.last_closep = data->last_closep;
2135 data_fake.last_closep = &fake;
2136 next = regnext(scan);
2137 scan = NEXTOPER(scan);
2139 scan = NEXTOPER(scan);
2140 if (flags & SCF_DO_STCLASS) {
2141 cl_init(pRExC_state, &this_class);
2142 data_fake.start_class = &this_class;
2143 f = SCF_DO_STCLASS_AND;
2145 if (flags & SCF_WHILEM_VISITED_POS)
2146 f |= SCF_WHILEM_VISITED_POS;
2148 /* we suppose the run is continuous, last=next...*/
2149 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2150 next, &data_fake, f,depth+1);
2153 if (max1 < minnext + deltanext)
2154 max1 = minnext + deltanext;
2155 if (deltanext == I32_MAX)
2156 is_inf = is_inf_internal = 1;
2158 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2161 if (data_fake.flags & SF_HAS_EVAL)
2162 data->flags |= SF_HAS_EVAL;
2163 data->whilem_c = data_fake.whilem_c;
2165 if (flags & SCF_DO_STCLASS)
2166 cl_or(pRExC_state, &accum, &this_class);
2167 if (code == SUSPEND)
2170 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2172 if (flags & SCF_DO_SUBSTR) {
2173 data->pos_min += min1;
2174 data->pos_delta += max1 - min1;
2175 if (max1 != min1 || is_inf)
2176 data->longest = &(data->longest_float);
2179 delta += max1 - min1;
2180 if (flags & SCF_DO_STCLASS_OR) {
2181 cl_or(pRExC_state, data->start_class, &accum);
2183 cl_and(data->start_class, &and_with);
2184 flags &= ~SCF_DO_STCLASS;
2187 else if (flags & SCF_DO_STCLASS_AND) {
2189 cl_and(data->start_class, &accum);
2190 flags &= ~SCF_DO_STCLASS;
2193 /* Switch to OR mode: cache the old value of
2194 * data->start_class */
2195 StructCopy(data->start_class, &and_with,
2196 struct regnode_charclass_class);
2197 flags &= ~SCF_DO_STCLASS_AND;
2198 StructCopy(&accum, data->start_class,
2199 struct regnode_charclass_class);
2200 flags |= SCF_DO_STCLASS_OR;
2201 data->start_class->flags |= ANYOF_EOS;
2207 Assuming this was/is a branch we are dealing with: 'scan' now
2208 points at the item that follows the branch sequence, whatever
2209 it is. We now start at the beginning of the sequence and look
2215 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2217 If we can find such a subseqence we need to turn the first
2218 element into a trie and then add the subsequent branch exact
2219 strings to the trie.
2223 1. patterns where the whole set of branch can be converted to a trie,
2225 2. patterns where only a subset of the alternations can be
2226 converted to a trie.
2228 In case 1 we can replace the whole set with a single regop
2229 for the trie. In case 2 we need to keep the start and end
2232 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2233 becomes BRANCH TRIE; BRANCH X;
2235 Hypthetically when we know the regex isnt anchored we can
2236 turn a case 1 into a DFA and let it rip... Every time it finds a match
2237 it would just call its tail, no WHILEM/CURLY needed.
2240 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2242 if (!re_trie_maxbuff) {
2243 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2244 if (!SvIOK(re_trie_maxbuff))
2245 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2247 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2249 regnode *first = (regnode *)NULL;
2250 regnode *last = (regnode *)NULL;
2251 regnode *tail = scan;
2256 SV * const mysv = sv_newmortal(); /* for dumping */
2258 /* var tail is used because there may be a TAIL
2259 regop in the way. Ie, the exacts will point to the
2260 thing following the TAIL, but the last branch will
2261 point at the TAIL. So we advance tail. If we
2262 have nested (?:) we may have to move through several
2266 while ( OP( tail ) == TAIL ) {
2267 /* this is the TAIL generated by (?:) */
2268 tail = regnext( tail );
2273 regprop(RExC_rx, mysv, tail );
2274 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2275 (int)depth * 2 + 2, "",
2276 "Looking for TRIE'able sequences. Tail node is: ",
2277 SvPV_nolen_const( mysv )
2283 step through the branches, cur represents each
2284 branch, noper is the first thing to be matched
2285 as part of that branch and noper_next is the
2286 regnext() of that node. if noper is an EXACT
2287 and noper_next is the same as scan (our current
2288 position in the regex) then the EXACT branch is
2289 a possible optimization target. Once we have
2290 two or more consequetive such branches we can
2291 create a trie of the EXACT's contents and stich
2292 it in place. If the sequence represents all of
2293 the branches we eliminate the whole thing and
2294 replace it with a single TRIE. If it is a
2295 subsequence then we need to stitch it in. This
2296 means the first branch has to remain, and needs
2297 to be repointed at the item on the branch chain
2298 following the last branch optimized. This could
2299 be either a BRANCH, in which case the
2300 subsequence is internal, or it could be the
2301 item following the branch sequence in which
2302 case the subsequence is at the end.
2306 /* dont use tail as the end marker for this traverse */
2307 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2308 regnode * const noper = NEXTOPER( cur );
2309 regnode * const noper_next = regnext( noper );
2312 regprop(RExC_rx, mysv, cur);
2313 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2314 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2316 regprop(RExC_rx, mysv, noper);
2317 PerlIO_printf( Perl_debug_log, " -> %s",
2318 SvPV_nolen_const(mysv));
2321 regprop(RExC_rx, mysv, noper_next );
2322 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2323 SvPV_nolen_const(mysv));
2325 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2326 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2328 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2329 : PL_regkind[ OP( noper ) ] == EXACT )
2330 || OP(noper) == NOTHING )
2331 && noper_next == tail && count<U16_MAX)
2334 if ( !first || optype == NOTHING ) {
2335 if (!first) first = cur;
2336 optype = OP( noper );
2342 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2344 if ( PL_regkind[ OP( noper ) ] == EXACT
2345 && noper_next == tail )
2349 optype = OP( noper );
2359 regprop(RExC_rx, mysv, cur);
2360 PerlIO_printf( Perl_debug_log,
2361 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2362 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2366 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2367 #ifdef TRIE_STUDY_OPT
2368 if ( made && startbranch == first ) {
2369 if ( OP(first)!=TRIE )
2370 flags |= SCF_EXACT_TRIE;
2372 regnode *chk=*scanp;
2373 while ( OP( chk ) == OPEN )
2374 chk = regnext( chk );
2376 flags |= SCF_EXACT_TRIE;
2385 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2386 scan = NEXTOPER(NEXTOPER(scan));
2387 } else /* single branch is optimized. */
2388 scan = NEXTOPER(scan);
2391 else if (OP(scan) == EXACT) {
2392 I32 l = STR_LEN(scan);
2395 const U8 * const s = (U8*)STRING(scan);
2396 l = utf8_length(s, s + l);
2397 uc = utf8_to_uvchr(s, NULL);
2399 uc = *((U8*)STRING(scan));
2402 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2403 /* The code below prefers earlier match for fixed
2404 offset, later match for variable offset. */
2405 if (data->last_end == -1) { /* Update the start info. */
2406 data->last_start_min = data->pos_min;
2407 data->last_start_max = is_inf
2408 ? I32_MAX : data->pos_min + data->pos_delta;
2410 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2412 SvUTF8_on(data->last_found);
2414 SV * const sv = data->last_found;
2415 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2416 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2417 if (mg && mg->mg_len >= 0)
2418 mg->mg_len += utf8_length((U8*)STRING(scan),
2419 (U8*)STRING(scan)+STR_LEN(scan));
2421 data->last_end = data->pos_min + l;
2422 data->pos_min += l; /* As in the first entry. */
2423 data->flags &= ~SF_BEFORE_EOL;
2425 if (flags & SCF_DO_STCLASS_AND) {
2426 /* Check whether it is compatible with what we know already! */
2430 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2431 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2432 && (!(data->start_class->flags & ANYOF_FOLD)
2433 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2436 ANYOF_CLASS_ZERO(data->start_class);
2437 ANYOF_BITMAP_ZERO(data->start_class);
2439 ANYOF_BITMAP_SET(data->start_class, uc);
2440 data->start_class->flags &= ~ANYOF_EOS;
2442 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2444 else if (flags & SCF_DO_STCLASS_OR) {
2445 /* false positive possible if the class is case-folded */
2447 ANYOF_BITMAP_SET(data->start_class, uc);
2449 data->start_class->flags |= ANYOF_UNICODE_ALL;
2450 data->start_class->flags &= ~ANYOF_EOS;
2451 cl_and(data->start_class, &and_with);
2453 flags &= ~SCF_DO_STCLASS;
2455 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2456 I32 l = STR_LEN(scan);
2457 UV uc = *((U8*)STRING(scan));
2459 /* Search for fixed substrings supports EXACT only. */
2460 if (flags & SCF_DO_SUBSTR) {
2462 scan_commit(pRExC_state, data);
2465 const U8 * const s = (U8 *)STRING(scan);
2466 l = utf8_length(s, s + l);
2467 uc = utf8_to_uvchr(s, NULL);
2470 if (flags & SCF_DO_SUBSTR)
2472 if (flags & SCF_DO_STCLASS_AND) {
2473 /* Check whether it is compatible with what we know already! */
2477 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2478 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2479 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2481 ANYOF_CLASS_ZERO(data->start_class);
2482 ANYOF_BITMAP_ZERO(data->start_class);
2484 ANYOF_BITMAP_SET(data->start_class, uc);
2485 data->start_class->flags &= ~ANYOF_EOS;
2486 data->start_class->flags |= ANYOF_FOLD;
2487 if (OP(scan) == EXACTFL)
2488 data->start_class->flags |= ANYOF_LOCALE;
2491 else if (flags & SCF_DO_STCLASS_OR) {
2492 if (data->start_class->flags & ANYOF_FOLD) {
2493 /* false positive possible if the class is case-folded.
2494 Assume that the locale settings are the same... */
2496 ANYOF_BITMAP_SET(data->start_class, uc);
2497 data->start_class->flags &= ~ANYOF_EOS;
2499 cl_and(data->start_class, &and_with);
2501 flags &= ~SCF_DO_STCLASS;
2503 #ifdef TRIE_STUDY_OPT
2504 else if (OP(scan) == TRIE) {
2505 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
2506 min += trie->minlen;
2507 delta += (trie->maxlen - trie->minlen);
2508 flags &= ~SCF_DO_STCLASS; /* xxx */
2509 if (flags & SCF_DO_SUBSTR) {
2510 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2511 data->pos_min += trie->minlen;
2512 data->pos_delta += (trie->maxlen - trie->minlen);
2513 if (trie->maxlen != trie->minlen)
2514 data->longest = &(data->longest_float);
2518 else if (strchr((const char*)PL_varies,OP(scan))) {
2519 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2520 I32 f = flags, pos_before = 0;
2521 regnode * const oscan = scan;
2522 struct regnode_charclass_class this_class;
2523 struct regnode_charclass_class *oclass = NULL;
2524 I32 next_is_eval = 0;
2526 switch (PL_regkind[OP(scan)]) {
2527 case WHILEM: /* End of (?:...)* . */
2528 scan = NEXTOPER(scan);
2531 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2532 next = NEXTOPER(scan);
2533 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2535 maxcount = REG_INFTY;
2536 next = regnext(scan);
2537 scan = NEXTOPER(scan);
2541 if (flags & SCF_DO_SUBSTR)
2546 if (flags & SCF_DO_STCLASS) {
2548 maxcount = REG_INFTY;
2549 next = regnext(scan);
2550 scan = NEXTOPER(scan);
2553 is_inf = is_inf_internal = 1;
2554 scan = regnext(scan);
2555 if (flags & SCF_DO_SUBSTR) {
2556 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2557 data->longest = &(data->longest_float);
2559 goto optimize_curly_tail;
2561 mincount = ARG1(scan);
2562 maxcount = ARG2(scan);
2563 next = regnext(scan);
2564 if (OP(scan) == CURLYX) {
2565 I32 lp = (data ? *(data->last_closep) : 0);
2566 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2568 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2569 next_is_eval = (OP(scan) == EVAL);
2571 if (flags & SCF_DO_SUBSTR) {
2572 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2573 pos_before = data->pos_min;
2577 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2579 data->flags |= SF_IS_INF;
2581 if (flags & SCF_DO_STCLASS) {
2582 cl_init(pRExC_state, &this_class);
2583 oclass = data->start_class;
2584 data->start_class = &this_class;
2585 f |= SCF_DO_STCLASS_AND;
2586 f &= ~SCF_DO_STCLASS_OR;
2588 /* These are the cases when once a subexpression
2589 fails at a particular position, it cannot succeed
2590 even after backtracking at the enclosing scope.
2592 XXXX what if minimal match and we are at the
2593 initial run of {n,m}? */
2594 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2595 f &= ~SCF_WHILEM_VISITED_POS;
2597 /* This will finish on WHILEM, setting scan, or on NULL: */
2598 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2600 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2602 if (flags & SCF_DO_STCLASS)
2603 data->start_class = oclass;
2604 if (mincount == 0 || minnext == 0) {
2605 if (flags & SCF_DO_STCLASS_OR) {
2606 cl_or(pRExC_state, data->start_class, &this_class);
2608 else if (flags & SCF_DO_STCLASS_AND) {
2609 /* Switch to OR mode: cache the old value of
2610 * data->start_class */
2611 StructCopy(data->start_class, &and_with,
2612 struct regnode_charclass_class);
2613 flags &= ~SCF_DO_STCLASS_AND;
2614 StructCopy(&this_class, data->start_class,
2615 struct regnode_charclass_class);
2616 flags |= SCF_DO_STCLASS_OR;
2617 data->start_class->flags |= ANYOF_EOS;
2619 } else { /* Non-zero len */
2620 if (flags & SCF_DO_STCLASS_OR) {
2621 cl_or(pRExC_state, data->start_class, &this_class);
2622 cl_and(data->start_class, &and_with);
2624 else if (flags & SCF_DO_STCLASS_AND)
2625 cl_and(data->start_class, &this_class);
2626 flags &= ~SCF_DO_STCLASS;
2628 if (!scan) /* It was not CURLYX, but CURLY. */
2630 if ( /* ? quantifier ok, except for (?{ ... }) */
2631 (next_is_eval || !(mincount == 0 && maxcount == 1))
2632 && (minnext == 0) && (deltanext == 0)
2633 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2634 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2635 && ckWARN(WARN_REGEXP))
2638 "Quantifier unexpected on zero-length expression");
2641 min += minnext * mincount;
2642 is_inf_internal |= ((maxcount == REG_INFTY
2643 && (minnext + deltanext) > 0)
2644 || deltanext == I32_MAX);
2645 is_inf |= is_inf_internal;
2646 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2648 /* Try powerful optimization CURLYX => CURLYN. */
2649 if ( OP(oscan) == CURLYX && data
2650 && data->flags & SF_IN_PAR
2651 && !(data->flags & SF_HAS_EVAL)
2652 && !deltanext && minnext == 1 ) {
2653 /* Try to optimize to CURLYN. */
2654 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2655 regnode * const nxt1 = nxt;
2662 if (!strchr((const char*)PL_simple,OP(nxt))
2663 && !(PL_regkind[OP(nxt)] == EXACT
2664 && STR_LEN(nxt) == 1))
2670 if (OP(nxt) != CLOSE)
2672 /* Now we know that nxt2 is the only contents: */
2673 oscan->flags = (U8)ARG(nxt);
2675 OP(nxt1) = NOTHING; /* was OPEN. */
2677 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2678 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2679 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2680 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2681 OP(nxt + 1) = OPTIMIZED; /* was count. */
2682 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2687 /* Try optimization CURLYX => CURLYM. */
2688 if ( OP(oscan) == CURLYX && data
2689 && !(data->flags & SF_HAS_PAR)
2690 && !(data->flags & SF_HAS_EVAL)
2691 && !deltanext /* atom is fixed width */
2692 && minnext != 0 /* CURLYM can't handle zero width */
2694 /* XXXX How to optimize if data == 0? */
2695 /* Optimize to a simpler form. */
2696 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2700 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2701 && (OP(nxt2) != WHILEM))
2703 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2704 /* Need to optimize away parenths. */
2705 if (data->flags & SF_IN_PAR) {
2706 /* Set the parenth number. */
2707 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2709 if (OP(nxt) != CLOSE)
2710 FAIL("Panic opt close");
2711 oscan->flags = (U8)ARG(nxt);
2712 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2713 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2715 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2716 OP(nxt + 1) = OPTIMIZED; /* was count. */
2717 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2718 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2721 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2722 regnode *nnxt = regnext(nxt1);
2725 if (reg_off_by_arg[OP(nxt1)])
2726 ARG_SET(nxt1, nxt2 - nxt1);
2727 else if (nxt2 - nxt1 < U16_MAX)
2728 NEXT_OFF(nxt1) = nxt2 - nxt1;
2730 OP(nxt) = NOTHING; /* Cannot beautify */
2735 /* Optimize again: */
2736 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2742 else if ((OP(oscan) == CURLYX)
2743 && (flags & SCF_WHILEM_VISITED_POS)
2744 /* See the comment on a similar expression above.
2745 However, this time it not a subexpression
2746 we care about, but the expression itself. */
2747 && (maxcount == REG_INFTY)
2748 && data && ++data->whilem_c < 16) {
2749 /* This stays as CURLYX, we can put the count/of pair. */
2750 /* Find WHILEM (as in regexec.c) */
2751 regnode *nxt = oscan + NEXT_OFF(oscan);
2753 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2755 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2756 | (RExC_whilem_seen << 4)); /* On WHILEM */
2758 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2760 if (flags & SCF_DO_SUBSTR) {
2761 SV *last_str = NULL;
2762 int counted = mincount != 0;
2764 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2765 #if defined(SPARC64_GCC_WORKAROUND)
2768 const char *s = NULL;
2771 if (pos_before >= data->last_start_min)
2774 b = data->last_start_min;
2777 s = SvPV_const(data->last_found, l);
2778 old = b - data->last_start_min;
2781 I32 b = pos_before >= data->last_start_min
2782 ? pos_before : data->last_start_min;
2784 const char * const s = SvPV_const(data->last_found, l);
2785 I32 old = b - data->last_start_min;
2789 old = utf8_hop((U8*)s, old) - (U8*)s;
2792 /* Get the added string: */
2793 last_str = newSVpvn(s + old, l);
2795 SvUTF8_on(last_str);
2796 if (deltanext == 0 && pos_before == b) {
2797 /* What was added is a constant string */
2799 SvGROW(last_str, (mincount * l) + 1);
2800 repeatcpy(SvPVX(last_str) + l,
2801 SvPVX_const(last_str), l, mincount - 1);
2802 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2803 /* Add additional parts. */
2804 SvCUR_set(data->last_found,
2805 SvCUR(data->last_found) - l);
2806 sv_catsv(data->last_found, last_str);
2808 SV * sv = data->last_found;
2810 SvUTF8(sv) && SvMAGICAL(sv) ?
2811 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2812 if (mg && mg->mg_len >= 0)
2813 mg->mg_len += CHR_SVLEN(last_str);
2815 data->last_end += l * (mincount - 1);
2818 /* start offset must point into the last copy */
2819 data->last_start_min += minnext * (mincount - 1);
2820 data->last_start_max += is_inf ? I32_MAX
2821 : (maxcount - 1) * (minnext + data->pos_delta);
2824 /* It is counted once already... */
2825 data->pos_min += minnext * (mincount - counted);
2826 data->pos_delta += - counted * deltanext +
2827 (minnext + deltanext) * maxcount - minnext * mincount;
2828 if (mincount != maxcount) {
2829 /* Cannot extend fixed substrings found inside
2831 scan_commit(pRExC_state,data);
2832 if (mincount && last_str) {
2833 SV * const sv = data->last_found;
2834 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2835 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2839 sv_setsv(sv, last_str);
2840 data->last_end = data->pos_min;
2841 data->last_start_min =
2842 data->pos_min - CHR_SVLEN(last_str);
2843 data->last_start_max = is_inf
2845 : data->pos_min + data->pos_delta
2846 - CHR_SVLEN(last_str);
2848 data->longest = &(data->longest_float);
2850 SvREFCNT_dec(last_str);
2852 if (data && (fl & SF_HAS_EVAL))
2853 data->flags |= SF_HAS_EVAL;
2854 optimize_curly_tail:
2855 if (OP(oscan) != CURLYX) {
2856 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2858 NEXT_OFF(oscan) += NEXT_OFF(next);
2861 default: /* REF and CLUMP only? */
2862 if (flags & SCF_DO_SUBSTR) {
2863 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2864 data->longest = &(data->longest_float);
2866 is_inf = is_inf_internal = 1;
2867 if (flags & SCF_DO_STCLASS_OR)
2868 cl_anything(pRExC_state, data->start_class);
2869 flags &= ~SCF_DO_STCLASS;
2873 else if (strchr((const char*)PL_simple,OP(scan))) {
2876 if (flags & SCF_DO_SUBSTR) {
2877 scan_commit(pRExC_state,data);
2881 if (flags & SCF_DO_STCLASS) {
2882 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2884 /* Some of the logic below assumes that switching
2885 locale on will only add false positives. */
2886 switch (PL_regkind[OP(scan)]) {
2890 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2891 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2892 cl_anything(pRExC_state, data->start_class);
2895 if (OP(scan) == SANY)
2897 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2898 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2899 || (data->start_class->flags & ANYOF_CLASS));
2900 cl_anything(pRExC_state, data->start_class);
2902 if (flags & SCF_DO_STCLASS_AND || !value)
2903 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2906 if (flags & SCF_DO_STCLASS_AND)
2907 cl_and(data->start_class,
2908 (struct regnode_charclass_class*)scan);
2910 cl_or(pRExC_state, data->start_class,
2911 (struct regnode_charclass_class*)scan);
2914 if (flags & SCF_DO_STCLASS_AND) {
2915 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2916 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2917 for (value = 0; value < 256; value++)
2918 if (!isALNUM(value))
2919 ANYOF_BITMAP_CLEAR(data->start_class, value);
2923 if (data->start_class->flags & ANYOF_LOCALE)
2924 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2926 for (value = 0; value < 256; value++)
2928 ANYOF_BITMAP_SET(data->start_class, value);
2933 if (flags & SCF_DO_STCLASS_AND) {
2934 if (data->start_class->flags & ANYOF_LOCALE)
2935 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2938 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2939 data->start_class->flags |= ANYOF_LOCALE;
2943 if (flags & SCF_DO_STCLASS_AND) {
2944 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2945 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2946 for (value = 0; value < 256; value++)
2948 ANYOF_BITMAP_CLEAR(data->start_class, value);
2952 if (data->start_class->flags & ANYOF_LOCALE)
2953 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2955 for (value = 0; value < 256; value++)
2956 if (!isALNUM(value))
2957 ANYOF_BITMAP_SET(data->start_class, value);
2962 if (flags & SCF_DO_STCLASS_AND) {
2963 if (data->start_class->flags & ANYOF_LOCALE)
2964 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2967 data->start_class->flags |= ANYOF_LOCALE;
2968 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2972 if (flags & SCF_DO_STCLASS_AND) {
2973 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2974 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2975 for (value = 0; value < 256; value++)
2976 if (!isSPACE(value))
2977 ANYOF_BITMAP_CLEAR(data->start_class, value);
2981 if (data->start_class->flags & ANYOF_LOCALE)
2982 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2984 for (value = 0; value < 256; value++)
2986 ANYOF_BITMAP_SET(data->start_class, value);
2991 if (flags & SCF_DO_STCLASS_AND) {
2992 if (data->start_class->flags & ANYOF_LOCALE)
2993 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2996 data->start_class->flags |= ANYOF_LOCALE;
2997 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3001 if (flags & SCF_DO_STCLASS_AND) {
3002 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3003 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3004 for (value = 0; value < 256; value++)
3006 ANYOF_BITMAP_CLEAR(data->start_class, value);
3010 if (data->start_class->flags & ANYOF_LOCALE)
3011 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3013 for (value = 0; value < 256; value++)
3014 if (!isSPACE(value))
3015 ANYOF_BITMAP_SET(data->start_class, value);
3020 if (flags & SCF_DO_STCLASS_AND) {
3021 if (data->start_class->flags & ANYOF_LOCALE) {
3022 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3023 for (value = 0; value < 256; value++)
3024 if (!isSPACE(value))
3025 ANYOF_BITMAP_CLEAR(data->start_class, value);
3029 data->start_class->flags |= ANYOF_LOCALE;
3030 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3034 if (flags & SCF_DO_STCLASS_AND) {
3035 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3036 for (value = 0; value < 256; value++)
3037 if (!isDIGIT(value))
3038 ANYOF_BITMAP_CLEAR(data->start_class, value);
3041 if (data->start_class->flags & ANYOF_LOCALE)
3042 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3044 for (value = 0; value < 256; value++)
3046 ANYOF_BITMAP_SET(data->start_class, value);
3051 if (flags & SCF_DO_STCLASS_AND) {
3052 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3053 for (value = 0; value < 256; value++)
3055 ANYOF_BITMAP_CLEAR(data->start_class, value);
3058 if (data->start_class->flags & ANYOF_LOCALE)
3059 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3061 for (value = 0; value < 256; value++)
3062 if (!isDIGIT(value))
3063 ANYOF_BITMAP_SET(data->start_class, value);
3068 if (flags & SCF_DO_STCLASS_OR)
3069 cl_and(data->start_class, &and_with);
3070 flags &= ~SCF_DO_STCLASS;
3073 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3074 data->flags |= (OP(scan) == MEOL
3078 else if ( PL_regkind[OP(scan)] == BRANCHJ
3079 /* Lookbehind, or need to calculate parens/evals/stclass: */
3080 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3081 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3082 /* Lookahead/lookbehind */
3083 I32 deltanext, minnext, fake = 0;
3085 struct regnode_charclass_class intrnl;
3088 data_fake.flags = 0;
3090 data_fake.whilem_c = data->whilem_c;
3091 data_fake.last_closep = data->last_closep;
3094 data_fake.last_closep = &fake;
3095 if ( flags & SCF_DO_STCLASS && !scan->flags
3096 && OP(scan) == IFMATCH ) { /* Lookahead */
3097 cl_init(pRExC_state, &intrnl);
3098 data_fake.start_class = &intrnl;
3099 f |= SCF_DO_STCLASS_AND;
3101 if (flags & SCF_WHILEM_VISITED_POS)
3102 f |= SCF_WHILEM_VISITED_POS;
3103 next = regnext(scan);
3104 nscan = NEXTOPER(NEXTOPER(scan));
3105 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3108 vFAIL("Variable length lookbehind not implemented");
3110 else if (minnext > U8_MAX) {
3111 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3113 scan->flags = (U8)minnext;
3116 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3118 if (data_fake.flags & SF_HAS_EVAL)
3119 data->flags |= SF_HAS_EVAL;
3120 data->whilem_c = data_fake.whilem_c;
3122 if (f & SCF_DO_STCLASS_AND) {
3123 const int was = (data->start_class->flags & ANYOF_EOS);
3125 cl_and(data->start_class, &intrnl);
3127 data->start_class->flags |= ANYOF_EOS;
3130 else if (OP(scan) == OPEN) {
3133 else if (OP(scan) == CLOSE) {
3134 if ((I32)ARG(scan) == is_par) {
3135 next = regnext(scan);
3137 if ( next && (OP(next) != WHILEM) && next < last)
3138 is_par = 0; /* Disable optimization */
3141 *(data->last_closep) = ARG(scan);
3143 else if (OP(scan) == EVAL) {
3145 data->flags |= SF_HAS_EVAL;
3147 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3148 if (flags & SCF_DO_SUBSTR) {
3149 scan_commit(pRExC_state,data);
3150 data->longest = &(data->longest_float);
3152 is_inf = is_inf_internal = 1;
3153 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3154 cl_anything(pRExC_state, data->start_class);
3155 flags &= ~SCF_DO_STCLASS;
3157 /* Else: zero-length, ignore. */
3158 scan = regnext(scan);
3163 *deltap = is_inf_internal ? I32_MAX : delta;
3164 if (flags & SCF_DO_SUBSTR && is_inf)
3165 data->pos_delta = I32_MAX - data->pos_min;
3166 if (is_par > U8_MAX)
3168 if (is_par && pars==1 && data) {
3169 data->flags |= SF_IN_PAR;
3170 data->flags &= ~SF_HAS_PAR;
3172 else if (pars && data) {
3173 data->flags |= SF_HAS_PAR;
3174 data->flags &= ~SF_IN_PAR;
3176 if (flags & SCF_DO_STCLASS_OR)
3177 cl_and(data->start_class, &and_with);
3178 if (flags & SCF_EXACT_TRIE)
3179 data->flags |= SCF_EXACT_TRIE;
3184 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3186 if (RExC_rx->data) {
3187 Renewc(RExC_rx->data,
3188 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3189 char, struct reg_data);
3190 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3191 RExC_rx->data->count += n;
3194 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3195 char, struct reg_data);
3196 Newx(RExC_rx->data->what, n, U8);
3197 RExC_rx->data->count = n;
3199 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3200 return RExC_rx->data->count - n;
3203 #ifndef PERL_IN_XSUB_RE
3205 Perl_reginitcolors(pTHX)
3208 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3210 char *t = savepv(s);
3214 t = strchr(t, '\t');
3220 PL_colors[i] = t = (char *)"";
3225 PL_colors[i++] = (char *)"";
3233 - pregcomp - compile a regular expression into internal code
3235 * We can't allocate space until we know how big the compiled form will be,
3236 * but we can't compile it (and thus know how big it is) until we've got a
3237 * place to put the code. So we cheat: we compile it twice, once with code
3238 * generation turned off and size counting turned on, and once "for real".
3239 * This also means that we don't allocate space until we are sure that the
3240 * thing really will compile successfully, and we never have to move the
3241 * code and thus invalidate pointers into it. (Note that it has to be in
3242 * one piece because free() must be able to free it all.) [NB: not true in perl]
3244 * Beware that the optimization-preparation code in here knows about some
3245 * of the structure of the compiled regexp. [I'll say.]
3248 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3259 RExC_state_t RExC_state;
3260 RExC_state_t * const pRExC_state = &RExC_state;
3261 #ifdef TRIE_STUDY_OPT
3263 RExC_state_t copyRExC_state;
3266 GET_RE_DEBUG_FLAGS_DECL;
3269 FAIL("NULL regexp argument");
3271 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3274 DEBUG_r(if (!PL_colorset) reginitcolors());
3276 SV *dsv= sv_newmortal();
3277 RE_PV_QUOTED_DECL(s, RExC_utf8,
3278 dsv, RExC_precomp, (xend - exp), 60);
3279 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3280 PL_colors[4],PL_colors[5],s);
3282 RExC_flags = pm->op_pmflags;
3286 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3287 RExC_seen_evals = 0;
3290 /* First pass: determine size, legality. */
3297 RExC_emit = &PL_regdummy;
3298 RExC_whilem_seen = 0;
3299 #if 0 /* REGC() is (currently) a NOP at the first pass.
3300 * Clever compilers notice this and complain. --jhi */
3301 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3303 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3304 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3305 RExC_precomp = NULL;
3308 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3309 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3310 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3313 RExC_lastparse=NULL;
3317 /* Small enough for pointer-storage convention?
3318 If extralen==0, this means that we will not need long jumps. */
3319 if (RExC_size >= 0x10000L && RExC_extralen)
3320 RExC_size += RExC_extralen;
3323 if (RExC_whilem_seen > 15)
3324 RExC_whilem_seen = 15;
3326 /* Allocate space and initialize. */
3327 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3330 FAIL("Regexp out of space");
3333 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3334 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3337 r->prelen = xend - exp;
3338 r->precomp = savepvn(RExC_precomp, r->prelen);
3340 #ifdef PERL_OLD_COPY_ON_WRITE
3341 r->saved_copy = NULL;
3343 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3344 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3345 r->lastparen = 0; /* mg.c reads this. */
3347 r->substrs = 0; /* Useful during FAIL. */
3348 r->startp = 0; /* Useful during FAIL. */
3349 r->endp = 0; /* Useful during FAIL. */
3351 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3353 r->offsets[0] = RExC_size;
3355 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3356 "%s %"UVuf" bytes for offset annotations.\n",
3357 r->offsets ? "Got" : "Couldn't get",
3358 (UV)((2*RExC_size+1) * sizeof(U32))));
3362 /* Second pass: emit code. */
3363 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3368 RExC_emit_start = r->program;
3369 RExC_emit = r->program;
3370 /* Store the count of eval-groups for security checks: */
3371 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3372 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3374 if (reg(pRExC_state, 0, &flags,1) == NULL)
3376 /* XXXX To minimize changes to RE engine we always allocate
3377 3-units-long substrs field. */
3378 Newx(r->substrs, 1, struct reg_substr_data);
3381 Zero(r->substrs, 1, struct reg_substr_data);
3382 StructCopy(&zero_scan_data, &data, scan_data_t);
3384 #ifdef TRIE_STUDY_OPT
3386 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3387 RExC_state=copyRExC_state;
3388 if (data.longest_fixed)
3389 SvREFCNT_dec(data.longest_fixed);
3390 if (data.longest_float)
3391 SvREFCNT_dec(data.longest_float);
3392 if (data.last_found)
3393 SvREFCNT_dec(data.last_found);
3395 copyRExC_state=RExC_state;
3398 /* Dig out information for optimizations. */
3399 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3400 pm->op_pmflags = RExC_flags;
3402 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3403 r->regstclass = NULL;
3404 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3405 r->reganch |= ROPT_NAUGHTY;
3406 scan = r->program + 1; /* First BRANCH. */
3408 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3409 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3411 STRLEN longest_float_length, longest_fixed_length;
3412 struct regnode_charclass_class ch_class; /* pointed to by data */
3414 I32 last_close = 0; /* pointed to by data */
3417 /* Skip introductions and multiplicators >= 1. */
3418 while ((OP(first) == OPEN && (sawopen = 1)) ||
3419 /* An OR of *one* alternative - should not happen now. */
3420 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3421 /* for now we can't handle lookbehind IFMATCH*/
3422 (OP(first) == IFMATCH && !first->flags) ||
3423 (OP(first) == PLUS) ||
3424 (OP(first) == MINMOD) ||
3425 /* An {n,m} with n>0 */
3426 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3428 DEBUG_PEEP("first:",first,0);
3429 if (OP(first) == PLUS)
3432 first += regarglen[OP(first)];
3433 if (OP(first) == IFMATCH) {
3434 first = NEXTOPER(first);
3435 first += EXTRA_STEP_2ARGS;
3436 } else /* XXX possible optimisation for /(?=)/ */
3437 first = NEXTOPER(first);
3440 /* Starting-point info. */
3442 /* Ignore EXACT as we deal with it later. */
3443 if (PL_regkind[OP(first)] == EXACT) {
3444 if (OP(first) == EXACT)
3445 NOOP; /* Empty, get anchored substr later. */
3446 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3447 r->regstclass = first;
3450 else if (OP(first) == TRIE &&
3451 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3453 /* this can happen only on restudy */
3454 struct regnode_1 *trie_op;
3455 Newxz(trie_op,1,struct regnode_1);
3456 StructCopy(first,trie_op,struct regnode_1);
3457 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3458 r->regstclass = (regnode *)trie_op;
3461 else if (strchr((const char*)PL_simple,OP(first)))
3462 r->regstclass = first;
3463 else if (PL_regkind[OP(first)] == BOUND ||
3464 PL_regkind[OP(first)] == NBOUND)
3465 r->regstclass = first;
3466 else if (PL_regkind[OP(first)] == BOL) {
3467 r->reganch |= (OP(first) == MBOL
3469 : (OP(first) == SBOL
3472 first = NEXTOPER(first);
3475 else if (OP(first) == GPOS) {
3476 r->reganch |= ROPT_ANCH_GPOS;
3477 first = NEXTOPER(first);
3480 else if (!sawopen && (OP(first) == STAR &&
3481 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3482 !(r->reganch & ROPT_ANCH) )
3484 /* turn .* into ^.* with an implied $*=1 */
3486 (OP(NEXTOPER(first)) == REG_ANY)
3489 r->reganch |= type | ROPT_IMPLICIT;
3490 first = NEXTOPER(first);
3493 if (sawplus && (!sawopen || !RExC_sawback)
3494 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3495 /* x+ must match at the 1st pos of run of x's */
3496 r->reganch |= ROPT_SKIP;
3498 /* Scan is after the zeroth branch, first is atomic matcher. */
3499 #ifdef TRIE_STUDY_OPT
3502 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3503 (IV)(first - scan + 1))
3507 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3508 (IV)(first - scan + 1))
3514 * If there's something expensive in the r.e., find the
3515 * longest literal string that must appear and make it the
3516 * regmust. Resolve ties in favor of later strings, since
3517 * the regstart check works with the beginning of the r.e.
3518 * and avoiding duplication strengthens checking. Not a
3519 * strong reason, but sufficient in the absence of others.
3520 * [Now we resolve ties in favor of the earlier string if
3521 * it happens that c_offset_min has been invalidated, since the
3522 * earlier string may buy us something the later one won't.]
3526 data.longest_fixed = newSVpvs("");
3527 data.longest_float = newSVpvs("");
3528 data.last_found = newSVpvs("");
3529 data.longest = &(data.longest_fixed);
3531 if (!r->regstclass) {
3532 cl_init(pRExC_state, &ch_class);
3533 data.start_class = &ch_class;
3534 stclass_flag = SCF_DO_STCLASS_AND;
3535 } else /* XXXX Check for BOUND? */
3537 data.last_closep = &last_close;
3539 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3540 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3542 #ifdef TRIE_STUDY_OPT
3543 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3548 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3549 && data.last_start_min == 0 && data.last_end > 0
3550 && !RExC_seen_zerolen
3551 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3552 r->reganch |= ROPT_CHECK_ALL;
3553 scan_commit(pRExC_state, &data);
3554 SvREFCNT_dec(data.last_found);
3556 longest_float_length = CHR_SVLEN(data.longest_float);
3557 if (longest_float_length
3558 || (data.flags & SF_FL_BEFORE_EOL
3559 && (!(data.flags & SF_FL_BEFORE_MEOL)
3560 || (RExC_flags & PMf_MULTILINE)))) {
3563 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3564 && data.offset_fixed == data.offset_float_min
3565 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3566 goto remove_float; /* As in (a)+. */
3568 if (SvUTF8(data.longest_float)) {
3569 r->float_utf8 = data.longest_float;
3570 r->float_substr = NULL;
3572 r->float_substr = data.longest_float;
3573 r->float_utf8 = NULL;
3575 r->float_min_offset = data.offset_float_min;
3576 r->float_max_offset = data.offset_float_max;
3577 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3578 && (!(data.flags & SF_FL_BEFORE_MEOL)
3579 || (RExC_flags & PMf_MULTILINE)));
3580 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3584 r->float_substr = r->float_utf8 = NULL;
3585 SvREFCNT_dec(data.longest_float);
3586 longest_float_length = 0;
3589 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3590 if (longest_fixed_length
3591 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3592 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3593 || (RExC_flags & PMf_MULTILINE)))) {
3596 if (SvUTF8(data.longest_fixed)) {
3597 r->anchored_utf8 = data.longest_fixed;
3598 r->anchored_substr = NULL;
3600 r->anchored_substr = data.longest_fixed;
3601 r->anchored_utf8 = NULL;
3603 r->anchored_offset = data.offset_fixed;
3604 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3605 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3606 || (RExC_flags & PMf_MULTILINE)));
3607 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3610 r->anchored_substr = r->anchored_utf8 = NULL;
3611 SvREFCNT_dec(data.longest_fixed);
3612 longest_fixed_length = 0;
3615 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3616 r->regstclass = NULL;
3617 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3619 && !(data.start_class->flags & ANYOF_EOS)
3620 && !cl_is_anything(data.start_class))
3622 const I32 n = add_data(pRExC_state, 1, "f");
3624 Newx(RExC_rx->data->data[n], 1,
3625 struct regnode_charclass_class);
3626 StructCopy(data.start_class,
3627 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3628 struct regnode_charclass_class);
3629 r->regstclass = (regnode*)RExC_rx->data->data[n];
3630 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3631 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3632 regprop(r, sv, (regnode*)data.start_class);
3633 PerlIO_printf(Perl_debug_log,
3634 "synthetic stclass \"%s\".\n",
3635 SvPVX_const(sv));});
3638 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3639 if (longest_fixed_length > longest_float_length) {
3640 r->check_substr = r->anchored_substr;
3641 r->check_utf8 = r->anchored_utf8;
3642 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3643 if (r->reganch & ROPT_ANCH_SINGLE)
3644 r->reganch |= ROPT_NOSCAN;
3647 r->check_substr = r->float_substr;
3648 r->check_utf8 = r->float_utf8;
3649 r->check_offset_min = data.offset_float_min;
3650 r->check_offset_max = data.offset_float_max;
3652 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3653 This should be changed ASAP! */
3654 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3655 r->reganch |= RE_USE_INTUIT;
3656 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3657 r->reganch |= RE_INTUIT_TAIL;
3661 /* Several toplevels. Best we can is to set minlen. */
3663 struct regnode_charclass_class ch_class;
3666 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3668 scan = r->program + 1;
3669 cl_init(pRExC_state, &ch_class);
3670 data.start_class = &ch_class;
3671 data.last_closep = &last_close;
3673 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3674 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3676 #ifdef TRIE_STUDY_OPT
3677 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3682 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3683 = r->float_substr = r->float_utf8 = NULL;
3684 if (!(data.start_class->flags & ANYOF_EOS)
3685 && !cl_is_anything(data.start_class))
3687 const I32 n = add_data(pRExC_state, 1, "f");
3689 Newx(RExC_rx->data->data[n], 1,
3690 struct regnode_charclass_class);
3691 StructCopy(data.start_class,
3692 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3693 struct regnode_charclass_class);
3694 r->regstclass = (regnode*)RExC_rx->data->data[n];
3695 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3696 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3697 regprop(r, sv, (regnode*)data.start_class);
3698 PerlIO_printf(Perl_debug_log,
3699 "synthetic stclass \"%s\".\n",
3700 SvPVX_const(sv));});
3705 if (RExC_seen & REG_SEEN_GPOS)
3706 r->reganch |= ROPT_GPOS_SEEN;
3707 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3708 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3709 if (RExC_seen & REG_SEEN_EVAL)
3710 r->reganch |= ROPT_EVAL_SEEN;
3711 if (RExC_seen & REG_SEEN_CANY)
3712 r->reganch |= ROPT_CANY_SEEN;
3713 Newxz(r->startp, RExC_npar, I32);
3714 Newxz(r->endp, RExC_npar, I32);
3716 DEBUG_r( RX_DEBUG_on(r) );
3718 PerlIO_printf(Perl_debug_log,"Final program:\n");
3721 DEBUG_OFFSETS_r(if (r->offsets) {
3722 const U32 len = r->offsets[0];
3724 GET_RE_DEBUG_FLAGS_DECL;
3725 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3726 for (i = 1; i <= len; i++) {
3727 if (r->offsets[i*2-1] || r->offsets[i*2])
3728 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3729 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3731 PerlIO_printf(Perl_debug_log, "\n");
3737 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3738 int rem=(int)(RExC_end - RExC_parse); \
3747 if (RExC_lastparse!=RExC_parse) \
3748 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3751 iscut ? "..." : "<" \
3754 PerlIO_printf(Perl_debug_log,"%16s",""); \
3759 num=REG_NODE_NUM(RExC_emit); \
3760 if (RExC_lastnum!=num) \
3761 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3763 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3764 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3765 (int)((depth*2)), "", \
3769 RExC_lastparse=RExC_parse; \
3774 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3775 DEBUG_PARSE_MSG((funcname)); \
3776 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3779 - reg - regular expression, i.e. main body or parenthesized thing
3781 * Caller must absorb opening parenthesis.
3783 * Combining parenthesis handling with the base level of regular expression
3784 * is a trifle forced, but the need to tie the tails of the branches to what
3785 * follows makes it hard to avoid.
3787 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3789 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3791 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3795 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3796 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3799 register regnode *ret; /* Will be the head of the group. */
3800 register regnode *br;
3801 register regnode *lastbr;
3802 register regnode *ender = NULL;
3803 register I32 parno = 0;
3805 const I32 oregflags = RExC_flags;
3806 bool have_branch = 0;
3809 /* for (?g), (?gc), and (?o) warnings; warning
3810 about (?c) will warn about (?g) -- japhy */
3812 #define WASTED_O 0x01
3813 #define WASTED_G 0x02
3814 #define WASTED_C 0x04
3815 #define WASTED_GC (0x02|0x04)
3816 I32 wastedflags = 0x00;
3818 char * parse_start = RExC_parse; /* MJD */
3819 char * const oregcomp_parse = RExC_parse;
3821 GET_RE_DEBUG_FLAGS_DECL;
3822 DEBUG_PARSE("reg ");
3825 *flagp = 0; /* Tentatively. */
3828 /* Make an OPEN node, if parenthesized. */
3830 if (*RExC_parse == '?') { /* (?...) */
3831 U32 posflags = 0, negflags = 0;
3832 U32 *flagsp = &posflags;
3833 bool is_logical = 0;
3834 const char * const seqstart = RExC_parse;
3837 paren = *RExC_parse++;
3838 ret = NULL; /* For look-ahead/behind. */
3840 case '<': /* (?<...) */
3841 RExC_seen |= REG_SEEN_LOOKBEHIND;
3842 if (*RExC_parse == '!')
3844 if (*RExC_parse != '=' && *RExC_parse != '!')
3847 case '=': /* (?=...) */
3848 case '!': /* (?!...) */
3849 RExC_seen_zerolen++;
3850 case ':': /* (?:...) */
3851 case '>': /* (?>...) */
3853 case '$': /* (?$...) */
3854 case '@': /* (?@...) */
3855 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3857 case '#': /* (?#...) */
3858 while (*RExC_parse && *RExC_parse != ')')
3860 if (*RExC_parse != ')')
3861 FAIL("Sequence (?#... not terminated");
3862 nextchar(pRExC_state);
3865 case 'p': /* (?p...) */
3866 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3867 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3869 case '?': /* (??...) */
3871 if (*RExC_parse != '{')
3873 paren = *RExC_parse++;
3875 case '{': /* (?{...}) */
3877 I32 count = 1, n = 0;
3879 char *s = RExC_parse;
3881 RExC_seen_zerolen++;
3882 RExC_seen |= REG_SEEN_EVAL;
3883 while (count && (c = *RExC_parse)) {
3894 if (*RExC_parse != ')') {
3896 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3900 OP_4tree *sop, *rop;
3901 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3904 Perl_save_re_context(aTHX);
3905 rop = sv_compile_2op(sv, &sop, "re", &pad);
3906 sop->op_private |= OPpREFCOUNTED;
3907 /* re_dup will OpREFCNT_inc */
3908 OpREFCNT_set(sop, 1);
3911 n = add_data(pRExC_state, 3, "nop");
3912 RExC_rx->data->data[n] = (void*)rop;
3913 RExC_rx->data->data[n+1] = (void*)sop;
3914 RExC_rx->data->data[n+2] = (void*)pad;
3917 else { /* First pass */
3918 if (PL_reginterp_cnt < ++RExC_seen_evals
3920 /* No compiled RE interpolated, has runtime
3921 components ===> unsafe. */
3922 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3923 if (PL_tainting && PL_tainted)
3924 FAIL("Eval-group in insecure regular expression");
3925 #if PERL_VERSION > 8
3926 if (IN_PERL_COMPILETIME)
3931 nextchar(pRExC_state);
3933 ret = reg_node(pRExC_state, LOGICAL);
3936 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3937 /* deal with the length of this later - MJD */
3940 ret = reganode(pRExC_state, EVAL, n);
3941 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3942 Set_Node_Offset(ret, parse_start);
3945 case '(': /* (?(?{...})...) and (?(?=...)...) */
3947 if (RExC_parse[0] == '?') { /* (?(?...)) */
3948 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3949 || RExC_parse[1] == '<'
3950 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3953 ret = reg_node(pRExC_state, LOGICAL);
3956 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3960 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3963 parno = atoi(RExC_parse++);
3965 while (isDIGIT(*RExC_parse))
3967 ret = reganode(pRExC_state, GROUPP, parno);
3969 if ((c = *nextchar(pRExC_state)) != ')')
3970 vFAIL("Switch condition not recognized");
3972 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3973 br = regbranch(pRExC_state, &flags, 1,depth+1);
3975 br = reganode(pRExC_state, LONGJMP, 0);
3977 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3978 c = *nextchar(pRExC_state);
3982 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3983 regbranch(pRExC_state, &flags, 1,depth+1);
3984 REGTAIL(pRExC_state, ret, lastbr);
3987 c = *nextchar(pRExC_state);
3992 vFAIL("Switch (?(condition)... contains too many branches");
3993 ender = reg_node(pRExC_state, TAIL);
3994 REGTAIL(pRExC_state, br, ender);
3996 REGTAIL(pRExC_state, lastbr, ender);
3997 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
4000 REGTAIL(pRExC_state, ret, ender);
4004 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
4008 RExC_parse--; /* for vFAIL to print correctly */
4009 vFAIL("Sequence (? incomplete");
4013 parse_flags: /* (?i) */
4014 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
4015 /* (?g), (?gc) and (?o) are useless here
4016 and must be globally applied -- japhy */
4018 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4019 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4020 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
4021 if (! (wastedflags & wflagbit) ) {
4022 wastedflags |= wflagbit;
4025 "Useless (%s%c) - %suse /%c modifier",
4026 flagsp == &negflags ? "?-" : "?",
4028 flagsp == &negflags ? "don't " : "",
4034 else if (*RExC_parse == 'c') {
4035 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4036 if (! (wastedflags & WASTED_C) ) {
4037 wastedflags |= WASTED_GC;
4040 "Useless (%sc) - %suse /gc modifier",
4041 flagsp == &negflags ? "?-" : "?",
4042 flagsp == &negflags ? "don't " : ""
4047 else { pmflag(flagsp, *RExC_parse); }
4051 if (*RExC_parse == '-') {
4053 wastedflags = 0; /* reset so (?g-c) warns twice */
4057 RExC_flags |= posflags;
4058 RExC_flags &= ~negflags;
4059 if (*RExC_parse == ':') {
4065 if (*RExC_parse != ')') {
4067 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4069 nextchar(pRExC_state);
4077 ret = reganode(pRExC_state, OPEN, parno);
4078 Set_Node_Length(ret, 1); /* MJD */
4079 Set_Node_Offset(ret, RExC_parse); /* MJD */
4086 /* Pick up the branches, linking them together. */
4087 parse_start = RExC_parse; /* MJD */
4088 br = regbranch(pRExC_state, &flags, 1,depth+1);
4089 /* branch_len = (paren != 0); */
4093 if (*RExC_parse == '|') {
4094 if (!SIZE_ONLY && RExC_extralen) {
4095 reginsert(pRExC_state, BRANCHJ, br);
4098 reginsert(pRExC_state, BRANCH, br);
4099 Set_Node_Length(br, paren != 0);
4100 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4104 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4106 else if (paren == ':') {
4107 *flagp |= flags&SIMPLE;
4109 if (is_open) { /* Starts with OPEN. */
4110 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4112 else if (paren != '?') /* Not Conditional */
4114 *flagp |= flags & (SPSTART | HASWIDTH);
4116 while (*RExC_parse == '|') {
4117 if (!SIZE_ONLY && RExC_extralen) {
4118 ender = reganode(pRExC_state, LONGJMP,0);
4119 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4122 RExC_extralen += 2; /* Account for LONGJMP. */
4123 nextchar(pRExC_state);
4124 br = regbranch(pRExC_state, &flags, 0, depth+1);
4128 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4132 *flagp |= flags&SPSTART;
4135 if (have_branch || paren != ':') {
4136 /* Make a closing node, and hook it on the end. */
4139 ender = reg_node(pRExC_state, TAIL);
4142 ender = reganode(pRExC_state, CLOSE, parno);
4143 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4144 Set_Node_Length(ender,1); /* MJD */
4150 *flagp &= ~HASWIDTH;
4153 ender = reg_node(pRExC_state, SUCCEED);
4156 ender = reg_node(pRExC_state, END);
4159 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4161 if (have_branch && !SIZE_ONLY) {
4162 /* Hook the tails of the branches to the closing node. */
4163 for (br = ret; br; br = regnext(br)) {
4164 const U8 op = PL_regkind[OP(br)];
4166 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4168 else if (op == BRANCHJ) {
4169 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4177 static const char parens[] = "=!<,>";
4179 if (paren && (p = strchr(parens, paren))) {
4180 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4181 int flag = (p - parens) > 1;
4184 node = SUSPEND, flag = 0;
4185 reginsert(pRExC_state, node,ret);
4186 Set_Node_Cur_Length(ret);
4187 Set_Node_Offset(ret, parse_start + 1);
4189 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4193 /* Check for proper termination. */
4195 RExC_flags = oregflags;
4196 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4197 RExC_parse = oregcomp_parse;
4198 vFAIL("Unmatched (");
4201 else if (!paren && RExC_parse < RExC_end) {
4202 if (*RExC_parse == ')') {
4204 vFAIL("Unmatched )");
4207 FAIL("Junk on end of regexp"); /* "Can't happen". */
4215 - regbranch - one alternative of an | operator
4217 * Implements the concatenation operator.
4220 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4223 register regnode *ret;
4224 register regnode *chain = NULL;
4225 register regnode *latest;
4226 I32 flags = 0, c = 0;
4227 GET_RE_DEBUG_FLAGS_DECL;
4228 DEBUG_PARSE("brnc");
4232 if (!SIZE_ONLY && RExC_extralen)
4233 ret = reganode(pRExC_state, BRANCHJ,0);
4235 ret = reg_node(pRExC_state, BRANCH);
4236 Set_Node_Length(ret, 1);
4240 if (!first && SIZE_ONLY)
4241 RExC_extralen += 1; /* BRANCHJ */
4243 *flagp = WORST; /* Tentatively. */
4246 nextchar(pRExC_state);
4247 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4249 latest = regpiece(pRExC_state, &flags,depth+1);
4250 if (latest == NULL) {
4251 if (flags & TRYAGAIN)
4255 else if (ret == NULL)
4257 *flagp |= flags&HASWIDTH;
4258 if (chain == NULL) /* First piece. */
4259 *flagp |= flags&SPSTART;
4262 REGTAIL(pRExC_state, chain, latest);
4267 if (chain == NULL) { /* Loop ran zero times. */
4268 chain = reg_node(pRExC_state, NOTHING);
4273 *flagp |= flags&SIMPLE;
4280 - regpiece - something followed by possible [*+?]
4282 * Note that the branching code sequences used for ? and the general cases
4283 * of * and + are somewhat optimized: they use the same NOTHING node as
4284 * both the endmarker for their branch list and the body of the last branch.
4285 * It might seem that this node could be dispensed with entirely, but the
4286 * endmarker role is not redundant.
4289 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4292 register regnode *ret;
4294 register char *next;
4296 const char * const origparse = RExC_parse;
4298 I32 max = REG_INFTY;
4300 const char *maxpos = NULL;
4301 GET_RE_DEBUG_FLAGS_DECL;
4302 DEBUG_PARSE("piec");
4304 ret = regatom(pRExC_state, &flags,depth+1);
4306 if (flags & TRYAGAIN)
4313 if (op == '{' && regcurly(RExC_parse)) {
4315 parse_start = RExC_parse; /* MJD */
4316 next = RExC_parse + 1;
4317 while (isDIGIT(*next) || *next == ',') {
4326 if (*next == '}') { /* got one */
4330 min = atoi(RExC_parse);
4334 maxpos = RExC_parse;
4336 if (!max && *maxpos != '0')
4337 max = REG_INFTY; /* meaning "infinity" */
4338 else if (max >= REG_INFTY)
4339 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4341 nextchar(pRExC_state);
4344 if ((flags&SIMPLE)) {
4345 RExC_naughty += 2 + RExC_naughty / 2;
4346 reginsert(pRExC_state, CURLY, ret);
4347 Set_Node_Offset(ret, parse_start+1); /* MJD */
4348 Set_Node_Cur_Length(ret);
4351 regnode * const w = reg_node(pRExC_state, WHILEM);
4354 REGTAIL(pRExC_state, ret, w);
4355 if (!SIZE_ONLY && RExC_extralen) {
4356 reginsert(pRExC_state, LONGJMP,ret);
4357 reginsert(pRExC_state, NOTHING,ret);
4358 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4360 reginsert(pRExC_state, CURLYX,ret);
4362 Set_Node_Offset(ret, parse_start+1);
4363 Set_Node_Length(ret,
4364 op == '{' ? (RExC_parse - parse_start) : 1);
4366 if (!SIZE_ONLY && RExC_extralen)
4367 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4368 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4370 RExC_whilem_seen++, RExC_extralen += 3;
4371 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4379 if (max && max < min)
4380 vFAIL("Can't do {n,m} with n > m");
4382 ARG1_SET(ret, (U16)min);
4383 ARG2_SET(ret, (U16)max);
4395 #if 0 /* Now runtime fix should be reliable. */
4397 /* if this is reinstated, don't forget to put this back into perldiag:
4399 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4401 (F) The part of the regexp subject to either the * or + quantifier
4402 could match an empty string. The {#} shows in the regular
4403 expression about where the problem was discovered.
4407 if (!(flags&HASWIDTH) && op != '?')
4408 vFAIL("Regexp *+ operand could be empty");
4411 parse_start = RExC_parse;
4412 nextchar(pRExC_state);
4414 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4416 if (op == '*' && (flags&SIMPLE)) {
4417 reginsert(pRExC_state, STAR, ret);
4421 else if (op == '*') {
4425 else if (op == '+' && (flags&SIMPLE)) {
4426 reginsert(pRExC_state, PLUS, ret);
4430 else if (op == '+') {
4434 else if (op == '?') {
4439 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4441 "%.*s matches null string many times",
4442 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4446 if (*RExC_parse == '?') {
4447 nextchar(pRExC_state);
4448 reginsert(pRExC_state, MINMOD, ret);
4449 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4451 if (ISMULT2(RExC_parse)) {
4453 vFAIL("Nested quantifiers");
4460 - regatom - the lowest level
4462 * Optimization: gobbles an entire sequence of ordinary characters so that
4463 * it can turn them into a single node, which is smaller to store and
4464 * faster to run. Backslashed characters are exceptions, each becoming a
4465 * separate node; the code is simpler that way and it's not worth fixing.
4467 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4468 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4471 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4474 register regnode *ret = NULL;
4476 char *parse_start = RExC_parse;
4477 GET_RE_DEBUG_FLAGS_DECL;
4478 DEBUG_PARSE("atom");
4479 *flagp = WORST; /* Tentatively. */
4482 switch (*RExC_parse) {
4484 RExC_seen_zerolen++;
4485 nextchar(pRExC_state);
4486 if (RExC_flags & PMf_MULTILINE)
4487 ret = reg_node(pRExC_state, MBOL);
4488 else if (RExC_flags & PMf_SINGLELINE)
4489 ret = reg_node(pRExC_state, SBOL);
4491 ret = reg_node(pRExC_state, BOL);
4492 Set_Node_Length(ret, 1); /* MJD */
4495 nextchar(pRExC_state);
4497 RExC_seen_zerolen++;
4498 if (RExC_flags & PMf_MULTILINE)
4499 ret = reg_node(pRExC_state, MEOL);
4500 else if (RExC_flags & PMf_SINGLELINE)
4501 ret = reg_node(pRExC_state, SEOL);
4503 ret = reg_node(pRExC_state, EOL);
4504 Set_Node_Length(ret, 1); /* MJD */
4507 nextchar(pRExC_state);
4508 if (RExC_flags & PMf_SINGLELINE)
4509 ret = reg_node(pRExC_state, SANY);
4511 ret = reg_node(pRExC_state, REG_ANY);
4512 *flagp |= HASWIDTH|SIMPLE;
4514 Set_Node_Length(ret, 1); /* MJD */
4518 char * const oregcomp_parse = ++RExC_parse;
4519 ret = regclass(pRExC_state,depth+1);
4520 if (*RExC_parse != ']') {
4521 RExC_parse = oregcomp_parse;
4522 vFAIL("Unmatched [");
4524 nextchar(pRExC_state);
4525 *flagp |= HASWIDTH|SIMPLE;
4526 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4530 nextchar(pRExC_state);
4531 ret = reg(pRExC_state, 1, &flags,depth+1);
4533 if (flags & TRYAGAIN) {
4534 if (RExC_parse == RExC_end) {
4535 /* Make parent create an empty node if needed. */
4543 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4547 if (flags & TRYAGAIN) {
4551 vFAIL("Internal urp");
4552 /* Supposed to be caught earlier. */
4555 if (!regcurly(RExC_parse)) {
4564 vFAIL("Quantifier follows nothing");
4567 switch (*++RExC_parse) {
4569 RExC_seen_zerolen++;
4570 ret = reg_node(pRExC_state, SBOL);
4572 nextchar(pRExC_state);
4573 Set_Node_Length(ret, 2); /* MJD */
4576 ret = reg_node(pRExC_state, GPOS);
4577 RExC_seen |= REG_SEEN_GPOS;
4579 nextchar(pRExC_state);
4580 Set_Node_Length(ret, 2); /* MJD */
4583 ret = reg_node(pRExC_state, SEOL);
4585 RExC_seen_zerolen++; /* Do not optimize RE away */
4586 nextchar(pRExC_state);
4589 ret = reg_node(pRExC_state, EOS);
4591 RExC_seen_zerolen++; /* Do not optimize RE away */
4592 nextchar(pRExC_state);
4593 Set_Node_Length(ret, 2); /* MJD */
4596 ret = reg_node(pRExC_state, CANY);
4597 RExC_seen |= REG_SEEN_CANY;
4598 *flagp |= HASWIDTH|SIMPLE;
4599 nextchar(pRExC_state);
4600 Set_Node_Length(ret, 2); /* MJD */
4603 ret = reg_node(pRExC_state, CLUMP);
4605 nextchar(pRExC_state);
4606 Set_Node_Length(ret, 2); /* MJD */
4609 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4610 *flagp |= HASWIDTH|SIMPLE;
4611 nextchar(pRExC_state);
4612 Set_Node_Length(ret, 2); /* MJD */
4615 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4616 *flagp |= HASWIDTH|SIMPLE;
4617 nextchar(pRExC_state);
4618 Set_Node_Length(ret, 2); /* MJD */
4621 RExC_seen_zerolen++;
4622 RExC_seen |= REG_SEEN_LOOKBEHIND;
4623 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4625 nextchar(pRExC_state);
4626 Set_Node_Length(ret, 2); /* MJD */
4629 RExC_seen_zerolen++;
4630 RExC_seen |= REG_SEEN_LOOKBEHIND;
4631 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4633 nextchar(pRExC_state);
4634 Set_Node_Length(ret, 2); /* MJD */
4637 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4638 *flagp |= HASWIDTH|SIMPLE;
4639 nextchar(pRExC_state);
4640 Set_Node_Length(ret, 2); /* MJD */
4643 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4644 *flagp |= HASWIDTH|SIMPLE;
4645 nextchar(pRExC_state);
4646 Set_Node_Length(ret, 2); /* MJD */
4649 ret = reg_node(pRExC_state, DIGIT);
4650 *flagp |= HASWIDTH|SIMPLE;
4651 nextchar(pRExC_state);
4652 Set_Node_Length(ret, 2); /* MJD */
4655 ret = reg_node(pRExC_state, NDIGIT);
4656 *flagp |= HASWIDTH|SIMPLE;
4657 nextchar(pRExC_state);
4658 Set_Node_Length(ret, 2); /* MJD */
4663 char* const oldregxend = RExC_end;
4664 char* parse_start = RExC_parse - 2;
4666 if (RExC_parse[1] == '{') {
4667 /* a lovely hack--pretend we saw [\pX] instead */
4668 RExC_end = strchr(RExC_parse, '}');
4670 const U8 c = (U8)*RExC_parse;
4672 RExC_end = oldregxend;
4673 vFAIL2("Missing right brace on \\%c{}", c);
4678 RExC_end = RExC_parse + 2;
4679 if (RExC_end > oldregxend)
4680 RExC_end = oldregxend;
4684 ret = regclass(pRExC_state,depth+1);
4686 RExC_end = oldregxend;
4689 Set_Node_Offset(ret, parse_start + 2);
4690 Set_Node_Cur_Length(ret);
4691 nextchar(pRExC_state);
4692 *flagp |= HASWIDTH|SIMPLE;
4705 case '1': case '2': case '3': case '4':
4706 case '5': case '6': case '7': case '8': case '9':
4708 const I32 num = atoi(RExC_parse);
4710 if (num > 9 && num >= RExC_npar)
4713 char * const parse_start = RExC_parse - 1; /* MJD */
4714 while (isDIGIT(*RExC_parse))
4717 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4718 vFAIL("Reference to nonexistent group");
4720 ret = reganode(pRExC_state,
4721 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4725 /* override incorrect value set in reganode MJD */
4726 Set_Node_Offset(ret, parse_start+1);
4727 Set_Node_Cur_Length(ret); /* MJD */
4729 nextchar(pRExC_state);
4734 if (RExC_parse >= RExC_end)
4735 FAIL("Trailing \\");
4738 /* Do not generate "unrecognized" warnings here, we fall
4739 back into the quick-grab loop below */
4746 if (RExC_flags & PMf_EXTENDED) {
4747 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4749 if (RExC_parse < RExC_end)
4755 register STRLEN len;
4760 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4762 parse_start = RExC_parse - 1;
4768 ret = reg_node(pRExC_state,
4769 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4771 for (len = 0, p = RExC_parse - 1;
4772 len < 127 && p < RExC_end;
4775 char * const oldp = p;
4777 if (RExC_flags & PMf_EXTENDED)
4778 p = regwhite(p, RExC_end);
4825 ender = ASCII_TO_NATIVE('\033');
4829 ender = ASCII_TO_NATIVE('\007');
4834 char* const e = strchr(p, '}');
4838 vFAIL("Missing right brace on \\x{}");
4841 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4842 | PERL_SCAN_DISALLOW_PREFIX;
4843 STRLEN numlen = e - p - 1;
4844 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4851 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4853 ender = grok_hex(p, &numlen, &flags, NULL);
4859 ender = UCHARAT(p++);
4860 ender = toCTRL(ender);
4862 case '0': case '1': case '2': case '3':case '4':
4863 case '5': case '6': case '7': case '8':case '9':
4865 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4868 ender = grok_oct(p, &numlen, &flags, NULL);
4878 FAIL("Trailing \\");
4881 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4882 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4883 goto normal_default;
4888 if (UTF8_IS_START(*p) && UTF) {
4890 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4891 &numlen, UTF8_ALLOW_DEFAULT);
4898 if (RExC_flags & PMf_EXTENDED)
4899 p = regwhite(p, RExC_end);
4901 /* Prime the casefolded buffer. */
4902 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4904 if (ISMULT2(p)) { /* Back off on ?+*. */
4909 /* Emit all the Unicode characters. */
4911 for (foldbuf = tmpbuf;
4913 foldlen -= numlen) {
4914 ender = utf8_to_uvchr(foldbuf, &numlen);
4916 const STRLEN unilen = reguni(pRExC_state, ender, s);
4919 /* In EBCDIC the numlen
4920 * and unilen can differ. */
4922 if (numlen >= foldlen)
4926 break; /* "Can't happen." */
4930 const STRLEN unilen = reguni(pRExC_state, ender, s);
4939 REGC((char)ender, s++);
4945 /* Emit all the Unicode characters. */
4947 for (foldbuf = tmpbuf;
4949 foldlen -= numlen) {
4950 ender = utf8_to_uvchr(foldbuf, &numlen);
4952 const STRLEN unilen = reguni(pRExC_state, ender, s);
4955 /* In EBCDIC the numlen
4956 * and unilen can differ. */
4958 if (numlen >= foldlen)
4966 const STRLEN unilen = reguni(pRExC_state, ender, s);
4975 REGC((char)ender, s++);
4979 Set_Node_Cur_Length(ret); /* MJD */
4980 nextchar(pRExC_state);
4982 /* len is STRLEN which is unsigned, need to copy to signed */
4985 vFAIL("Internal disaster");
4989 if (len == 1 && UNI_IS_INVARIANT(ender))
4993 RExC_size += STR_SZ(len);
4996 RExC_emit += STR_SZ(len);
5002 /* If the encoding pragma is in effect recode the text of
5003 * any EXACT-kind nodes. */
5004 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
5005 const STRLEN oldlen = STR_LEN(ret);
5006 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
5010 if (sv_utf8_downgrade(sv, TRUE)) {
5011 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5012 const STRLEN newlen = SvCUR(sv);
5017 GET_RE_DEBUG_FLAGS_DECL;
5018 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
5019 (int)oldlen, STRING(ret),
5021 Copy(s, STRING(ret), newlen, char);
5022 STR_LEN(ret) += newlen - oldlen;
5023 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5025 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5033 S_regwhite(char *p, const char *e)
5038 else if (*p == '#') {
5041 } while (p < e && *p != '\n');
5049 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5050 Character classes ([:foo:]) can also be negated ([:^foo:]).
5051 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5052 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
5053 but trigger failures because they are currently unimplemented. */
5055 #define POSIXCC_DONE(c) ((c) == ':')
5056 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5057 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5060 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5063 I32 namedclass = OOB_NAMEDCLASS;
5065 if (value == '[' && RExC_parse + 1 < RExC_end &&
5066 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5067 POSIXCC(UCHARAT(RExC_parse))) {
5068 const char c = UCHARAT(RExC_parse);
5069 char* const s = RExC_parse++;
5071 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5073 if (RExC_parse == RExC_end)
5074 /* Grandfather lone [:, [=, [. */
5077 const char* const t = RExC_parse++; /* skip over the c */
5080 if (UCHARAT(RExC_parse) == ']') {
5081 const char *posixcc = s + 1;
5082 RExC_parse++; /* skip over the ending ] */
5085 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5086 const I32 skip = t - posixcc;
5088 /* Initially switch on the length of the name. */
5091 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5092 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5095 /* Names all of length 5. */
5096 /* alnum alpha ascii blank cntrl digit graph lower
5097 print punct space upper */
5098 /* Offset 4 gives the best switch position. */
5099 switch (posixcc[4]) {
5101 if (memEQ(posixcc, "alph", 4)) /* alpha */
5102 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5105 if (memEQ(posixcc, "spac", 4)) /* space */
5106 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5109 if (memEQ(posixcc, "grap", 4)) /* graph */
5110 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5113 if (memEQ(posixcc, "asci", 4)) /* ascii */
5114 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5117 if (memEQ(posixcc, "blan", 4)) /* blank */
5118 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5121 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5122 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5125 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5126 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5129 if (memEQ(posixcc, "lowe", 4)) /* lower */
5130 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5131 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5132 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5135 if (memEQ(posixcc, "digi", 4)) /* digit */
5136 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5137 else if (memEQ(posixcc, "prin", 4)) /* print */
5138 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5139 else if (memEQ(posixcc, "punc", 4)) /* punct */
5140 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5145 if (memEQ(posixcc, "xdigit", 6))
5146 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5150 if (namedclass == OOB_NAMEDCLASS)
5151 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5153 assert (posixcc[skip] == ':');
5154 assert (posixcc[skip+1] == ']');
5155 } else if (!SIZE_ONLY) {
5156 /* [[=foo=]] and [[.foo.]] are still future. */
5158 /* adjust RExC_parse so the warning shows after
5160 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5162 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5165 /* Maternal grandfather:
5166 * "[:" ending in ":" but not in ":]" */
5176 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5179 if (POSIXCC(UCHARAT(RExC_parse))) {
5180 const char *s = RExC_parse;
5181 const char c = *s++;
5185 if (*s && c == *s && s[1] == ']') {
5186 if (ckWARN(WARN_REGEXP))
5188 "POSIX syntax [%c %c] belongs inside character classes",
5191 /* [[=foo=]] and [[.foo.]] are still future. */
5192 if (POSIXCC_NOTYET(c)) {
5193 /* adjust RExC_parse so the error shows after
5195 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5197 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5205 parse a class specification and produce either an ANYOF node that
5206 matches the pattern. If the pattern matches a single char only and
5207 that char is < 256 then we produce an EXACT node instead.
5210 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5214 register UV nextvalue;
5215 register IV prevvalue = OOB_UNICODE;
5216 register IV range = 0;
5217 register regnode *ret;
5220 char *rangebegin = NULL;
5221 bool need_class = 0;
5224 bool optimize_invert = TRUE;
5225 AV* unicode_alternate = NULL;
5227 UV literal_endpoint = 0;
5229 UV stored = 0; /* number of chars stored in the class */
5231 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5232 case we need to change the emitted regop to an EXACT. */
5233 const char * orig_parse = RExC_parse;
5234 GET_RE_DEBUG_FLAGS_DECL;
5236 PERL_UNUSED_ARG(depth);
5239 DEBUG_PARSE("clas");
5241 /* Assume we are going to generate an ANYOF node. */
5242 ret = reganode(pRExC_state, ANYOF, 0);
5245 ANYOF_FLAGS(ret) = 0;
5247 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5251 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5255 RExC_size += ANYOF_SKIP;
5256 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5259 RExC_emit += ANYOF_SKIP;
5261 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5263 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5264 ANYOF_BITMAP_ZERO(ret);
5265 listsv = newSVpvs("# comment\n");
5268 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5270 if (!SIZE_ONLY && POSIXCC(nextvalue))
5271 checkposixcc(pRExC_state);
5273 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5274 if (UCHARAT(RExC_parse) == ']')
5277 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5281 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5284 rangebegin = RExC_parse;
5286 value = utf8n_to_uvchr((U8*)RExC_parse,
5287 RExC_end - RExC_parse,
5288 &numlen, UTF8_ALLOW_DEFAULT);
5289 RExC_parse += numlen;
5292 value = UCHARAT(RExC_parse++);
5294 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5295 if (value == '[' && POSIXCC(nextvalue))
5296 namedclass = regpposixcc(pRExC_state, value);
5297 else if (value == '\\') {
5299 value = utf8n_to_uvchr((U8*)RExC_parse,
5300 RExC_end - RExC_parse,
5301 &numlen, UTF8_ALLOW_DEFAULT);
5302 RExC_parse += numlen;
5305 value = UCHARAT(RExC_parse++);
5306 /* Some compilers cannot handle switching on 64-bit integer
5307 * values, therefore value cannot be an UV. Yes, this will
5308 * be a problem later if we want switch on Unicode.
5309 * A similar issue a little bit later when switching on
5310 * namedclass. --jhi */
5311 switch ((I32)value) {
5312 case 'w': namedclass = ANYOF_ALNUM; break;
5313 case 'W': namedclass = ANYOF_NALNUM; break;
5314 case 's': namedclass = ANYOF_SPACE; break;
5315 case 'S': namedclass = ANYOF_NSPACE; break;
5316 case 'd': namedclass = ANYOF_DIGIT; break;
5317 case 'D': namedclass = ANYOF_NDIGIT; break;
5322 if (RExC_parse >= RExC_end)
5323 vFAIL2("Empty \\%c{}", (U8)value);
5324 if (*RExC_parse == '{') {
5325 const U8 c = (U8)value;
5326 e = strchr(RExC_parse++, '}');
5328 vFAIL2("Missing right brace on \\%c{}", c);
5329 while (isSPACE(UCHARAT(RExC_parse)))
5331 if (e == RExC_parse)
5332 vFAIL2("Empty \\%c{}", c);
5334 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5342 if (UCHARAT(RExC_parse) == '^') {
5345 value = value == 'p' ? 'P' : 'p'; /* toggle */
5346 while (isSPACE(UCHARAT(RExC_parse))) {
5351 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5352 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5355 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5356 namedclass = ANYOF_MAX; /* no official name, but it's named */
5359 case 'n': value = '\n'; break;
5360 case 'r': value = '\r'; break;
5361 case 't': value = '\t'; break;
5362 case 'f': value = '\f'; break;
5363 case 'b': value = '\b'; break;
5364 case 'e': value = ASCII_TO_NATIVE('\033');break;
5365 case 'a': value = ASCII_TO_NATIVE('\007');break;
5367 if (*RExC_parse == '{') {
5368 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5369 | PERL_SCAN_DISALLOW_PREFIX;
5370 char * const e = strchr(RExC_parse++, '}');
5372 vFAIL("Missing right brace on \\x{}");
5374 numlen = e - RExC_parse;
5375 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5379 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5381 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5382 RExC_parse += numlen;
5386 value = UCHARAT(RExC_parse++);
5387 value = toCTRL(value);
5389 case '0': case '1': case '2': case '3': case '4':
5390 case '5': case '6': case '7': case '8': case '9':
5394 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5395 RExC_parse += numlen;
5399 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5401 "Unrecognized escape \\%c in character class passed through",
5405 } /* end of \blah */
5411 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5413 if (!SIZE_ONLY && !need_class)
5414 ANYOF_CLASS_ZERO(ret);
5418 /* a bad range like a-\d, a-[:digit:] ? */
5421 if (ckWARN(WARN_REGEXP)) {
5423 RExC_parse >= rangebegin ?
5424 RExC_parse - rangebegin : 0;
5426 "False [] range \"%*.*s\"",
5429 if (prevvalue < 256) {
5430 ANYOF_BITMAP_SET(ret, prevvalue);
5431 ANYOF_BITMAP_SET(ret, '-');
5434 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5435 Perl_sv_catpvf(aTHX_ listsv,
5436 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5440 range = 0; /* this was not a true range */
5444 const char *what = NULL;
5447 if (namedclass > OOB_NAMEDCLASS)
5448 optimize_invert = FALSE;
5449 /* Possible truncation here but in some 64-bit environments
5450 * the compiler gets heartburn about switch on 64-bit values.
5451 * A similar issue a little earlier when switching on value.
5453 switch ((I32)namedclass) {
5456 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5458 for (value = 0; value < 256; value++)
5460 ANYOF_BITMAP_SET(ret, value);
5467 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5469 for (value = 0; value < 256; value++)
5470 if (!isALNUM(value))
5471 ANYOF_BITMAP_SET(ret, value);
5478 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5480 for (value = 0; value < 256; value++)
5481 if (isALNUMC(value))
5482 ANYOF_BITMAP_SET(ret, value);
5489 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5491 for (value = 0; value < 256; value++)
5492 if (!isALNUMC(value))
5493 ANYOF_BITMAP_SET(ret, value);
5500 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5502 for (value = 0; value < 256; value++)
5504 ANYOF_BITMAP_SET(ret, value);
5511 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5513 for (value = 0; value < 256; value++)
5514 if (!isALPHA(value))
5515 ANYOF_BITMAP_SET(ret, value);
5522 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5525 for (value = 0; value < 128; value++)
5526 ANYOF_BITMAP_SET(ret, value);
5528 for (value = 0; value < 256; value++) {
5530 ANYOF_BITMAP_SET(ret, value);
5539 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5542 for (value = 128; value < 256; value++)
5543 ANYOF_BITMAP_SET(ret, value);
5545 for (value = 0; value < 256; value++) {
5546 if (!isASCII(value))
5547 ANYOF_BITMAP_SET(ret, value);
5556 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5558 for (value = 0; value < 256; value++)
5560 ANYOF_BITMAP_SET(ret, value);
5567 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5569 for (value = 0; value < 256; value++)
5570 if (!isBLANK(value))
5571 ANYOF_BITMAP_SET(ret, value);
5578 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5580 for (value = 0; value < 256; value++)
5582 ANYOF_BITMAP_SET(ret, value);
5589 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5591 for (value = 0; value < 256; value++)
5592 if (!isCNTRL(value))
5593 ANYOF_BITMAP_SET(ret, value);
5600 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5602 /* consecutive digits assumed */
5603 for (value = '0'; value <= '9'; value++)
5604 ANYOF_BITMAP_SET(ret, value);
5611 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5613 /* consecutive digits assumed */
5614 for (value = 0; value < '0'; value++)
5615 ANYOF_BITMAP_SET(ret, value);
5616 for (value = '9' + 1; value < 256; value++)
5617 ANYOF_BITMAP_SET(ret, value);
5624 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5626 for (value = 0; value < 256; value++)
5628 ANYOF_BITMAP_SET(ret, value);
5635 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5637 for (value = 0; value < 256; value++)
5638 if (!isGRAPH(value))
5639 ANYOF_BITMAP_SET(ret, value);
5646 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5648 for (value = 0; value < 256; value++)
5650 ANYOF_BITMAP_SET(ret, value);
5657 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5659 for (value = 0; value < 256; value++)
5660 if (!isLOWER(value))
5661 ANYOF_BITMAP_SET(ret, value);
5668 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5670 for (value = 0; value < 256; value++)
5672 ANYOF_BITMAP_SET(ret, value);
5679 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5681 for (value = 0; value < 256; value++)
5682 if (!isPRINT(value))
5683 ANYOF_BITMAP_SET(ret, value);
5690 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5692 for (value = 0; value < 256; value++)
5693 if (isPSXSPC(value))
5694 ANYOF_BITMAP_SET(ret, value);
5701 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5703 for (value = 0; value < 256; value++)
5704 if (!isPSXSPC(value))
5705 ANYOF_BITMAP_SET(ret, value);
5712 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5714 for (value = 0; value < 256; value++)
5716 ANYOF_BITMAP_SET(ret, value);
5723 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5725 for (value = 0; value < 256; value++)
5726 if (!isPUNCT(value))
5727 ANYOF_BITMAP_SET(ret, value);
5734 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5736 for (value = 0; value < 256; value++)
5738 ANYOF_BITMAP_SET(ret, value);
5745 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5747 for (value = 0; value < 256; value++)
5748 if (!isSPACE(value))
5749 ANYOF_BITMAP_SET(ret, value);
5756 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5758 for (value = 0; value < 256; value++)
5760 ANYOF_BITMAP_SET(ret, value);
5767 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5769 for (value = 0; value < 256; value++)
5770 if (!isUPPER(value))
5771 ANYOF_BITMAP_SET(ret, value);
5778 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5780 for (value = 0; value < 256; value++)
5781 if (isXDIGIT(value))
5782 ANYOF_BITMAP_SET(ret, value);
5789 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5791 for (value = 0; value < 256; value++)
5792 if (!isXDIGIT(value))
5793 ANYOF_BITMAP_SET(ret, value);
5799 /* this is to handle \p and \P */
5802 vFAIL("Invalid [::] class");
5806 /* Strings such as "+utf8::isWord\n" */
5807 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5810 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5813 } /* end of namedclass \blah */
5816 if (prevvalue > (IV)value) /* b-a */ {
5817 const int w = RExC_parse - rangebegin;
5818 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5819 range = 0; /* not a valid range */
5823 prevvalue = value; /* save the beginning of the range */
5824 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5825 RExC_parse[1] != ']') {
5828 /* a bad range like \w-, [:word:]- ? */
5829 if (namedclass > OOB_NAMEDCLASS) {
5830 if (ckWARN(WARN_REGEXP)) {
5832 RExC_parse >= rangebegin ?
5833 RExC_parse - rangebegin : 0;
5835 "False [] range \"%*.*s\"",
5839 ANYOF_BITMAP_SET(ret, '-');
5841 range = 1; /* yeah, it's a range! */
5842 continue; /* but do it the next time */
5846 /* now is the next time */
5847 /*stored += (value - prevvalue + 1);*/
5849 if (prevvalue < 256) {
5850 const IV ceilvalue = value < 256 ? value : 255;
5853 /* In EBCDIC [\x89-\x91] should include
5854 * the \x8e but [i-j] should not. */
5855 if (literal_endpoint == 2 &&
5856 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5857 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5859 if (isLOWER(prevvalue)) {
5860 for (i = prevvalue; i <= ceilvalue; i++)
5862 ANYOF_BITMAP_SET(ret, i);
5864 for (i = prevvalue; i <= ceilvalue; i++)
5866 ANYOF_BITMAP_SET(ret, i);
5871 for (i = prevvalue; i <= ceilvalue; i++) {
5872 if (!ANYOF_BITMAP_TEST(ret,i)) {
5874 ANYOF_BITMAP_SET(ret, i);
5878 if (value > 255 || UTF) {
5879 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5880 const UV natvalue = NATIVE_TO_UNI(value);
5881 stored+=2; /* can't optimize this class */
5882 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5883 if (prevnatvalue < natvalue) { /* what about > ? */
5884 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5885 prevnatvalue, natvalue);
5887 else if (prevnatvalue == natvalue) {
5888 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5890 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5892 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5894 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
5895 if (RExC_precomp[0] == ':' &&
5896 RExC_precomp[1] == '[' &&
5897 (f == 0xDF || f == 0x92)) {
5898 f = NATIVE_TO_UNI(f);
5901 /* If folding and foldable and a single
5902 * character, insert also the folded version
5903 * to the charclass. */
5905 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
5906 if ((RExC_precomp[0] == ':' &&
5907 RExC_precomp[1] == '[' &&
5909 (value == 0xFB05 || value == 0xFB06))) ?
5910 foldlen == ((STRLEN)UNISKIP(f) - 1) :
5911 foldlen == (STRLEN)UNISKIP(f) )
5913 if (foldlen == (STRLEN)UNISKIP(f))
5915 Perl_sv_catpvf(aTHX_ listsv,
5918 /* Any multicharacter foldings
5919 * require the following transform:
5920 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5921 * where E folds into "pq" and F folds
5922 * into "rst", all other characters
5923 * fold to single characters. We save
5924 * away these multicharacter foldings,
5925 * to be later saved as part of the
5926 * additional "s" data. */
5929 if (!unicode_alternate)
5930 unicode_alternate = newAV();
5931 sv = newSVpvn((char*)foldbuf, foldlen);
5933 av_push(unicode_alternate, sv);
5937 /* If folding and the value is one of the Greek
5938 * sigmas insert a few more sigmas to make the
5939 * folding rules of the sigmas to work right.
5940 * Note that not all the possible combinations
5941 * are handled here: some of them are handled
5942 * by the standard folding rules, and some of
5943 * them (literal or EXACTF cases) are handled
5944 * during runtime in regexec.c:S_find_byclass(). */
5945 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5946 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5947 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5948 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5949 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5951 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5952 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5953 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5958 literal_endpoint = 0;
5962 range = 0; /* this range (if it was one) is done now */
5966 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5968 RExC_size += ANYOF_CLASS_ADD_SKIP;
5970 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5976 /****** !SIZE_ONLY AFTER HERE *********/
5978 if( stored == 1 && value < 256
5979 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5981 /* optimize single char class to an EXACT node
5982 but *only* when its not a UTF/high char */
5983 const char * cur_parse= RExC_parse;
5984 RExC_emit = (regnode *)orig_emit;
5985 RExC_parse = (char *)orig_parse;
5986 ret = reg_node(pRExC_state,
5987 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5988 RExC_parse = (char *)cur_parse;
5989 *STRING(ret)= (char)value;
5991 RExC_emit += STR_SZ(1);
5994 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5995 if ( /* If the only flag is folding (plus possibly inversion). */
5996 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5998 for (value = 0; value < 256; ++value) {
5999 if (ANYOF_BITMAP_TEST(ret, value)) {
6000 UV fold = PL_fold[value];
6003 ANYOF_BITMAP_SET(ret, fold);
6006 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
6009 /* optimize inverted simple patterns (e.g. [^a-z]) */
6010 if (optimize_invert &&
6011 /* If the only flag is inversion. */
6012 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
6013 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
6014 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
6015 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
6018 AV * const av = newAV();
6020 /* The 0th element stores the character class description
6021 * in its textual form: used later (regexec.c:Perl_regclass_swash())
6022 * to initialize the appropriate swash (which gets stored in
6023 * the 1st element), and also useful for dumping the regnode.
6024 * The 2nd element stores the multicharacter foldings,
6025 * used later (regexec.c:S_reginclass()). */
6026 av_store(av, 0, listsv);
6027 av_store(av, 1, NULL);
6028 av_store(av, 2, (SV*)unicode_alternate);
6029 rv = newRV_noinc((SV*)av);
6030 n = add_data(pRExC_state, 1, "s");
6031 RExC_rx->data->data[n] = (void*)rv;
6038 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
6040 char* const retval = RExC_parse++;
6043 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6044 RExC_parse[2] == '#') {
6045 while (*RExC_parse != ')') {
6046 if (RExC_parse == RExC_end)
6047 FAIL("Sequence (?#... not terminated");
6053 if (RExC_flags & PMf_EXTENDED) {
6054 if (isSPACE(*RExC_parse)) {
6058 else if (*RExC_parse == '#') {
6059 while (RExC_parse < RExC_end)
6060 if (*RExC_parse++ == '\n') break;
6069 - reg_node - emit a node
6071 STATIC regnode * /* Location. */
6072 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
6075 register regnode *ptr;
6076 regnode * const ret = RExC_emit;
6077 GET_RE_DEBUG_FLAGS_DECL;
6080 SIZE_ALIGN(RExC_size);
6084 NODE_ALIGN_FILL(ret);
6086 FILL_ADVANCE_NODE(ptr, op);
6087 if (RExC_offsets) { /* MJD */
6088 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6089 "reg_node", __LINE__,
6091 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6092 ? "Overwriting end of array!\n" : "OK",
6093 (UV)(RExC_emit - RExC_emit_start),
6094 (UV)(RExC_parse - RExC_start),
6095 (UV)RExC_offsets[0]));
6096 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6105 - reganode - emit a node with an argument
6107 STATIC regnode * /* Location. */
6108 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6111 register regnode *ptr;
6112 regnode * const ret = RExC_emit;
6113 GET_RE_DEBUG_FLAGS_DECL;
6116 SIZE_ALIGN(RExC_size);
6121 NODE_ALIGN_FILL(ret);
6123 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6124 if (RExC_offsets) { /* MJD */
6125 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6129 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6130 "Overwriting end of array!\n" : "OK",
6131 (UV)(RExC_emit - RExC_emit_start),
6132 (UV)(RExC_parse - RExC_start),
6133 (UV)RExC_offsets[0]));
6134 Set_Cur_Node_Offset;
6143 - reguni - emit (if appropriate) a Unicode character
6146 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6149 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6153 - reginsert - insert an operator in front of already-emitted operand
6155 * Means relocating the operand.
6158 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6161 register regnode *src;
6162 register regnode *dst;
6163 register regnode *place;
6164 const int offset = regarglen[(U8)op];
6165 GET_RE_DEBUG_FLAGS_DECL;
6166 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6169 RExC_size += NODE_STEP_REGNODE + offset;
6174 RExC_emit += NODE_STEP_REGNODE + offset;
6176 while (src > opnd) {
6177 StructCopy(--src, --dst, regnode);
6178 if (RExC_offsets) { /* MJD 20010112 */
6179 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6183 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6184 ? "Overwriting end of array!\n" : "OK",
6185 (UV)(src - RExC_emit_start),
6186 (UV)(dst - RExC_emit_start),
6187 (UV)RExC_offsets[0]));
6188 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6189 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6194 place = opnd; /* Op node, where operand used to be. */
6195 if (RExC_offsets) { /* MJD */
6196 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6200 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6201 ? "Overwriting end of array!\n" : "OK",
6202 (UV)(place - RExC_emit_start),
6203 (UV)(RExC_parse - RExC_start),
6205 Set_Node_Offset(place, RExC_parse);
6206 Set_Node_Length(place, 1);
6208 src = NEXTOPER(place);
6209 FILL_ADVANCE_NODE(place, op);
6210 Zero(src, offset, regnode);
6214 - regtail - set the next-pointer at the end of a node chain of p to val.
6215 - SEE ALSO: regtail_study
6217 /* TODO: All three parms should be const */
6219 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6222 register regnode *scan;
6223 GET_RE_DEBUG_FLAGS_DECL;
6225 PERL_UNUSED_ARG(depth);
6231 /* Find last node. */
6234 regnode * const temp = regnext(scan);
6236 SV * const mysv=sv_newmortal();
6237 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6238 regprop(RExC_rx, mysv, scan);
6239 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6240 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6247 if (reg_off_by_arg[OP(scan)]) {
6248 ARG_SET(scan, val - scan);
6251 NEXT_OFF(scan) = val - scan;
6257 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6258 - Look for optimizable sequences at the same time.
6259 - currently only looks for EXACT chains.
6261 This is expermental code. The idea is to use this routine to perform
6262 in place optimizations on branches and groups as they are constructed,
6263 with the long term intention of removing optimization from study_chunk so
6264 that it is purely analytical.
6266 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6267 to control which is which.
6270 /* TODO: All four parms should be const */
6273 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6276 register regnode *scan;
6278 #ifdef EXPERIMENTAL_INPLACESCAN
6282 GET_RE_DEBUG_FLAGS_DECL;
6288 /* Find last node. */
6292 regnode * const temp = regnext(scan);
6293 #ifdef EXPERIMENTAL_INPLACESCAN
6294 if (PL_regkind[OP(scan)] == EXACT)
6295 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6303 if( exact == PSEUDO )
6305 else if ( exact != OP(scan) )
6314 SV * const mysv=sv_newmortal();
6315 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6316 regprop(RExC_rx, mysv, scan);
6317 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6318 SvPV_nolen_const(mysv),
6320 REG_NODE_NUM(scan));
6327 SV * const mysv_val=sv_newmortal();
6328 DEBUG_PARSE_MSG("");
6329 regprop(RExC_rx, mysv_val, val);
6330 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6331 SvPV_nolen_const(mysv_val),
6336 if (reg_off_by_arg[OP(scan)]) {
6337 ARG_SET(scan, val - scan);
6340 NEXT_OFF(scan) = val - scan;
6348 - regcurly - a little FSA that accepts {\d+,?\d*}
6351 S_regcurly(register const char *s)
6370 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6373 Perl_regdump(pTHX_ const regexp *r)
6377 SV * const sv = sv_newmortal();
6378 SV *dsv= sv_newmortal();
6380 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6382 /* Header fields of interest. */
6383 if (r->anchored_substr) {
6384 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6385 RE_SV_DUMPLEN(r->anchored_substr), 30);
6386 PerlIO_printf(Perl_debug_log,
6387 "anchored %s%s at %"IVdf" ",
6388 s, RE_SV_TAIL(r->anchored_substr),
6389 (IV)r->anchored_offset);
6390 } else if (r->anchored_utf8) {
6391 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6392 RE_SV_DUMPLEN(r->anchored_utf8), 30);
6393 PerlIO_printf(Perl_debug_log,
6394 "anchored utf8 %s%s at %"IVdf" ",
6395 s, RE_SV_TAIL(r->anchored_utf8),
6396 (IV)r->anchored_offset);
6398 if (r->float_substr) {
6399 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6400 RE_SV_DUMPLEN(r->float_substr), 30);
6401 PerlIO_printf(Perl_debug_log,
6402 "floating %s%s at %"IVdf"..%"UVuf" ",
6403 s, RE_SV_TAIL(r->float_substr),
6404 (IV)r->float_min_offset, (UV)r->float_max_offset);
6405 } else if (r->float_utf8) {
6406 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6407 RE_SV_DUMPLEN(r->float_utf8), 30);
6408 PerlIO_printf(Perl_debug_log,
6409 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6410 s, RE_SV_TAIL(r->float_utf8),
6411 (IV)r->float_min_offset, (UV)r->float_max_offset);
6413 if (r->check_substr || r->check_utf8)
6414 PerlIO_printf(Perl_debug_log,
6416 (r->check_substr == r->float_substr
6417 && r->check_utf8 == r->float_utf8
6418 ? "(checking floating" : "(checking anchored"));
6419 if (r->reganch & ROPT_NOSCAN)
6420 PerlIO_printf(Perl_debug_log, " noscan");
6421 if (r->reganch & ROPT_CHECK_ALL)
6422 PerlIO_printf(Perl_debug_log, " isall");
6423 if (r->check_substr || r->check_utf8)
6424 PerlIO_printf(Perl_debug_log, ") ");
6426 if (r->regstclass) {
6427 regprop(r, sv, r->regstclass);
6428 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6430 if (r->reganch & ROPT_ANCH) {
6431 PerlIO_printf(Perl_debug_log, "anchored");
6432 if (r->reganch & ROPT_ANCH_BOL)
6433 PerlIO_printf(Perl_debug_log, "(BOL)");
6434 if (r->reganch & ROPT_ANCH_MBOL)
6435 PerlIO_printf(Perl_debug_log, "(MBOL)");
6436 if (r->reganch & ROPT_ANCH_SBOL)
6437 PerlIO_printf(Perl_debug_log, "(SBOL)");
6438 if (r->reganch & ROPT_ANCH_GPOS)
6439 PerlIO_printf(Perl_debug_log, "(GPOS)");
6440 PerlIO_putc(Perl_debug_log, ' ');
6442 if (r->reganch & ROPT_GPOS_SEEN)
6443 PerlIO_printf(Perl_debug_log, "GPOS ");
6444 if (r->reganch & ROPT_SKIP)
6445 PerlIO_printf(Perl_debug_log, "plus ");
6446 if (r->reganch & ROPT_IMPLICIT)
6447 PerlIO_printf(Perl_debug_log, "implicit ");
6448 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6449 if (r->reganch & ROPT_EVAL_SEEN)
6450 PerlIO_printf(Perl_debug_log, "with eval ");
6451 PerlIO_printf(Perl_debug_log, "\n");
6453 PERL_UNUSED_CONTEXT;
6455 #endif /* DEBUGGING */
6459 - regprop - printable representation of opcode
6462 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6468 sv_setpvn(sv, "", 0);
6469 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6470 /* It would be nice to FAIL() here, but this may be called from
6471 regexec.c, and it would be hard to supply pRExC_state. */
6472 Perl_croak(aTHX_ "Corrupted regexp opcode");
6473 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6475 k = PL_regkind[OP(o)];
6478 SV * const dsv = sv_2mortal(newSVpvs(""));
6479 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6480 * is a crude hack but it may be the best for now since
6481 * we have no flag "this EXACTish node was UTF-8"
6483 const char * const s =
6484 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
6485 PL_colors[0], PL_colors[1],
6486 PERL_PV_ESCAPE_UNI_DETECT |
6487 PERL_PV_PRETTY_ELIPSES |
6490 Perl_sv_catpvf(aTHX_ sv, " %s", s );
6491 } else if (k == TRIE) {
6492 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6493 /* print the details of the trie in dumpuntil instead, as
6494 * prog->data isn't available here */
6495 } else if (k == CURLY) {
6496 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6497 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6498 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6500 else if (k == WHILEM && o->flags) /* Ordinal/of */
6501 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6502 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6503 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6504 else if (k == LOGICAL)
6505 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6506 else if (k == ANYOF) {
6507 int i, rangestart = -1;
6508 const U8 flags = ANYOF_FLAGS(o);
6510 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6511 static const char * const anyofs[] = {
6544 if (flags & ANYOF_LOCALE)
6545 sv_catpvs(sv, "{loc}");
6546 if (flags & ANYOF_FOLD)
6547 sv_catpvs(sv, "{i}");
6548 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6549 if (flags & ANYOF_INVERT)
6551 for (i = 0; i <= 256; i++) {
6552 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6553 if (rangestart == -1)
6555 } else if (rangestart != -1) {
6556 if (i <= rangestart + 3)
6557 for (; rangestart < i; rangestart++)
6558 put_byte(sv, rangestart);
6560 put_byte(sv, rangestart);
6562 put_byte(sv, i - 1);
6568 if (o->flags & ANYOF_CLASS)
6569 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6570 if (ANYOF_CLASS_TEST(o,i))
6571 sv_catpv(sv, anyofs[i]);
6573 if (flags & ANYOF_UNICODE)
6574 sv_catpvs(sv, "{unicode}");
6575 else if (flags & ANYOF_UNICODE_ALL)
6576 sv_catpvs(sv, "{unicode_all}");
6580 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6584 U8 s[UTF8_MAXBYTES_CASE+1];
6586 for (i = 0; i <= 256; i++) { /* just the first 256 */
6587 uvchr_to_utf8(s, i);
6589 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6590 if (rangestart == -1)
6592 } else if (rangestart != -1) {
6593 if (i <= rangestart + 3)
6594 for (; rangestart < i; rangestart++) {
6595 const U8 * const e = uvchr_to_utf8(s,rangestart);
6597 for(p = s; p < e; p++)
6601 const U8 *e = uvchr_to_utf8(s,rangestart);
6603 for (p = s; p < e; p++)
6606 e = uvchr_to_utf8(s, i-1);
6607 for (p = s; p < e; p++)
6614 sv_catpvs(sv, "..."); /* et cetera */
6618 char *s = savesvpv(lv);
6619 char * const origs = s;
6621 while (*s && *s != '\n')
6625 const char * const t = ++s;
6643 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6645 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6646 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6648 PERL_UNUSED_CONTEXT;
6649 PERL_UNUSED_ARG(sv);
6651 PERL_UNUSED_ARG(prog);
6652 #endif /* DEBUGGING */
6656 Perl_re_intuit_string(pTHX_ regexp *prog)
6657 { /* Assume that RE_INTUIT is set */
6659 GET_RE_DEBUG_FLAGS_DECL;
6660 PERL_UNUSED_CONTEXT;
6664 const char * const s = SvPV_nolen_const(prog->check_substr
6665 ? prog->check_substr : prog->check_utf8);
6667 if (!PL_colorset) reginitcolors();
6668 PerlIO_printf(Perl_debug_log,
6669 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6671 prog->check_substr ? "" : "utf8 ",
6672 PL_colors[5],PL_colors[0],
6675 (strlen(s) > 60 ? "..." : ""));
6678 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6682 Perl_pregfree(pTHX_ struct regexp *r)
6688 GET_RE_DEBUG_FLAGS_DECL;
6690 if (!r || (--r->refcnt > 0))
6696 SV *dsv= sv_newmortal();
6697 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
6698 dsv, r->precomp, r->prelen, 60);
6699 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
6700 PL_colors[4],PL_colors[5],s);
6704 /* gcov results gave these as non-null 100% of the time, so there's no
6705 optimisation in checking them before calling Safefree */
6706 Safefree(r->precomp);
6707 Safefree(r->offsets); /* 20010421 MJD */
6708 RX_MATCH_COPY_FREE(r);
6709 #ifdef PERL_OLD_COPY_ON_WRITE
6711 SvREFCNT_dec(r->saved_copy);
6714 if (r->anchored_substr)
6715 SvREFCNT_dec(r->anchored_substr);
6716 if (r->anchored_utf8)
6717 SvREFCNT_dec(r->anchored_utf8);
6718 if (r->float_substr)
6719 SvREFCNT_dec(r->float_substr);
6721 SvREFCNT_dec(r->float_utf8);
6722 Safefree(r->substrs);
6725 int n = r->data->count;
6726 PAD* new_comppad = NULL;
6731 /* If you add a ->what type here, update the comment in regcomp.h */
6732 switch (r->data->what[n]) {
6734 SvREFCNT_dec((SV*)r->data->data[n]);
6737 Safefree(r->data->data[n]);
6740 new_comppad = (AV*)r->data->data[n];
6743 if (new_comppad == NULL)
6744 Perl_croak(aTHX_ "panic: pregfree comppad");
6745 PAD_SAVE_LOCAL(old_comppad,
6746 /* Watch out for global destruction's random ordering. */
6747 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6750 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6753 op_free((OP_4tree*)r->data->data[n]);
6755 PAD_RESTORE_LOCAL(old_comppad);
6756 SvREFCNT_dec((SV*)new_comppad);
6762 { /* Aho Corasick add-on structure for a trie node.
6763 Used in stclass optimization only */
6765 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6767 refcount = --aho->refcount;
6770 Safefree(aho->states);
6771 Safefree(aho->fail);
6772 aho->trie=NULL; /* not necessary to free this as it is
6773 handled by the 't' case */
6774 Safefree(r->data->data[n]); /* do this last!!!! */
6775 Safefree(r->regstclass);
6781 /* trie structure. */
6783 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6785 refcount = --trie->refcount;
6788 Safefree(trie->charmap);
6789 if (trie->widecharmap)
6790 SvREFCNT_dec((SV*)trie->widecharmap);
6791 Safefree(trie->states);
6792 Safefree(trie->trans);
6794 Safefree(trie->bitmap);
6796 Safefree(trie->wordlen);
6800 SvREFCNT_dec((SV*)trie->words);
6801 if (trie->revcharmap)
6802 SvREFCNT_dec((SV*)trie->revcharmap);
6805 Safefree(r->data->data[n]); /* do this last!!!! */
6810 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6813 Safefree(r->data->what);
6816 Safefree(r->startp);
6821 #ifndef PERL_IN_XSUB_RE
6823 - regnext - dig the "next" pointer out of a node
6826 Perl_regnext(pTHX_ register regnode *p)
6829 register I32 offset;
6831 if (p == &PL_regdummy)
6834 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6843 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6846 STRLEN l1 = strlen(pat1);
6847 STRLEN l2 = strlen(pat2);
6850 const char *message;
6856 Copy(pat1, buf, l1 , char);
6857 Copy(pat2, buf + l1, l2 , char);
6858 buf[l1 + l2] = '\n';
6859 buf[l1 + l2 + 1] = '\0';
6861 /* ANSI variant takes additional second argument */
6862 va_start(args, pat2);
6866 msv = vmess(buf, &args);
6868 message = SvPV_const(msv,l1);
6871 Copy(message, buf, l1 , char);
6872 buf[l1-1] = '\0'; /* Overwrite \n */
6873 Perl_croak(aTHX_ "%s", buf);
6876 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6878 #ifndef PERL_IN_XSUB_RE
6880 Perl_save_re_context(pTHX)
6884 struct re_save_state *state;
6886 SAVEVPTR(PL_curcop);
6887 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6889 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6890 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6891 SSPUSHINT(SAVEt_RE_STATE);
6893 Copy(&PL_reg_state, state, 1, struct re_save_state);
6895 PL_reg_start_tmp = 0;
6896 PL_reg_start_tmpl = 0;
6897 PL_reg_oldsaved = NULL;
6898 PL_reg_oldsavedlen = 0;
6900 PL_reg_leftiter = 0;
6901 PL_reg_poscache = NULL;
6902 PL_reg_poscache_size = 0;
6903 #ifdef PERL_OLD_COPY_ON_WRITE
6907 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6909 const REGEXP * const rx = PM_GETRE(PL_curpm);
6912 for (i = 1; i <= rx->nparens; i++) {
6913 char digits[TYPE_CHARS(long)];
6914 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6915 GV *const *const gvp
6916 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6919 GV * const gv = *gvp;
6920 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6930 clear_re(pTHX_ void *r)
6933 ReREFCNT_dec((regexp *)r);
6939 S_put_byte(pTHX_ SV *sv, int c)
6941 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6942 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6943 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6944 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6946 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6949 #define CLEAR_OPTSTART \
6950 if (optstart) STMT_START { \
6951 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6955 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6957 STATIC const regnode *
6958 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6959 const regnode *last, SV* sv, I32 l)
6962 register U8 op = EXACT; /* Arbitrary non-END op. */
6963 register const regnode *next;
6964 const regnode *optstart= NULL;
6965 GET_RE_DEBUG_FLAGS_DECL;
6967 while (op != END && (!last || node < last)) {
6968 /* While that wasn't END last time... */
6974 next = regnext((regnode *)node);
6977 if (OP(node) == OPTIMIZED) {
6978 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
6985 regprop(r, sv, node);
6986 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6987 (int)(2*l + 1), "", SvPVX_const(sv));
6989 if (OP(node) != OPTIMIZED) {
6990 if (next == NULL) /* Next ptr. */
6991 PerlIO_printf(Perl_debug_log, "(0)");
6993 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6994 (void)PerlIO_putc(Perl_debug_log, '\n');
6998 if (PL_regkind[(U8)op] == BRANCHJ) {
7001 register const regnode *nnode = (OP(next) == LONGJMP
7002 ? regnext((regnode *)next)
7004 if (last && nnode > last)
7006 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
7009 else if (PL_regkind[(U8)op] == BRANCH) {
7011 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
7013 else if ( PL_regkind[(U8)op] == TRIE ) {
7014 const I32 n = ARG(node);
7015 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
7016 const I32 arry_len = av_len(trie->words)+1;
7018 PerlIO_printf(Perl_debug_log,
7019 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
7023 TRIE_WORDCOUNT(trie),
7024 (int)TRIE_CHARCOUNT(trie),
7025 trie->uniquecharcount,
7026 (IV)TRIE_LASTSTATE(trie)-1,
7033 sv_setpvn(sv, "", 0);
7034 for (i = 0; i <= 256; i++) {
7035 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
7036 if (rangestart == -1)
7038 } else if (rangestart != -1) {
7039 if (i <= rangestart + 3)
7040 for (; rangestart < i; rangestart++)
7041 put_byte(sv, rangestart);
7043 put_byte(sv, rangestart);
7045 put_byte(sv, i - 1);
7050 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
7052 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
7054 for (word_idx=0; word_idx < arry_len; word_idx++) {
7055 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
7057 PerlIO_printf(Perl_debug_log, "%*s%s\n",
7059 pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
7060 PL_colors[0], PL_colors[1],
7061 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7062 PERL_PV_PRETTY_ELIPSES |
7068 node = NEXTOPER(node);
7069 node += regarglen[(U8)op];
7072 else if ( op == CURLY) { /* "next" might be very big: optimizer */
7073 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7074 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7076 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7078 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7081 else if ( op == PLUS || op == STAR) {
7082 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
7084 else if (op == ANYOF) {
7085 /* arglen 1 + class block */
7086 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7087 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7088 node = NEXTOPER(node);
7090 else if (PL_regkind[(U8)op] == EXACT) {
7091 /* Literal string, where present. */
7092 node += NODE_SZ_STR(node) - 1;
7093 node = NEXTOPER(node);
7096 node = NEXTOPER(node);
7097 node += regarglen[(U8)op];
7099 if (op == CURLYX || op == OPEN)
7101 else if (op == WHILEM)
7108 #endif /* DEBUGGING */
7112 * c-indentation-style: bsd
7114 * indent-tabs-mode: t
7117 * ex: set ts=8 sts=4 sw=4 noet: