5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
760 #define TRIE_READ_CHAR STMT_START { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
785 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
801 #define TRIE_LIST_NEW(state) STMT_START { \
802 Newxz( trie->states[ state ].trans.list, \
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
808 #define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
812 trie->wordlen[ curword ] = wordlen; \
813 trie->states[ state ].wordnum = ++curword; \
815 /* store the word for dumping */ \
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
820 tmp = newSVpvn( "", 0 ); \
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
825 NOOP; /* It's a dupe. So ignore it. */ \
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
835 These routines dump out a trie in a somewhat readable format.
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
852 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
855 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
909 PerlIO_printf( Perl_debug_log, "]");
912 PerlIO_printf( Perl_debug_log, "\n" );
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
923 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
932 for( state=1 ; state < next_alloc ; state ++ ) {
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
964 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
968 GET_RE_DEBUG_FLAGS_DECL;
971 print out the table precompression so that we can do a visual check
972 that they are identical.
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1013 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1023 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1025 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
1051 U32 base = trie->states[ 1 ].trans.base;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1056 PERL_UNUSED_ARG(depth);
1058 GET_RE_DEBUG_FLAGS_DECL;
1060 ARG_SET( stclass, data_slot );
1061 Newxz( aho, 1, reg_ac_data );
1062 RExC_rx->data->data[ data_slot ] = (void*)aho;
1064 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1065 (trie->laststate+1)*sizeof(reg_trie_state));
1066 Newxz( q, numstates, U32);
1067 Newxz( aho->fail, numstates, U32 );
1070 fail[ 0 ] = fail[ 1 ] = 1;
1072 for ( charid = 0; charid < ucharcount ; charid++ ) {
1073 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1075 q[ q_write ] = newstate;
1076 /* set to point at the root */
1077 fail[ q[ q_write++ ] ]=1;
1080 while ( q_read < q_write) {
1081 const U32 cur = q[ q_read++ % numstates ];
1082 base = trie->states[ cur ].trans.base;
1084 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1085 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1087 U32 fail_state = cur;
1090 fail_state = fail[ fail_state ];
1091 fail_base = aho->states[ fail_state ].trans.base;
1092 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1094 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1095 fail[ ch_state ] = fail_state;
1096 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1098 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1100 q[ q_write++ % numstates] = ch_state;
1105 DEBUG_TRIE_COMPILE_MORE_r({
1106 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1107 for( q_read=2; q_read<numstates; q_read++ ) {
1108 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1110 PerlIO_printf(Perl_debug_log, "\n");
1113 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1119 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1122 /* first pass, loop through and scan words */
1123 reg_trie_data *trie;
1125 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1130 /* we just use folder as a flag in utf8 */
1131 const U8 * const folder = ( flags == EXACTF
1133 : ( flags == EXACTFL
1139 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1140 SV *re_trie_maxbuff;
1142 /* these are only used during construction but are useful during
1143 * debugging so we store them in the struct when debugging.
1144 * Wordcount is actually superfluous in debugging as we have
1145 * (AV*)trie->words to use for it, but that's not available when
1146 * not debugging... We could make the macro use the AV during
1147 * debugging though...
1149 U16 trie_wordcount=0;
1150 STRLEN trie_charcount=0;
1151 /*U32 trie_laststate=0;*/
1152 AV *trie_revcharmap;
1153 PERL_UNUSED_ARG(depth);
1155 GET_RE_DEBUG_FLAGS_DECL;
1157 Newxz( trie, 1, reg_trie_data );
1159 trie->startstate = 1;
1160 RExC_rx->data->data[ data_slot ] = (void*)trie;
1161 Newxz( trie->charmap, 256, U16 );
1162 if (!(UTF && folder))
1163 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1165 trie->words = newAV();
1167 TRIE_REVCHARMAP(trie) = newAV();
1169 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1170 if (!SvIOK(re_trie_maxbuff)) {
1171 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1174 PerlIO_printf( Perl_debug_log,
1175 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1176 (int)depth * 2 + 2, "",
1177 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1178 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1180 /* -- First loop and Setup --
1182 We first traverse the branches and scan each word to determine if it
1183 contains widechars, and how many unique chars there are, this is
1184 important as we have to build a table with at least as many columns as we
1187 We use an array of integers to represent the character codes 0..255
1188 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1189 native representation of the character value as the key and IV's for the
1192 *TODO* If we keep track of how many times each character is used we can
1193 remap the columns so that the table compression later on is more
1194 efficient in terms of memory by ensuring most common value is in the
1195 middle and the least common are on the outside. IMO this would be better
1196 than a most to least common mapping as theres a decent chance the most
1197 common letter will share a node with the least common, meaning the node
1198 will not be compressable. With a middle is most common approach the worst
1199 case is when we have the least common nodes twice.
1203 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1204 regnode * const noper = NEXTOPER( cur );
1205 const U8 *uc = (U8*)STRING( noper );
1206 const U8 * const e = uc + STR_LEN( noper );
1208 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1209 const U8 *scan = (U8*)NULL;
1210 U32 wordlen = 0; /* required init */
1213 TRIE_WORDCOUNT(trie)++;
1214 if (OP(noper) == NOTHING) {
1219 TRIE_BITMAP_SET(trie,*uc);
1220 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1222 for ( ; uc < e ; uc += len ) {
1223 TRIE_CHARCOUNT(trie)++;
1227 if ( !trie->charmap[ uvc ] ) {
1228 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1230 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1235 if ( !trie->widecharmap )
1236 trie->widecharmap = newHV();
1238 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1241 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1243 if ( !SvTRUE( *svpp ) ) {
1244 sv_setiv( *svpp, ++trie->uniquecharcount );
1249 if( cur == first ) {
1252 } else if (chars < trie->minlen) {
1254 } else if (chars > trie->maxlen) {
1258 } /* end first pass */
1259 DEBUG_TRIE_COMPILE_r(
1260 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1261 (int)depth * 2 + 2,"",
1262 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1263 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1264 (int)trie->minlen, (int)trie->maxlen )
1266 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1269 We now know what we are dealing with in terms of unique chars and
1270 string sizes so we can calculate how much memory a naive
1271 representation using a flat table will take. If it's over a reasonable
1272 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1273 conservative but potentially much slower representation using an array
1276 At the end we convert both representations into the same compressed
1277 form that will be used in regexec.c for matching with. The latter
1278 is a form that cannot be used to construct with but has memory
1279 properties similar to the list form and access properties similar
1280 to the table form making it both suitable for fast searches and
1281 small enough that its feasable to store for the duration of a program.
1283 See the comment in the code where the compressed table is produced
1284 inplace from the flat tabe representation for an explanation of how
1285 the compression works.
1290 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1292 Second Pass -- Array Of Lists Representation
1294 Each state will be represented by a list of charid:state records
1295 (reg_trie_trans_le) the first such element holds the CUR and LEN
1296 points of the allocated array. (See defines above).
1298 We build the initial structure using the lists, and then convert
1299 it into the compressed table form which allows faster lookups
1300 (but cant be modified once converted).
1303 STRLEN transcount = 1;
1305 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1309 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1311 regnode * const noper = NEXTOPER( cur );
1312 U8 *uc = (U8*)STRING( noper );
1313 const U8 * const e = uc + STR_LEN( noper );
1314 U32 state = 1; /* required init */
1315 U16 charid = 0; /* sanity init */
1316 U8 *scan = (U8*)NULL; /* sanity init */
1317 STRLEN foldlen = 0; /* required init */
1318 U32 wordlen = 0; /* required init */
1319 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1321 if (OP(noper) != NOTHING) {
1322 for ( ; uc < e ; uc += len ) {
1327 charid = trie->charmap[ uvc ];
1329 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1333 charid=(U16)SvIV( *svpp );
1342 if ( !trie->states[ state ].trans.list ) {
1343 TRIE_LIST_NEW( state );
1345 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1346 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1347 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1352 newstate = next_alloc++;
1353 TRIE_LIST_PUSH( state, charid, newstate );
1358 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1360 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1363 TRIE_HANDLE_WORD(state);
1365 } /* end second pass */
1367 TRIE_LASTSTATE(trie) = next_alloc;
1368 Renew( trie->states, next_alloc, reg_trie_state );
1370 /* and now dump it out before we compress it */
1371 DEBUG_TRIE_COMPILE_MORE_r(
1372 dump_trie_interim_list(trie,next_alloc,depth+1)
1375 Newxz( trie->trans, transcount ,reg_trie_trans );
1382 for( state=1 ; state < next_alloc ; state ++ ) {
1386 DEBUG_TRIE_COMPILE_MORE_r(
1387 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1391 if (trie->states[state].trans.list) {
1392 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1396 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1397 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1398 if ( forid < minid ) {
1400 } else if ( forid > maxid ) {
1404 if ( transcount < tp + maxid - minid + 1) {
1406 Renew( trie->trans, transcount, reg_trie_trans );
1407 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1409 base = trie->uniquecharcount + tp - minid;
1410 if ( maxid == minid ) {
1412 for ( ; zp < tp ; zp++ ) {
1413 if ( ! trie->trans[ zp ].next ) {
1414 base = trie->uniquecharcount + zp - minid;
1415 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1416 trie->trans[ zp ].check = state;
1422 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1423 trie->trans[ tp ].check = state;
1428 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1429 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1430 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1431 trie->trans[ tid ].check = state;
1433 tp += ( maxid - minid + 1 );
1435 Safefree(trie->states[ state ].trans.list);
1438 DEBUG_TRIE_COMPILE_MORE_r(
1439 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1442 trie->states[ state ].trans.base=base;
1444 trie->lasttrans = tp + 1;
1448 Second Pass -- Flat Table Representation.
1450 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1451 We know that we will need Charcount+1 trans at most to store the data
1452 (one row per char at worst case) So we preallocate both structures
1453 assuming worst case.
1455 We then construct the trie using only the .next slots of the entry
1458 We use the .check field of the first entry of the node temporarily to
1459 make compression both faster and easier by keeping track of how many non
1460 zero fields are in the node.
1462 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1465 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1466 number representing the first entry of the node, and state as a
1467 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1468 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1469 are 2 entrys per node. eg:
1477 The table is internally in the right hand, idx form. However as we also
1478 have to deal with the states array which is indexed by nodenum we have to
1479 use TRIE_NODENUM() to convert.
1484 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1486 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1487 next_alloc = trie->uniquecharcount + 1;
1490 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1492 regnode * const noper = NEXTOPER( cur );
1493 const U8 *uc = (U8*)STRING( noper );
1494 const U8 * const e = uc + STR_LEN( noper );
1496 U32 state = 1; /* required init */
1498 U16 charid = 0; /* sanity init */
1499 U32 accept_state = 0; /* sanity init */
1500 U8 *scan = (U8*)NULL; /* sanity init */
1502 STRLEN foldlen = 0; /* required init */
1503 U32 wordlen = 0; /* required init */
1504 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1506 if ( OP(noper) != NOTHING ) {
1507 for ( ; uc < e ; uc += len ) {
1512 charid = trie->charmap[ uvc ];
1514 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1515 charid = svpp ? (U16)SvIV(*svpp) : 0;
1519 if ( !trie->trans[ state + charid ].next ) {
1520 trie->trans[ state + charid ].next = next_alloc;
1521 trie->trans[ state ].check++;
1522 next_alloc += trie->uniquecharcount;
1524 state = trie->trans[ state + charid ].next;
1526 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1528 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1531 accept_state = TRIE_NODENUM( state );
1532 TRIE_HANDLE_WORD(accept_state);
1534 } /* end second pass */
1536 /* and now dump it out before we compress it */
1537 DEBUG_TRIE_COMPILE_MORE_r(
1538 dump_trie_interim_table(trie,next_alloc,depth+1)
1543 * Inplace compress the table.*
1545 For sparse data sets the table constructed by the trie algorithm will
1546 be mostly 0/FAIL transitions or to put it another way mostly empty.
1547 (Note that leaf nodes will not contain any transitions.)
1549 This algorithm compresses the tables by eliminating most such
1550 transitions, at the cost of a modest bit of extra work during lookup:
1552 - Each states[] entry contains a .base field which indicates the
1553 index in the state[] array wheres its transition data is stored.
1555 - If .base is 0 there are no valid transitions from that node.
1557 - If .base is nonzero then charid is added to it to find an entry in
1560 -If trans[states[state].base+charid].check!=state then the
1561 transition is taken to be a 0/Fail transition. Thus if there are fail
1562 transitions at the front of the node then the .base offset will point
1563 somewhere inside the previous nodes data (or maybe even into a node
1564 even earlier), but the .check field determines if the transition is
1567 The following process inplace converts the table to the compressed
1568 table: We first do not compress the root node 1,and mark its all its
1569 .check pointers as 1 and set its .base pointer as 1 as well. This
1570 allows to do a DFA construction from the compressed table later, and
1571 ensures that any .base pointers we calculate later are greater than
1574 - We set 'pos' to indicate the first entry of the second node.
1576 - We then iterate over the columns of the node, finding the first and
1577 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1578 and set the .check pointers accordingly, and advance pos
1579 appropriately and repreat for the next node. Note that when we copy
1580 the next pointers we have to convert them from the original
1581 NODEIDX form to NODENUM form as the former is not valid post
1584 - If a node has no transitions used we mark its base as 0 and do not
1585 advance the pos pointer.
1587 - If a node only has one transition we use a second pointer into the
1588 structure to fill in allocated fail transitions from other states.
1589 This pointer is independent of the main pointer and scans forward
1590 looking for null transitions that are allocated to a state. When it
1591 finds one it writes the single transition into the "hole". If the
1592 pointer doesnt find one the single transition is appeneded as normal.
1594 - Once compressed we can Renew/realloc the structures to release the
1597 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1598 specifically Fig 3.47 and the associated pseudocode.
1602 const U32 laststate = TRIE_NODENUM( next_alloc );
1605 TRIE_LASTSTATE(trie) = laststate;
1607 for ( state = 1 ; state < laststate ; state++ ) {
1609 const U32 stateidx = TRIE_NODEIDX( state );
1610 const U32 o_used = trie->trans[ stateidx ].check;
1611 U32 used = trie->trans[ stateidx ].check;
1612 trie->trans[ stateidx ].check = 0;
1614 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1615 if ( flag || trie->trans[ stateidx + charid ].next ) {
1616 if ( trie->trans[ stateidx + charid ].next ) {
1618 for ( ; zp < pos ; zp++ ) {
1619 if ( ! trie->trans[ zp ].next ) {
1623 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1624 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1625 trie->trans[ zp ].check = state;
1626 if ( ++zp > pos ) pos = zp;
1633 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1635 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1636 trie->trans[ pos ].check = state;
1641 trie->lasttrans = pos + 1;
1642 Renew( trie->states, laststate + 1, reg_trie_state);
1643 DEBUG_TRIE_COMPILE_MORE_r(
1644 PerlIO_printf( Perl_debug_log,
1645 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1646 (int)depth * 2 + 2,"",
1647 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1650 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1653 } /* end table compress */
1655 /* resize the trans array to remove unused space */
1656 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1658 /* and now dump out the compressed format */
1659 DEBUG_TRIE_COMPILE_r(
1660 dump_trie(trie,depth+1)
1663 { /* Modify the program and insert the new TRIE node*/
1665 U8 nodetype =(U8)(flags & 0xFF);
1672 This means we convert either the first branch or the first Exact,
1673 depending on whether the thing following (in 'last') is a branch
1674 or not and whther first is the startbranch (ie is it a sub part of
1675 the alternation or is it the whole thing.)
1676 Assuming its a sub part we conver the EXACT otherwise we convert
1677 the whole branch sequence, including the first.
1679 /* Find the node we are going to overwrite */
1680 if ( first == startbranch && OP( last ) != BRANCH ) {
1681 /* whole branch chain */
1684 const regnode *nop = NEXTOPER( convert );
1685 mjd_offset= Node_Offset((nop));
1686 mjd_nodelen= Node_Length((nop));
1689 /* branch sub-chain */
1690 convert = NEXTOPER( first );
1691 NEXT_OFF( first ) = (U16)(last - first);
1693 mjd_offset= Node_Offset((convert));
1694 mjd_nodelen= Node_Length((convert));
1698 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1699 (int)depth * 2 + 2, "",
1700 mjd_offset,mjd_nodelen)
1703 /* But first we check to see if there is a common prefix we can
1704 split out as an EXACT and put in front of the TRIE node. */
1705 trie->startstate= 1;
1706 if ( trie->bitmap && !trie->widecharmap ) {
1709 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1710 (int)depth * 2 + 2, "",
1711 TRIE_LASTSTATE(trie))
1713 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1717 const U32 base = trie->states[ state ].trans.base;
1719 if ( trie->states[state].wordnum )
1722 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1723 if ( ( base + ofs >= trie->uniquecharcount ) &&
1724 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1725 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1727 if ( ++count > 1 ) {
1728 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1729 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1730 if ( state == 1 ) break;
1732 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1734 PerlIO_printf(Perl_debug_log,
1735 "%*sNew Start State=%"UVuf" Class: [",
1736 (int)depth * 2 + 2, "",
1739 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1740 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1742 TRIE_BITMAP_SET(trie,*ch);
1744 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1746 PerlIO_printf(Perl_debug_log, (char*)ch)
1750 TRIE_BITMAP_SET(trie,*ch);
1752 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1753 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1759 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1760 const char *ch = SvPV_nolen_const( *tmp );
1762 PerlIO_printf( Perl_debug_log,
1763 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1764 (int)depth * 2 + 2, "",
1768 OP( convert ) = nodetype;
1769 str=STRING(convert);
1778 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1784 regnode *n = convert+NODE_SZ_STR(convert);
1785 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1786 trie->startstate = state;
1787 trie->minlen -= (state - 1);
1788 trie->maxlen -= (state - 1);
1790 regnode *fix = convert;
1792 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1793 while( ++fix < n ) {
1794 Set_Node_Offset_Length(fix, 0, 0);
1800 NEXT_OFF(convert) = (U16)(tail - convert);
1804 if ( trie->maxlen ) {
1805 OP( convert ) = TRIE;
1806 NEXT_OFF( convert ) = (U16)(tail - convert);
1807 ARG_SET( convert, data_slot );
1809 /* store the type in the flags */
1810 convert->flags = nodetype;
1811 /* XXX We really should free up the resource in trie now, as we wont use them */
1813 /* needed for dumping*/
1815 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1816 regnode *opt = convert;
1817 while (++opt<optimize) {
1818 Set_Node_Offset_Length(opt,0,0);
1820 /* We now need to mark all of the space originally used by the
1821 branches as optimized away. This keeps the dumpuntil from
1822 throwing a wobbly as it doesnt use regnext() to traverse the
1824 We also "fix" the offsets
1826 while( optimize < last ) {
1827 mjd_nodelen += Node_Length((optimize));
1828 OP( optimize ) = OPTIMIZED;
1829 Set_Node_Offset_Length(optimize,0,0);
1832 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1834 } /* end node insert */
1836 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1842 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1843 * These need to be revisited when a newer toolchain becomes available.
1845 #if defined(__sparc64__) && defined(__GNUC__)
1846 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1847 # undef SPARC64_GCC_WORKAROUND
1848 # define SPARC64_GCC_WORKAROUND 1
1852 #define DEBUG_PEEP(str,scan,depth) \
1853 DEBUG_OPTIMISE_r({ \
1854 SV * const mysv=sv_newmortal(); \
1855 regnode *Next = regnext(scan); \
1856 regprop(RExC_rx, mysv, scan); \
1857 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1858 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1859 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1862 #define JOIN_EXACT(scan,min,flags) \
1863 if (PL_regkind[OP(scan)] == EXACT) \
1864 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1867 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1868 /* Merge several consecutive EXACTish nodes into one. */
1869 regnode *n = regnext(scan);
1871 regnode *next = scan + NODE_SZ_STR(scan);
1875 regnode *stop = scan;
1877 PERL_UNUSED_ARG(flags);
1878 PERL_UNUSED_ARG(val);
1879 PERL_UNUSED_ARG(depth);
1881 GET_RE_DEBUG_FLAGS_DECL;
1882 DEBUG_PEEP("join",scan,depth);
1884 /* Skip NOTHING, merge EXACT*. */
1886 ( PL_regkind[OP(n)] == NOTHING ||
1887 (stringok && (OP(n) == OP(scan))))
1889 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1891 if (OP(n) == TAIL || n > next)
1893 if (PL_regkind[OP(n)] == NOTHING) {
1895 DEBUG_PEEP("skip:",n,depth);
1896 NEXT_OFF(scan) += NEXT_OFF(n);
1897 next = n + NODE_STEP_REGNODE;
1904 else if (stringok) {
1905 const int oldl = STR_LEN(scan);
1906 regnode * const nnext = regnext(n);
1908 DEBUG_PEEP("merg",n,depth);
1911 if (oldl + STR_LEN(n) > U8_MAX)
1913 NEXT_OFF(scan) += NEXT_OFF(n);
1914 STR_LEN(scan) += STR_LEN(n);
1915 next = n + NODE_SZ_STR(n);
1916 /* Now we can overwrite *n : */
1917 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1925 #ifdef EXPERIMENTAL_INPLACESCAN
1926 if (flags && !NEXT_OFF(n)) {
1927 DEBUG_PEEP("atch",val,depth);
1928 if (reg_off_by_arg[OP(n)]) {
1929 ARG_SET(n, val - n);
1932 NEXT_OFF(n) = val - n;
1939 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1941 Two problematic code points in Unicode casefolding of EXACT nodes:
1943 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1944 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1950 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1951 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1953 This means that in case-insensitive matching (or "loose matching",
1954 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1955 length of the above casefolded versions) can match a target string
1956 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1957 This would rather mess up the minimum length computation.
1959 What we'll do is to look for the tail four bytes, and then peek
1960 at the preceding two bytes to see whether we need to decrease
1961 the minimum length by four (six minus two).
1963 Thanks to the design of UTF-8, there cannot be false matches:
1964 A sequence of valid UTF-8 bytes cannot be a subsequence of
1965 another valid sequence of UTF-8 bytes.
1968 char * const s0 = STRING(scan), *s, *t;
1969 char * const s1 = s0 + STR_LEN(scan) - 1;
1970 char * const s2 = s1 - 4;
1971 const char t0[] = "\xcc\x88\xcc\x81";
1972 const char * const t1 = t0 + 3;
1975 s < s2 && (t = ninstr(s, s1, t0, t1));
1977 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1978 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1985 n = scan + NODE_SZ_STR(scan);
1987 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1994 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1998 /* REx optimizer. Converts nodes into quickier variants "in place".
1999 Finds fixed substrings. */
2001 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2002 to the position after last scanned or to NULL. */
2007 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2008 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2009 /* scanp: Start here (read-write). */
2010 /* deltap: Write maxlen-minlen here. */
2011 /* last: Stop before this one. */
2014 I32 min = 0, pars = 0, code;
2015 regnode *scan = *scanp, *next;
2017 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2018 int is_inf_internal = 0; /* The studied chunk is infinite */
2019 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2020 scan_data_t data_fake;
2021 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2022 SV *re_trie_maxbuff = NULL;
2024 GET_RE_DEBUG_FLAGS_DECL;
2026 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2029 while (scan && OP(scan) != END && scan < last) {
2030 /* Peephole optimizer: */
2031 DEBUG_PEEP("Peep",scan,depth);
2033 JOIN_EXACT(scan,&min,0);
2035 /* Follow the next-chain of the current node and optimize
2036 away all the NOTHINGs from it. */
2037 if (OP(scan) != CURLYX) {
2038 const int max = (reg_off_by_arg[OP(scan)]
2040 /* I32 may be smaller than U16 on CRAYs! */
2041 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2042 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2046 /* Skip NOTHING and LONGJMP. */
2047 while ((n = regnext(n))
2048 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2049 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2050 && off + noff < max)
2052 if (reg_off_by_arg[OP(scan)])
2055 NEXT_OFF(scan) = off;
2060 /* The principal pseudo-switch. Cannot be a switch, since we
2061 look into several different things. */
2062 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2063 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2064 next = regnext(scan);
2066 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2068 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2069 I32 max1 = 0, min1 = I32_MAX, num = 0;
2070 struct regnode_charclass_class accum;
2071 regnode * const startbranch=scan;
2073 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2074 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2075 if (flags & SCF_DO_STCLASS)
2076 cl_init_zero(pRExC_state, &accum);
2078 while (OP(scan) == code) {
2079 I32 deltanext, minnext, f = 0, fake;
2080 struct regnode_charclass_class this_class;
2083 data_fake.flags = 0;
2085 data_fake.whilem_c = data->whilem_c;
2086 data_fake.last_closep = data->last_closep;
2089 data_fake.last_closep = &fake;
2090 next = regnext(scan);
2091 scan = NEXTOPER(scan);
2093 scan = NEXTOPER(scan);
2094 if (flags & SCF_DO_STCLASS) {
2095 cl_init(pRExC_state, &this_class);
2096 data_fake.start_class = &this_class;
2097 f = SCF_DO_STCLASS_AND;
2099 if (flags & SCF_WHILEM_VISITED_POS)
2100 f |= SCF_WHILEM_VISITED_POS;
2102 /* we suppose the run is continuous, last=next...*/
2103 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2104 next, &data_fake, f,depth+1);
2107 if (max1 < minnext + deltanext)
2108 max1 = minnext + deltanext;
2109 if (deltanext == I32_MAX)
2110 is_inf = is_inf_internal = 1;
2112 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2115 if (data_fake.flags & SF_HAS_EVAL)
2116 data->flags |= SF_HAS_EVAL;
2117 data->whilem_c = data_fake.whilem_c;
2119 if (flags & SCF_DO_STCLASS)
2120 cl_or(pRExC_state, &accum, &this_class);
2121 if (code == SUSPEND)
2124 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2126 if (flags & SCF_DO_SUBSTR) {
2127 data->pos_min += min1;
2128 data->pos_delta += max1 - min1;
2129 if (max1 != min1 || is_inf)
2130 data->longest = &(data->longest_float);
2133 delta += max1 - min1;
2134 if (flags & SCF_DO_STCLASS_OR) {
2135 cl_or(pRExC_state, data->start_class, &accum);
2137 cl_and(data->start_class, &and_with);
2138 flags &= ~SCF_DO_STCLASS;
2141 else if (flags & SCF_DO_STCLASS_AND) {
2143 cl_and(data->start_class, &accum);
2144 flags &= ~SCF_DO_STCLASS;
2147 /* Switch to OR mode: cache the old value of
2148 * data->start_class */
2149 StructCopy(data->start_class, &and_with,
2150 struct regnode_charclass_class);
2151 flags &= ~SCF_DO_STCLASS_AND;
2152 StructCopy(&accum, data->start_class,
2153 struct regnode_charclass_class);
2154 flags |= SCF_DO_STCLASS_OR;
2155 data->start_class->flags |= ANYOF_EOS;
2161 Assuming this was/is a branch we are dealing with: 'scan' now
2162 points at the item that follows the branch sequence, whatever
2163 it is. We now start at the beginning of the sequence and look
2169 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2171 If we can find such a subseqence we need to turn the first
2172 element into a trie and then add the subsequent branch exact
2173 strings to the trie.
2177 1. patterns where the whole set of branch can be converted to a trie,
2179 2. patterns where only a subset of the alternations can be
2180 converted to a trie.
2182 In case 1 we can replace the whole set with a single regop
2183 for the trie. In case 2 we need to keep the start and end
2186 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2187 becomes BRANCH TRIE; BRANCH X;
2189 Hypthetically when we know the regex isnt anchored we can
2190 turn a case 1 into a DFA and let it rip... Every time it finds a match
2191 it would just call its tail, no WHILEM/CURLY needed.
2194 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2196 if (!re_trie_maxbuff) {
2197 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2198 if (!SvIOK(re_trie_maxbuff))
2199 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2201 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2203 regnode *first = (regnode *)NULL;
2204 regnode *last = (regnode *)NULL;
2205 regnode *tail = scan;
2210 SV * const mysv = sv_newmortal(); /* for dumping */
2212 /* var tail is used because there may be a TAIL
2213 regop in the way. Ie, the exacts will point to the
2214 thing following the TAIL, but the last branch will
2215 point at the TAIL. So we advance tail. If we
2216 have nested (?:) we may have to move through several
2220 while ( OP( tail ) == TAIL ) {
2221 /* this is the TAIL generated by (?:) */
2222 tail = regnext( tail );
2227 regprop(RExC_rx, mysv, tail );
2228 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2229 (int)depth * 2 + 2, "",
2230 "Looking for TRIE'able sequences. Tail node is: ",
2231 SvPV_nolen_const( mysv )
2237 step through the branches, cur represents each
2238 branch, noper is the first thing to be matched
2239 as part of that branch and noper_next is the
2240 regnext() of that node. if noper is an EXACT
2241 and noper_next is the same as scan (our current
2242 position in the regex) then the EXACT branch is
2243 a possible optimization target. Once we have
2244 two or more consequetive such branches we can
2245 create a trie of the EXACT's contents and stich
2246 it in place. If the sequence represents all of
2247 the branches we eliminate the whole thing and
2248 replace it with a single TRIE. If it is a
2249 subsequence then we need to stitch it in. This
2250 means the first branch has to remain, and needs
2251 to be repointed at the item on the branch chain
2252 following the last branch optimized. This could
2253 be either a BRANCH, in which case the
2254 subsequence is internal, or it could be the
2255 item following the branch sequence in which
2256 case the subsequence is at the end.
2260 /* dont use tail as the end marker for this traverse */
2261 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2262 regnode * const noper = NEXTOPER( cur );
2263 regnode * const noper_next = regnext( noper );
2266 regprop(RExC_rx, mysv, cur);
2267 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2268 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2270 regprop(RExC_rx, mysv, noper);
2271 PerlIO_printf( Perl_debug_log, " -> %s",
2272 SvPV_nolen_const(mysv));
2275 regprop(RExC_rx, mysv, noper_next );
2276 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2277 SvPV_nolen_const(mysv));
2279 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2280 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2282 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2283 : PL_regkind[ OP( noper ) ] == EXACT )
2284 || OP(noper) == NOTHING )
2285 && noper_next == tail && count<U16_MAX)
2288 if ( !first || optype == NOTHING ) {
2289 if (!first) first = cur;
2290 optype = OP( noper );
2296 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2298 if ( PL_regkind[ OP( noper ) ] == EXACT
2299 && noper_next == tail )
2303 optype = OP( noper );
2313 regprop(RExC_rx, mysv, cur);
2314 PerlIO_printf( Perl_debug_log,
2315 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2316 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2320 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2321 #ifdef TRIE_STUDY_OPT
2322 if ( made && startbranch == first ) {
2323 if ( OP(first)!=TRIE )
2324 flags |= SCF_EXACT_TRIE;
2326 regnode *chk=*scanp;
2327 while ( OP( chk ) == OPEN )
2328 chk = regnext( chk );
2330 flags |= SCF_EXACT_TRIE;
2339 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2340 scan = NEXTOPER(NEXTOPER(scan));
2341 } else /* single branch is optimized. */
2342 scan = NEXTOPER(scan);
2345 else if (OP(scan) == EXACT) {
2346 I32 l = STR_LEN(scan);
2349 const U8 * const s = (U8*)STRING(scan);
2350 l = utf8_length(s, s + l);
2351 uc = utf8_to_uvchr(s, NULL);
2353 uc = *((U8*)STRING(scan));
2356 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2357 /* The code below prefers earlier match for fixed
2358 offset, later match for variable offset. */
2359 if (data->last_end == -1) { /* Update the start info. */
2360 data->last_start_min = data->pos_min;
2361 data->last_start_max = is_inf
2362 ? I32_MAX : data->pos_min + data->pos_delta;
2364 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2366 SvUTF8_on(data->last_found);
2368 SV * const sv = data->last_found;
2369 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2370 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2371 if (mg && mg->mg_len >= 0)
2372 mg->mg_len += utf8_length((U8*)STRING(scan),
2373 (U8*)STRING(scan)+STR_LEN(scan));
2375 data->last_end = data->pos_min + l;
2376 data->pos_min += l; /* As in the first entry. */
2377 data->flags &= ~SF_BEFORE_EOL;
2379 if (flags & SCF_DO_STCLASS_AND) {
2380 /* Check whether it is compatible with what we know already! */
2384 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2385 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2386 && (!(data->start_class->flags & ANYOF_FOLD)
2387 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2390 ANYOF_CLASS_ZERO(data->start_class);
2391 ANYOF_BITMAP_ZERO(data->start_class);
2393 ANYOF_BITMAP_SET(data->start_class, uc);
2394 data->start_class->flags &= ~ANYOF_EOS;
2396 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2398 else if (flags & SCF_DO_STCLASS_OR) {
2399 /* false positive possible if the class is case-folded */
2401 ANYOF_BITMAP_SET(data->start_class, uc);
2403 data->start_class->flags |= ANYOF_UNICODE_ALL;
2404 data->start_class->flags &= ~ANYOF_EOS;
2405 cl_and(data->start_class, &and_with);
2407 flags &= ~SCF_DO_STCLASS;
2409 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2410 I32 l = STR_LEN(scan);
2411 UV uc = *((U8*)STRING(scan));
2413 /* Search for fixed substrings supports EXACT only. */
2414 if (flags & SCF_DO_SUBSTR) {
2416 scan_commit(pRExC_state, data);
2419 const U8 * const s = (U8 *)STRING(scan);
2420 l = utf8_length(s, s + l);
2421 uc = utf8_to_uvchr(s, NULL);
2424 if (flags & SCF_DO_SUBSTR)
2426 if (flags & SCF_DO_STCLASS_AND) {
2427 /* Check whether it is compatible with what we know already! */
2431 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2432 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2433 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2435 ANYOF_CLASS_ZERO(data->start_class);
2436 ANYOF_BITMAP_ZERO(data->start_class);
2438 ANYOF_BITMAP_SET(data->start_class, uc);
2439 data->start_class->flags &= ~ANYOF_EOS;
2440 data->start_class->flags |= ANYOF_FOLD;
2441 if (OP(scan) == EXACTFL)
2442 data->start_class->flags |= ANYOF_LOCALE;
2445 else if (flags & SCF_DO_STCLASS_OR) {
2446 if (data->start_class->flags & ANYOF_FOLD) {
2447 /* false positive possible if the class is case-folded.
2448 Assume that the locale settings are the same... */
2450 ANYOF_BITMAP_SET(data->start_class, uc);
2451 data->start_class->flags &= ~ANYOF_EOS;
2453 cl_and(data->start_class, &and_with);
2455 flags &= ~SCF_DO_STCLASS;
2457 #ifdef TRIE_STUDY_OPT
2458 else if (OP(scan) == TRIE) {
2459 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2460 min += trie->minlen;
2461 delta += (trie->maxlen - trie->minlen);
2462 flags &= ~SCF_DO_STCLASS; /* xxx */
2463 if (flags & SCF_DO_SUBSTR) {
2464 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2465 data->pos_min += trie->minlen;
2466 data->pos_delta += (trie->maxlen - trie->minlen);
2467 if (trie->maxlen != trie->minlen)
2468 data->longest = &(data->longest_float);
2472 else if (strchr((const char*)PL_varies,OP(scan))) {
2473 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2474 I32 f = flags, pos_before = 0;
2475 regnode * const oscan = scan;
2476 struct regnode_charclass_class this_class;
2477 struct regnode_charclass_class *oclass = NULL;
2478 I32 next_is_eval = 0;
2480 switch (PL_regkind[OP(scan)]) {
2481 case WHILEM: /* End of (?:...)* . */
2482 scan = NEXTOPER(scan);
2485 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2486 next = NEXTOPER(scan);
2487 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2489 maxcount = REG_INFTY;
2490 next = regnext(scan);
2491 scan = NEXTOPER(scan);
2495 if (flags & SCF_DO_SUBSTR)
2500 if (flags & SCF_DO_STCLASS) {
2502 maxcount = REG_INFTY;
2503 next = regnext(scan);
2504 scan = NEXTOPER(scan);
2507 is_inf = is_inf_internal = 1;
2508 scan = regnext(scan);
2509 if (flags & SCF_DO_SUBSTR) {
2510 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2511 data->longest = &(data->longest_float);
2513 goto optimize_curly_tail;
2515 mincount = ARG1(scan);
2516 maxcount = ARG2(scan);
2517 next = regnext(scan);
2518 if (OP(scan) == CURLYX) {
2519 I32 lp = (data ? *(data->last_closep) : 0);
2520 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2522 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2523 next_is_eval = (OP(scan) == EVAL);
2525 if (flags & SCF_DO_SUBSTR) {
2526 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2527 pos_before = data->pos_min;
2531 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2533 data->flags |= SF_IS_INF;
2535 if (flags & SCF_DO_STCLASS) {
2536 cl_init(pRExC_state, &this_class);
2537 oclass = data->start_class;
2538 data->start_class = &this_class;
2539 f |= SCF_DO_STCLASS_AND;
2540 f &= ~SCF_DO_STCLASS_OR;
2542 /* These are the cases when once a subexpression
2543 fails at a particular position, it cannot succeed
2544 even after backtracking at the enclosing scope.
2546 XXXX what if minimal match and we are at the
2547 initial run of {n,m}? */
2548 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2549 f &= ~SCF_WHILEM_VISITED_POS;
2551 /* This will finish on WHILEM, setting scan, or on NULL: */
2552 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2554 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2556 if (flags & SCF_DO_STCLASS)
2557 data->start_class = oclass;
2558 if (mincount == 0 || minnext == 0) {
2559 if (flags & SCF_DO_STCLASS_OR) {
2560 cl_or(pRExC_state, data->start_class, &this_class);
2562 else if (flags & SCF_DO_STCLASS_AND) {
2563 /* Switch to OR mode: cache the old value of
2564 * data->start_class */
2565 StructCopy(data->start_class, &and_with,
2566 struct regnode_charclass_class);
2567 flags &= ~SCF_DO_STCLASS_AND;
2568 StructCopy(&this_class, data->start_class,
2569 struct regnode_charclass_class);
2570 flags |= SCF_DO_STCLASS_OR;
2571 data->start_class->flags |= ANYOF_EOS;
2573 } else { /* Non-zero len */
2574 if (flags & SCF_DO_STCLASS_OR) {
2575 cl_or(pRExC_state, data->start_class, &this_class);
2576 cl_and(data->start_class, &and_with);
2578 else if (flags & SCF_DO_STCLASS_AND)
2579 cl_and(data->start_class, &this_class);
2580 flags &= ~SCF_DO_STCLASS;
2582 if (!scan) /* It was not CURLYX, but CURLY. */
2584 if ( /* ? quantifier ok, except for (?{ ... }) */
2585 (next_is_eval || !(mincount == 0 && maxcount == 1))
2586 && (minnext == 0) && (deltanext == 0)
2587 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2588 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2589 && ckWARN(WARN_REGEXP))
2592 "Quantifier unexpected on zero-length expression");
2595 min += minnext * mincount;
2596 is_inf_internal |= ((maxcount == REG_INFTY
2597 && (minnext + deltanext) > 0)
2598 || deltanext == I32_MAX);
2599 is_inf |= is_inf_internal;
2600 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2602 /* Try powerful optimization CURLYX => CURLYN. */
2603 if ( OP(oscan) == CURLYX && data
2604 && data->flags & SF_IN_PAR
2605 && !(data->flags & SF_HAS_EVAL)
2606 && !deltanext && minnext == 1 ) {
2607 /* Try to optimize to CURLYN. */
2608 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2609 regnode * const nxt1 = nxt;
2616 if (!strchr((const char*)PL_simple,OP(nxt))
2617 && !(PL_regkind[OP(nxt)] == EXACT
2618 && STR_LEN(nxt) == 1))
2624 if (OP(nxt) != CLOSE)
2626 /* Now we know that nxt2 is the only contents: */
2627 oscan->flags = (U8)ARG(nxt);
2629 OP(nxt1) = NOTHING; /* was OPEN. */
2631 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2632 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2633 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2634 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2635 OP(nxt + 1) = OPTIMIZED; /* was count. */
2636 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2641 /* Try optimization CURLYX => CURLYM. */
2642 if ( OP(oscan) == CURLYX && data
2643 && !(data->flags & SF_HAS_PAR)
2644 && !(data->flags & SF_HAS_EVAL)
2645 && !deltanext /* atom is fixed width */
2646 && minnext != 0 /* CURLYM can't handle zero width */
2648 /* XXXX How to optimize if data == 0? */
2649 /* Optimize to a simpler form. */
2650 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2654 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2655 && (OP(nxt2) != WHILEM))
2657 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2658 /* Need to optimize away parenths. */
2659 if (data->flags & SF_IN_PAR) {
2660 /* Set the parenth number. */
2661 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2663 if (OP(nxt) != CLOSE)
2664 FAIL("Panic opt close");
2665 oscan->flags = (U8)ARG(nxt);
2666 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2667 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2669 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2670 OP(nxt + 1) = OPTIMIZED; /* was count. */
2671 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2672 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2675 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2676 regnode *nnxt = regnext(nxt1);
2679 if (reg_off_by_arg[OP(nxt1)])
2680 ARG_SET(nxt1, nxt2 - nxt1);
2681 else if (nxt2 - nxt1 < U16_MAX)
2682 NEXT_OFF(nxt1) = nxt2 - nxt1;
2684 OP(nxt) = NOTHING; /* Cannot beautify */
2689 /* Optimize again: */
2690 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2696 else if ((OP(oscan) == CURLYX)
2697 && (flags & SCF_WHILEM_VISITED_POS)
2698 /* See the comment on a similar expression above.
2699 However, this time it not a subexpression
2700 we care about, but the expression itself. */
2701 && (maxcount == REG_INFTY)
2702 && data && ++data->whilem_c < 16) {
2703 /* This stays as CURLYX, we can put the count/of pair. */
2704 /* Find WHILEM (as in regexec.c) */
2705 regnode *nxt = oscan + NEXT_OFF(oscan);
2707 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2709 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2710 | (RExC_whilem_seen << 4)); /* On WHILEM */
2712 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2714 if (flags & SCF_DO_SUBSTR) {
2715 SV *last_str = NULL;
2716 int counted = mincount != 0;
2718 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2719 #if defined(SPARC64_GCC_WORKAROUND)
2722 const char *s = NULL;
2725 if (pos_before >= data->last_start_min)
2728 b = data->last_start_min;
2731 s = SvPV_const(data->last_found, l);
2732 old = b - data->last_start_min;
2735 I32 b = pos_before >= data->last_start_min
2736 ? pos_before : data->last_start_min;
2738 const char * const s = SvPV_const(data->last_found, l);
2739 I32 old = b - data->last_start_min;
2743 old = utf8_hop((U8*)s, old) - (U8*)s;
2746 /* Get the added string: */
2747 last_str = newSVpvn(s + old, l);
2749 SvUTF8_on(last_str);
2750 if (deltanext == 0 && pos_before == b) {
2751 /* What was added is a constant string */
2753 SvGROW(last_str, (mincount * l) + 1);
2754 repeatcpy(SvPVX(last_str) + l,
2755 SvPVX_const(last_str), l, mincount - 1);
2756 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2757 /* Add additional parts. */
2758 SvCUR_set(data->last_found,
2759 SvCUR(data->last_found) - l);
2760 sv_catsv(data->last_found, last_str);
2762 SV * sv = data->last_found;
2764 SvUTF8(sv) && SvMAGICAL(sv) ?
2765 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2766 if (mg && mg->mg_len >= 0)
2767 mg->mg_len += CHR_SVLEN(last_str);
2769 data->last_end += l * (mincount - 1);
2772 /* start offset must point into the last copy */
2773 data->last_start_min += minnext * (mincount - 1);
2774 data->last_start_max += is_inf ? I32_MAX
2775 : (maxcount - 1) * (minnext + data->pos_delta);
2778 /* It is counted once already... */
2779 data->pos_min += minnext * (mincount - counted);
2780 data->pos_delta += - counted * deltanext +
2781 (minnext + deltanext) * maxcount - minnext * mincount;
2782 if (mincount != maxcount) {
2783 /* Cannot extend fixed substrings found inside
2785 scan_commit(pRExC_state,data);
2786 if (mincount && last_str) {
2787 SV * const sv = data->last_found;
2788 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2789 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2793 sv_setsv(sv, last_str);
2794 data->last_end = data->pos_min;
2795 data->last_start_min =
2796 data->pos_min - CHR_SVLEN(last_str);
2797 data->last_start_max = is_inf
2799 : data->pos_min + data->pos_delta
2800 - CHR_SVLEN(last_str);
2802 data->longest = &(data->longest_float);
2804 SvREFCNT_dec(last_str);
2806 if (data && (fl & SF_HAS_EVAL))
2807 data->flags |= SF_HAS_EVAL;
2808 optimize_curly_tail:
2809 if (OP(oscan) != CURLYX) {
2810 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2812 NEXT_OFF(oscan) += NEXT_OFF(next);
2815 default: /* REF and CLUMP only? */
2816 if (flags & SCF_DO_SUBSTR) {
2817 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2818 data->longest = &(data->longest_float);
2820 is_inf = is_inf_internal = 1;
2821 if (flags & SCF_DO_STCLASS_OR)
2822 cl_anything(pRExC_state, data->start_class);
2823 flags &= ~SCF_DO_STCLASS;
2827 else if (strchr((const char*)PL_simple,OP(scan))) {
2830 if (flags & SCF_DO_SUBSTR) {
2831 scan_commit(pRExC_state,data);
2835 if (flags & SCF_DO_STCLASS) {
2836 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2838 /* Some of the logic below assumes that switching
2839 locale on will only add false positives. */
2840 switch (PL_regkind[OP(scan)]) {
2844 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2845 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2846 cl_anything(pRExC_state, data->start_class);
2849 if (OP(scan) == SANY)
2851 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2852 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2853 || (data->start_class->flags & ANYOF_CLASS));
2854 cl_anything(pRExC_state, data->start_class);
2856 if (flags & SCF_DO_STCLASS_AND || !value)
2857 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2860 if (flags & SCF_DO_STCLASS_AND)
2861 cl_and(data->start_class,
2862 (struct regnode_charclass_class*)scan);
2864 cl_or(pRExC_state, data->start_class,
2865 (struct regnode_charclass_class*)scan);
2868 if (flags & SCF_DO_STCLASS_AND) {
2869 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2870 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2871 for (value = 0; value < 256; value++)
2872 if (!isALNUM(value))
2873 ANYOF_BITMAP_CLEAR(data->start_class, value);
2877 if (data->start_class->flags & ANYOF_LOCALE)
2878 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2880 for (value = 0; value < 256; value++)
2882 ANYOF_BITMAP_SET(data->start_class, value);
2887 if (flags & SCF_DO_STCLASS_AND) {
2888 if (data->start_class->flags & ANYOF_LOCALE)
2889 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2892 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2893 data->start_class->flags |= ANYOF_LOCALE;
2897 if (flags & SCF_DO_STCLASS_AND) {
2898 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2899 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2900 for (value = 0; value < 256; value++)
2902 ANYOF_BITMAP_CLEAR(data->start_class, value);
2906 if (data->start_class->flags & ANYOF_LOCALE)
2907 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2909 for (value = 0; value < 256; value++)
2910 if (!isALNUM(value))
2911 ANYOF_BITMAP_SET(data->start_class, value);
2916 if (flags & SCF_DO_STCLASS_AND) {
2917 if (data->start_class->flags & ANYOF_LOCALE)
2918 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2921 data->start_class->flags |= ANYOF_LOCALE;
2922 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2926 if (flags & SCF_DO_STCLASS_AND) {
2927 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2928 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2929 for (value = 0; value < 256; value++)
2930 if (!isSPACE(value))
2931 ANYOF_BITMAP_CLEAR(data->start_class, value);
2935 if (data->start_class->flags & ANYOF_LOCALE)
2936 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2938 for (value = 0; value < 256; value++)
2940 ANYOF_BITMAP_SET(data->start_class, value);
2945 if (flags & SCF_DO_STCLASS_AND) {
2946 if (data->start_class->flags & ANYOF_LOCALE)
2947 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2950 data->start_class->flags |= ANYOF_LOCALE;
2951 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2955 if (flags & SCF_DO_STCLASS_AND) {
2956 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2957 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2958 for (value = 0; value < 256; value++)
2960 ANYOF_BITMAP_CLEAR(data->start_class, value);
2964 if (data->start_class->flags & ANYOF_LOCALE)
2965 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2967 for (value = 0; value < 256; value++)
2968 if (!isSPACE(value))
2969 ANYOF_BITMAP_SET(data->start_class, value);
2974 if (flags & SCF_DO_STCLASS_AND) {
2975 if (data->start_class->flags & ANYOF_LOCALE) {
2976 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2977 for (value = 0; value < 256; value++)
2978 if (!isSPACE(value))
2979 ANYOF_BITMAP_CLEAR(data->start_class, value);
2983 data->start_class->flags |= ANYOF_LOCALE;
2984 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2988 if (flags & SCF_DO_STCLASS_AND) {
2989 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2990 for (value = 0; value < 256; value++)
2991 if (!isDIGIT(value))
2992 ANYOF_BITMAP_CLEAR(data->start_class, value);
2995 if (data->start_class->flags & ANYOF_LOCALE)
2996 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2998 for (value = 0; value < 256; value++)
3000 ANYOF_BITMAP_SET(data->start_class, value);
3005 if (flags & SCF_DO_STCLASS_AND) {
3006 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3007 for (value = 0; value < 256; value++)
3009 ANYOF_BITMAP_CLEAR(data->start_class, value);
3012 if (data->start_class->flags & ANYOF_LOCALE)
3013 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3015 for (value = 0; value < 256; value++)
3016 if (!isDIGIT(value))
3017 ANYOF_BITMAP_SET(data->start_class, value);
3022 if (flags & SCF_DO_STCLASS_OR)
3023 cl_and(data->start_class, &and_with);
3024 flags &= ~SCF_DO_STCLASS;
3027 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3028 data->flags |= (OP(scan) == MEOL
3032 else if ( PL_regkind[OP(scan)] == BRANCHJ
3033 /* Lookbehind, or need to calculate parens/evals/stclass: */
3034 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3035 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3036 /* Lookahead/lookbehind */
3037 I32 deltanext, minnext, fake = 0;
3039 struct regnode_charclass_class intrnl;
3042 data_fake.flags = 0;
3044 data_fake.whilem_c = data->whilem_c;
3045 data_fake.last_closep = data->last_closep;
3048 data_fake.last_closep = &fake;
3049 if ( flags & SCF_DO_STCLASS && !scan->flags
3050 && OP(scan) == IFMATCH ) { /* Lookahead */
3051 cl_init(pRExC_state, &intrnl);
3052 data_fake.start_class = &intrnl;
3053 f |= SCF_DO_STCLASS_AND;
3055 if (flags & SCF_WHILEM_VISITED_POS)
3056 f |= SCF_WHILEM_VISITED_POS;
3057 next = regnext(scan);
3058 nscan = NEXTOPER(NEXTOPER(scan));
3059 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3062 vFAIL("Variable length lookbehind not implemented");
3064 else if (minnext > U8_MAX) {
3065 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3067 scan->flags = (U8)minnext;
3070 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3072 if (data_fake.flags & SF_HAS_EVAL)
3073 data->flags |= SF_HAS_EVAL;
3074 data->whilem_c = data_fake.whilem_c;
3076 if (f & SCF_DO_STCLASS_AND) {
3077 const int was = (data->start_class->flags & ANYOF_EOS);
3079 cl_and(data->start_class, &intrnl);
3081 data->start_class->flags |= ANYOF_EOS;
3084 else if (OP(scan) == OPEN) {
3087 else if (OP(scan) == CLOSE) {
3088 if ((I32)ARG(scan) == is_par) {
3089 next = regnext(scan);
3091 if ( next && (OP(next) != WHILEM) && next < last)
3092 is_par = 0; /* Disable optimization */
3095 *(data->last_closep) = ARG(scan);
3097 else if (OP(scan) == EVAL) {
3099 data->flags |= SF_HAS_EVAL;
3101 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3102 if (flags & SCF_DO_SUBSTR) {
3103 scan_commit(pRExC_state,data);
3104 data->longest = &(data->longest_float);
3106 is_inf = is_inf_internal = 1;
3107 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3108 cl_anything(pRExC_state, data->start_class);
3109 flags &= ~SCF_DO_STCLASS;
3111 /* Else: zero-length, ignore. */
3112 scan = regnext(scan);
3117 *deltap = is_inf_internal ? I32_MAX : delta;
3118 if (flags & SCF_DO_SUBSTR && is_inf)
3119 data->pos_delta = I32_MAX - data->pos_min;
3120 if (is_par > U8_MAX)
3122 if (is_par && pars==1 && data) {
3123 data->flags |= SF_IN_PAR;
3124 data->flags &= ~SF_HAS_PAR;
3126 else if (pars && data) {
3127 data->flags |= SF_HAS_PAR;
3128 data->flags &= ~SF_IN_PAR;
3130 if (flags & SCF_DO_STCLASS_OR)
3131 cl_and(data->start_class, &and_with);
3132 if (flags & SCF_EXACT_TRIE)
3133 data->flags |= SCF_EXACT_TRIE;
3138 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3140 if (RExC_rx->data) {
3141 Renewc(RExC_rx->data,
3142 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3143 char, struct reg_data);
3144 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3145 RExC_rx->data->count += n;
3148 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3149 char, struct reg_data);
3150 Newx(RExC_rx->data->what, n, U8);
3151 RExC_rx->data->count = n;
3153 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3154 return RExC_rx->data->count - n;
3157 #ifndef PERL_IN_XSUB_RE
3159 Perl_reginitcolors(pTHX)
3162 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3164 char *t = savepv(s);
3168 t = strchr(t, '\t');
3174 PL_colors[i] = t = (char *)"";
3179 PL_colors[i++] = (char *)"";
3187 - pregcomp - compile a regular expression into internal code
3189 * We can't allocate space until we know how big the compiled form will be,
3190 * but we can't compile it (and thus know how big it is) until we've got a
3191 * place to put the code. So we cheat: we compile it twice, once with code
3192 * generation turned off and size counting turned on, and once "for real".
3193 * This also means that we don't allocate space until we are sure that the
3194 * thing really will compile successfully, and we never have to move the
3195 * code and thus invalidate pointers into it. (Note that it has to be in
3196 * one piece because free() must be able to free it all.) [NB: not true in perl]
3198 * Beware that the optimization-preparation code in here knows about some
3199 * of the structure of the compiled regexp. [I'll say.]
3202 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3213 RExC_state_t RExC_state;
3214 RExC_state_t * const pRExC_state = &RExC_state;
3215 #ifdef TRIE_STUDY_OPT
3217 RExC_state_t copyRExC_state;
3220 GET_RE_DEBUG_FLAGS_DECL;
3223 FAIL("NULL regexp argument");
3225 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3228 DEBUG_r(if (!PL_colorset) reginitcolors());
3230 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3231 PL_colors[4],PL_colors[5],PL_colors[0],
3232 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3234 RExC_flags = pm->op_pmflags;
3238 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3239 RExC_seen_evals = 0;
3242 /* First pass: determine size, legality. */
3249 RExC_emit = &PL_regdummy;
3250 RExC_whilem_seen = 0;
3251 #if 0 /* REGC() is (currently) a NOP at the first pass.
3252 * Clever compilers notice this and complain. --jhi */
3253 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3255 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3256 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3257 RExC_precomp = NULL;
3260 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3261 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3262 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3265 RExC_lastparse=NULL;
3269 /* Small enough for pointer-storage convention?
3270 If extralen==0, this means that we will not need long jumps. */
3271 if (RExC_size >= 0x10000L && RExC_extralen)
3272 RExC_size += RExC_extralen;
3275 if (RExC_whilem_seen > 15)
3276 RExC_whilem_seen = 15;
3278 /* Allocate space and initialize. */
3279 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3282 FAIL("Regexp out of space");
3285 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3286 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3289 r->prelen = xend - exp;
3290 r->precomp = savepvn(RExC_precomp, r->prelen);
3292 #ifdef PERL_OLD_COPY_ON_WRITE
3293 r->saved_copy = NULL;
3295 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3296 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3297 r->lastparen = 0; /* mg.c reads this. */
3299 r->substrs = 0; /* Useful during FAIL. */
3300 r->startp = 0; /* Useful during FAIL. */
3301 r->endp = 0; /* Useful during FAIL. */
3303 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3305 r->offsets[0] = RExC_size;
3307 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3308 "%s %"UVuf" bytes for offset annotations.\n",
3309 r->offsets ? "Got" : "Couldn't get",
3310 (UV)((2*RExC_size+1) * sizeof(U32))));
3314 /* Second pass: emit code. */
3315 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3320 RExC_emit_start = r->program;
3321 RExC_emit = r->program;
3322 /* Store the count of eval-groups for security checks: */
3323 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3324 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3326 if (reg(pRExC_state, 0, &flags,1) == NULL)
3328 /* XXXX To minimize changes to RE engine we always allocate
3329 3-units-long substrs field. */
3330 Newx(r->substrs, 1, struct reg_substr_data);
3333 Zero(r->substrs, 1, struct reg_substr_data);
3334 StructCopy(&zero_scan_data, &data, scan_data_t);
3336 #ifdef TRIE_STUDY_OPT
3338 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3339 RExC_state=copyRExC_state;
3340 if (data.longest_fixed)
3341 SvREFCNT_dec(data.longest_fixed);
3342 if (data.longest_float)
3343 SvREFCNT_dec(data.longest_float);
3344 if (data.last_found)
3345 SvREFCNT_dec(data.last_found);
3347 copyRExC_state=RExC_state;
3350 /* Dig out information for optimizations. */
3351 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3352 pm->op_pmflags = RExC_flags;
3354 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3355 r->regstclass = NULL;
3356 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3357 r->reganch |= ROPT_NAUGHTY;
3358 scan = r->program + 1; /* First BRANCH. */
3360 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3361 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3363 STRLEN longest_float_length, longest_fixed_length;
3364 struct regnode_charclass_class ch_class; /* pointed to by data */
3366 I32 last_close = 0; /* pointed to by data */
3369 /* Skip introductions and multiplicators >= 1. */
3370 while ((OP(first) == OPEN && (sawopen = 1)) ||
3371 /* An OR of *one* alternative - should not happen now. */
3372 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3373 /* for now we can't handle lookbehind IFMATCH*/
3374 (OP(first) == IFMATCH && !first->flags) ||
3375 (OP(first) == PLUS) ||
3376 (OP(first) == MINMOD) ||
3377 /* An {n,m} with n>0 */
3378 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3380 DEBUG_PEEP("first:",first,0);
3381 if (OP(first) == PLUS)
3384 first += regarglen[OP(first)];
3385 if (OP(first) == IFMATCH) {
3386 first = NEXTOPER(first);
3387 first += EXTRA_STEP_2ARGS;
3388 } else /*xxx possible optimisation for /(?=)/*/
3389 first = NEXTOPER(first);
3392 /* Starting-point info. */
3394 /* Ignore EXACT as we deal with it later. */
3395 if (PL_regkind[OP(first)] == EXACT) {
3396 if (OP(first) == EXACT)
3397 NOOP; /* Empty, get anchored substr later. */
3398 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3399 r->regstclass = first;
3402 else if (OP(first) == TRIE &&
3403 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3405 /* this can happen only on restudy */
3406 struct regnode_1 *trie_op;
3407 Newxz(trie_op,1,struct regnode_1);
3408 StructCopy(first,trie_op,struct regnode_1);
3409 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3410 r->regstclass = (regnode *)trie_op;
3413 else if (strchr((const char*)PL_simple,OP(first)))
3414 r->regstclass = first;
3415 else if (PL_regkind[OP(first)] == BOUND ||
3416 PL_regkind[OP(first)] == NBOUND)
3417 r->regstclass = first;
3418 else if (PL_regkind[OP(first)] == BOL) {
3419 r->reganch |= (OP(first) == MBOL
3421 : (OP(first) == SBOL
3424 first = NEXTOPER(first);
3427 else if (OP(first) == GPOS) {
3428 r->reganch |= ROPT_ANCH_GPOS;
3429 first = NEXTOPER(first);
3432 else if (!sawopen && (OP(first) == STAR &&
3433 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3434 !(r->reganch & ROPT_ANCH) )
3436 /* turn .* into ^.* with an implied $*=1 */
3438 (OP(NEXTOPER(first)) == REG_ANY)
3441 r->reganch |= type | ROPT_IMPLICIT;
3442 first = NEXTOPER(first);
3445 if (sawplus && (!sawopen || !RExC_sawback)
3446 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3447 /* x+ must match at the 1st pos of run of x's */
3448 r->reganch |= ROPT_SKIP;
3450 /* Scan is after the zeroth branch, first is atomic matcher. */
3451 #ifdef TRIE_STUDY_OPT
3454 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3455 (IV)(first - scan + 1))
3459 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3460 (IV)(first - scan + 1))
3466 * If there's something expensive in the r.e., find the
3467 * longest literal string that must appear and make it the
3468 * regmust. Resolve ties in favor of later strings, since
3469 * the regstart check works with the beginning of the r.e.
3470 * and avoiding duplication strengthens checking. Not a
3471 * strong reason, but sufficient in the absence of others.
3472 * [Now we resolve ties in favor of the earlier string if
3473 * it happens that c_offset_min has been invalidated, since the
3474 * earlier string may buy us something the later one won't.]
3478 data.longest_fixed = newSVpvs("");
3479 data.longest_float = newSVpvs("");
3480 data.last_found = newSVpvs("");
3481 data.longest = &(data.longest_fixed);
3483 if (!r->regstclass) {
3484 cl_init(pRExC_state, &ch_class);
3485 data.start_class = &ch_class;
3486 stclass_flag = SCF_DO_STCLASS_AND;
3487 } else /* XXXX Check for BOUND? */
3489 data.last_closep = &last_close;
3491 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3492 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3494 #ifdef TRIE_STUDY_OPT
3495 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3500 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3501 && data.last_start_min == 0 && data.last_end > 0
3502 && !RExC_seen_zerolen
3503 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3504 r->reganch |= ROPT_CHECK_ALL;
3505 scan_commit(pRExC_state, &data);
3506 SvREFCNT_dec(data.last_found);
3508 longest_float_length = CHR_SVLEN(data.longest_float);
3509 if (longest_float_length
3510 || (data.flags & SF_FL_BEFORE_EOL
3511 && (!(data.flags & SF_FL_BEFORE_MEOL)
3512 || (RExC_flags & PMf_MULTILINE)))) {
3515 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3516 && data.offset_fixed == data.offset_float_min
3517 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3518 goto remove_float; /* As in (a)+. */
3520 if (SvUTF8(data.longest_float)) {
3521 r->float_utf8 = data.longest_float;
3522 r->float_substr = NULL;
3524 r->float_substr = data.longest_float;
3525 r->float_utf8 = NULL;
3527 r->float_min_offset = data.offset_float_min;
3528 r->float_max_offset = data.offset_float_max;
3529 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3530 && (!(data.flags & SF_FL_BEFORE_MEOL)
3531 || (RExC_flags & PMf_MULTILINE)));
3532 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3536 r->float_substr = r->float_utf8 = NULL;
3537 SvREFCNT_dec(data.longest_float);
3538 longest_float_length = 0;
3541 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3542 if (longest_fixed_length
3543 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3544 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3545 || (RExC_flags & PMf_MULTILINE)))) {
3548 if (SvUTF8(data.longest_fixed)) {
3549 r->anchored_utf8 = data.longest_fixed;
3550 r->anchored_substr = NULL;
3552 r->anchored_substr = data.longest_fixed;
3553 r->anchored_utf8 = NULL;
3555 r->anchored_offset = data.offset_fixed;
3556 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3557 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3558 || (RExC_flags & PMf_MULTILINE)));
3559 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3562 r->anchored_substr = r->anchored_utf8 = NULL;
3563 SvREFCNT_dec(data.longest_fixed);
3564 longest_fixed_length = 0;
3567 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3568 r->regstclass = NULL;
3569 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3571 && !(data.start_class->flags & ANYOF_EOS)
3572 && !cl_is_anything(data.start_class))
3574 const I32 n = add_data(pRExC_state, 1, "f");
3576 Newx(RExC_rx->data->data[n], 1,
3577 struct regnode_charclass_class);
3578 StructCopy(data.start_class,
3579 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3580 struct regnode_charclass_class);
3581 r->regstclass = (regnode*)RExC_rx->data->data[n];
3582 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3583 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3584 regprop(r, sv, (regnode*)data.start_class);
3585 PerlIO_printf(Perl_debug_log,
3586 "synthetic stclass \"%s\".\n",
3587 SvPVX_const(sv));});
3590 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3591 if (longest_fixed_length > longest_float_length) {
3592 r->check_substr = r->anchored_substr;
3593 r->check_utf8 = r->anchored_utf8;
3594 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3595 if (r->reganch & ROPT_ANCH_SINGLE)
3596 r->reganch |= ROPT_NOSCAN;
3599 r->check_substr = r->float_substr;
3600 r->check_utf8 = r->float_utf8;
3601 r->check_offset_min = data.offset_float_min;
3602 r->check_offset_max = data.offset_float_max;
3604 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3605 This should be changed ASAP! */
3606 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3607 r->reganch |= RE_USE_INTUIT;
3608 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3609 r->reganch |= RE_INTUIT_TAIL;
3613 /* Several toplevels. Best we can is to set minlen. */
3615 struct regnode_charclass_class ch_class;
3618 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3620 scan = r->program + 1;
3621 cl_init(pRExC_state, &ch_class);
3622 data.start_class = &ch_class;
3623 data.last_closep = &last_close;
3625 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3626 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3628 #ifdef TRIE_STUDY_OPT
3629 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3634 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3635 = r->float_substr = r->float_utf8 = NULL;
3636 if (!(data.start_class->flags & ANYOF_EOS)
3637 && !cl_is_anything(data.start_class))
3639 const I32 n = add_data(pRExC_state, 1, "f");
3641 Newx(RExC_rx->data->data[n], 1,
3642 struct regnode_charclass_class);
3643 StructCopy(data.start_class,
3644 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3645 struct regnode_charclass_class);
3646 r->regstclass = (regnode*)RExC_rx->data->data[n];
3647 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3648 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3649 regprop(r, sv, (regnode*)data.start_class);
3650 PerlIO_printf(Perl_debug_log,
3651 "synthetic stclass \"%s\".\n",
3652 SvPVX_const(sv));});
3657 if (RExC_seen & REG_SEEN_GPOS)
3658 r->reganch |= ROPT_GPOS_SEEN;
3659 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3660 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3661 if (RExC_seen & REG_SEEN_EVAL)
3662 r->reganch |= ROPT_EVAL_SEEN;
3663 if (RExC_seen & REG_SEEN_CANY)
3664 r->reganch |= ROPT_CANY_SEEN;
3665 Newxz(r->startp, RExC_npar, I32);
3666 Newxz(r->endp, RExC_npar, I32);
3668 DEBUG_r( RX_DEBUG_on(r) );
3670 PerlIO_printf(Perl_debug_log,"Final program:\n");
3673 DEBUG_OFFSETS_r(if (r->offsets) {
3674 const U32 len = r->offsets[0];
3676 GET_RE_DEBUG_FLAGS_DECL;
3677 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3678 for (i = 1; i <= len; i++) {
3679 if (r->offsets[i*2-1] || r->offsets[i*2])
3680 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3681 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3683 PerlIO_printf(Perl_debug_log, "\n");
3689 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3690 int rem=(int)(RExC_end - RExC_parse); \
3699 if (RExC_lastparse!=RExC_parse) \
3700 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3703 iscut ? "..." : "<" \
3706 PerlIO_printf(Perl_debug_log,"%16s",""); \
3711 num=REG_NODE_NUM(RExC_emit); \
3712 if (RExC_lastnum!=num) \
3713 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3715 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3716 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3717 (int)((depth*2)), "", \
3721 RExC_lastparse=RExC_parse; \
3726 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3727 DEBUG_PARSE_MSG((funcname)); \
3728 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3731 - reg - regular expression, i.e. main body or parenthesized thing
3733 * Caller must absorb opening parenthesis.
3735 * Combining parenthesis handling with the base level of regular expression
3736 * is a trifle forced, but the need to tie the tails of the branches to what
3737 * follows makes it hard to avoid.
3739 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3741 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3743 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3747 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3748 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3751 register regnode *ret; /* Will be the head of the group. */
3752 register regnode *br;
3753 register regnode *lastbr;
3754 register regnode *ender = NULL;
3755 register I32 parno = 0;
3757 const I32 oregflags = RExC_flags;
3758 bool have_branch = 0;
3761 /* for (?g), (?gc), and (?o) warnings; warning
3762 about (?c) will warn about (?g) -- japhy */
3764 #define WASTED_O 0x01
3765 #define WASTED_G 0x02
3766 #define WASTED_C 0x04
3767 #define WASTED_GC (0x02|0x04)
3768 I32 wastedflags = 0x00;
3770 char * parse_start = RExC_parse; /* MJD */
3771 char * const oregcomp_parse = RExC_parse;
3773 GET_RE_DEBUG_FLAGS_DECL;
3774 DEBUG_PARSE("reg ");
3777 *flagp = 0; /* Tentatively. */
3780 /* Make an OPEN node, if parenthesized. */
3782 if (*RExC_parse == '?') { /* (?...) */
3783 U32 posflags = 0, negflags = 0;
3784 U32 *flagsp = &posflags;
3785 bool is_logical = 0;
3786 const char * const seqstart = RExC_parse;
3789 paren = *RExC_parse++;
3790 ret = NULL; /* For look-ahead/behind. */
3792 case '<': /* (?<...) */
3793 RExC_seen |= REG_SEEN_LOOKBEHIND;
3794 if (*RExC_parse == '!')
3796 if (*RExC_parse != '=' && *RExC_parse != '!')
3799 case '=': /* (?=...) */
3800 case '!': /* (?!...) */
3801 RExC_seen_zerolen++;
3802 case ':': /* (?:...) */
3803 case '>': /* (?>...) */
3805 case '$': /* (?$...) */
3806 case '@': /* (?@...) */
3807 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3809 case '#': /* (?#...) */
3810 while (*RExC_parse && *RExC_parse != ')')
3812 if (*RExC_parse != ')')
3813 FAIL("Sequence (?#... not terminated");
3814 nextchar(pRExC_state);
3817 case 'p': /* (?p...) */
3818 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3819 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3821 case '?': /* (??...) */
3823 if (*RExC_parse != '{')
3825 paren = *RExC_parse++;
3827 case '{': /* (?{...}) */
3829 I32 count = 1, n = 0;
3831 char *s = RExC_parse;
3833 RExC_seen_zerolen++;
3834 RExC_seen |= REG_SEEN_EVAL;
3835 while (count && (c = *RExC_parse)) {
3846 if (*RExC_parse != ')') {
3848 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3852 OP_4tree *sop, *rop;
3853 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3856 Perl_save_re_context(aTHX);
3857 rop = sv_compile_2op(sv, &sop, "re", &pad);
3858 sop->op_private |= OPpREFCOUNTED;
3859 /* re_dup will OpREFCNT_inc */
3860 OpREFCNT_set(sop, 1);
3863 n = add_data(pRExC_state, 3, "nop");
3864 RExC_rx->data->data[n] = (void*)rop;
3865 RExC_rx->data->data[n+1] = (void*)sop;
3866 RExC_rx->data->data[n+2] = (void*)pad;
3869 else { /* First pass */
3870 if (PL_reginterp_cnt < ++RExC_seen_evals
3872 /* No compiled RE interpolated, has runtime
3873 components ===> unsafe. */
3874 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3875 if (PL_tainting && PL_tainted)
3876 FAIL("Eval-group in insecure regular expression");
3877 #if PERL_VERSION > 8
3878 if (IN_PERL_COMPILETIME)
3883 nextchar(pRExC_state);
3885 ret = reg_node(pRExC_state, LOGICAL);
3888 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3889 /* deal with the length of this later - MJD */
3892 ret = reganode(pRExC_state, EVAL, n);
3893 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3894 Set_Node_Offset(ret, parse_start);
3897 case '(': /* (?(?{...})...) and (?(?=...)...) */
3899 if (RExC_parse[0] == '?') { /* (?(?...)) */
3900 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3901 || RExC_parse[1] == '<'
3902 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3905 ret = reg_node(pRExC_state, LOGICAL);
3908 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3912 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3915 parno = atoi(RExC_parse++);
3917 while (isDIGIT(*RExC_parse))
3919 ret = reganode(pRExC_state, GROUPP, parno);
3921 if ((c = *nextchar(pRExC_state)) != ')')
3922 vFAIL("Switch condition not recognized");
3924 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3925 br = regbranch(pRExC_state, &flags, 1,depth+1);
3927 br = reganode(pRExC_state, LONGJMP, 0);
3929 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3930 c = *nextchar(pRExC_state);
3934 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3935 regbranch(pRExC_state, &flags, 1,depth+1);
3936 REGTAIL(pRExC_state, ret, lastbr);
3939 c = *nextchar(pRExC_state);
3944 vFAIL("Switch (?(condition)... contains too many branches");
3945 ender = reg_node(pRExC_state, TAIL);
3946 REGTAIL(pRExC_state, br, ender);
3948 REGTAIL(pRExC_state, lastbr, ender);
3949 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3952 REGTAIL(pRExC_state, ret, ender);
3956 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3960 RExC_parse--; /* for vFAIL to print correctly */
3961 vFAIL("Sequence (? incomplete");
3965 parse_flags: /* (?i) */
3966 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3967 /* (?g), (?gc) and (?o) are useless here
3968 and must be globally applied -- japhy */
3970 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3971 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3972 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3973 if (! (wastedflags & wflagbit) ) {
3974 wastedflags |= wflagbit;
3977 "Useless (%s%c) - %suse /%c modifier",
3978 flagsp == &negflags ? "?-" : "?",
3980 flagsp == &negflags ? "don't " : "",
3986 else if (*RExC_parse == 'c') {
3987 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3988 if (! (wastedflags & WASTED_C) ) {
3989 wastedflags |= WASTED_GC;
3992 "Useless (%sc) - %suse /gc modifier",
3993 flagsp == &negflags ? "?-" : "?",
3994 flagsp == &negflags ? "don't " : ""
3999 else { pmflag(flagsp, *RExC_parse); }
4003 if (*RExC_parse == '-') {
4005 wastedflags = 0; /* reset so (?g-c) warns twice */
4009 RExC_flags |= posflags;
4010 RExC_flags &= ~negflags;
4011 if (*RExC_parse == ':') {
4017 if (*RExC_parse != ')') {
4019 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4021 nextchar(pRExC_state);
4029 ret = reganode(pRExC_state, OPEN, parno);
4030 Set_Node_Length(ret, 1); /* MJD */
4031 Set_Node_Offset(ret, RExC_parse); /* MJD */
4038 /* Pick up the branches, linking them together. */
4039 parse_start = RExC_parse; /* MJD */
4040 br = regbranch(pRExC_state, &flags, 1,depth+1);
4041 /* branch_len = (paren != 0); */
4045 if (*RExC_parse == '|') {
4046 if (!SIZE_ONLY && RExC_extralen) {
4047 reginsert(pRExC_state, BRANCHJ, br);
4050 reginsert(pRExC_state, BRANCH, br);
4051 Set_Node_Length(br, paren != 0);
4052 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4056 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4058 else if (paren == ':') {
4059 *flagp |= flags&SIMPLE;
4061 if (is_open) { /* Starts with OPEN. */
4062 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4064 else if (paren != '?') /* Not Conditional */
4066 *flagp |= flags & (SPSTART | HASWIDTH);
4068 while (*RExC_parse == '|') {
4069 if (!SIZE_ONLY && RExC_extralen) {
4070 ender = reganode(pRExC_state, LONGJMP,0);
4071 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4074 RExC_extralen += 2; /* Account for LONGJMP. */
4075 nextchar(pRExC_state);
4076 br = regbranch(pRExC_state, &flags, 0, depth+1);
4080 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4084 *flagp |= flags&SPSTART;
4087 if (have_branch || paren != ':') {
4088 /* Make a closing node, and hook it on the end. */
4091 ender = reg_node(pRExC_state, TAIL);
4094 ender = reganode(pRExC_state, CLOSE, parno);
4095 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4096 Set_Node_Length(ender,1); /* MJD */
4102 *flagp &= ~HASWIDTH;
4105 ender = reg_node(pRExC_state, SUCCEED);
4108 ender = reg_node(pRExC_state, END);
4111 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4113 if (have_branch && !SIZE_ONLY) {
4114 /* Hook the tails of the branches to the closing node. */
4115 for (br = ret; br; br = regnext(br)) {
4116 const U8 op = PL_regkind[OP(br)];
4118 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4120 else if (op == BRANCHJ) {
4121 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4129 static const char parens[] = "=!<,>";
4131 if (paren && (p = strchr(parens, paren))) {
4132 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4133 int flag = (p - parens) > 1;
4136 node = SUSPEND, flag = 0;
4137 reginsert(pRExC_state, node,ret);
4138 Set_Node_Cur_Length(ret);
4139 Set_Node_Offset(ret, parse_start + 1);
4141 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4145 /* Check for proper termination. */
4147 RExC_flags = oregflags;
4148 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4149 RExC_parse = oregcomp_parse;
4150 vFAIL("Unmatched (");
4153 else if (!paren && RExC_parse < RExC_end) {
4154 if (*RExC_parse == ')') {
4156 vFAIL("Unmatched )");
4159 FAIL("Junk on end of regexp"); /* "Can't happen". */
4167 - regbranch - one alternative of an | operator
4169 * Implements the concatenation operator.
4172 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4175 register regnode *ret;
4176 register regnode *chain = NULL;
4177 register regnode *latest;
4178 I32 flags = 0, c = 0;
4179 GET_RE_DEBUG_FLAGS_DECL;
4180 DEBUG_PARSE("brnc");
4184 if (!SIZE_ONLY && RExC_extralen)
4185 ret = reganode(pRExC_state, BRANCHJ,0);
4187 ret = reg_node(pRExC_state, BRANCH);
4188 Set_Node_Length(ret, 1);
4192 if (!first && SIZE_ONLY)
4193 RExC_extralen += 1; /* BRANCHJ */
4195 *flagp = WORST; /* Tentatively. */
4198 nextchar(pRExC_state);
4199 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4201 latest = regpiece(pRExC_state, &flags,depth+1);
4202 if (latest == NULL) {
4203 if (flags & TRYAGAIN)
4207 else if (ret == NULL)
4209 *flagp |= flags&HASWIDTH;
4210 if (chain == NULL) /* First piece. */
4211 *flagp |= flags&SPSTART;
4214 REGTAIL(pRExC_state, chain, latest);
4219 if (chain == NULL) { /* Loop ran zero times. */
4220 chain = reg_node(pRExC_state, NOTHING);
4225 *flagp |= flags&SIMPLE;
4232 - regpiece - something followed by possible [*+?]
4234 * Note that the branching code sequences used for ? and the general cases
4235 * of * and + are somewhat optimized: they use the same NOTHING node as
4236 * both the endmarker for their branch list and the body of the last branch.
4237 * It might seem that this node could be dispensed with entirely, but the
4238 * endmarker role is not redundant.
4241 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4244 register regnode *ret;
4246 register char *next;
4248 const char * const origparse = RExC_parse;
4250 I32 max = REG_INFTY;
4252 GET_RE_DEBUG_FLAGS_DECL;
4253 DEBUG_PARSE("piec");
4255 ret = regatom(pRExC_state, &flags,depth+1);
4257 if (flags & TRYAGAIN)
4264 if (op == '{' && regcurly(RExC_parse)) {
4265 const char *maxpos = NULL;
4266 parse_start = RExC_parse; /* MJD */
4267 next = RExC_parse + 1;
4268 while (isDIGIT(*next) || *next == ',') {
4277 if (*next == '}') { /* got one */
4281 min = atoi(RExC_parse);
4285 maxpos = RExC_parse;
4287 if (!max && *maxpos != '0')
4288 max = REG_INFTY; /* meaning "infinity" */
4289 else if (max >= REG_INFTY)
4290 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4292 nextchar(pRExC_state);
4295 if ((flags&SIMPLE)) {
4296 RExC_naughty += 2 + RExC_naughty / 2;
4297 reginsert(pRExC_state, CURLY, ret);
4298 Set_Node_Offset(ret, parse_start+1); /* MJD */
4299 Set_Node_Cur_Length(ret);
4302 regnode * const w = reg_node(pRExC_state, WHILEM);
4305 REGTAIL(pRExC_state, ret, w);
4306 if (!SIZE_ONLY && RExC_extralen) {
4307 reginsert(pRExC_state, LONGJMP,ret);
4308 reginsert(pRExC_state, NOTHING,ret);
4309 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4311 reginsert(pRExC_state, CURLYX,ret);
4313 Set_Node_Offset(ret, parse_start+1);
4314 Set_Node_Length(ret,
4315 op == '{' ? (RExC_parse - parse_start) : 1);
4317 if (!SIZE_ONLY && RExC_extralen)
4318 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4319 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4321 RExC_whilem_seen++, RExC_extralen += 3;
4322 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4330 if (max && max < min)
4331 vFAIL("Can't do {n,m} with n > m");
4333 ARG1_SET(ret, (U16)min);
4334 ARG2_SET(ret, (U16)max);
4346 #if 0 /* Now runtime fix should be reliable. */
4348 /* if this is reinstated, don't forget to put this back into perldiag:
4350 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4352 (F) The part of the regexp subject to either the * or + quantifier
4353 could match an empty string. The {#} shows in the regular
4354 expression about where the problem was discovered.
4358 if (!(flags&HASWIDTH) && op != '?')
4359 vFAIL("Regexp *+ operand could be empty");
4362 parse_start = RExC_parse;
4363 nextchar(pRExC_state);
4365 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4367 if (op == '*' && (flags&SIMPLE)) {
4368 reginsert(pRExC_state, STAR, ret);
4372 else if (op == '*') {
4376 else if (op == '+' && (flags&SIMPLE)) {
4377 reginsert(pRExC_state, PLUS, ret);
4381 else if (op == '+') {
4385 else if (op == '?') {
4390 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4392 "%.*s matches null string many times",
4393 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4397 if (*RExC_parse == '?') {
4398 nextchar(pRExC_state);
4399 reginsert(pRExC_state, MINMOD, ret);
4400 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4402 if (ISMULT2(RExC_parse)) {
4404 vFAIL("Nested quantifiers");
4411 - regatom - the lowest level
4413 * Optimization: gobbles an entire sequence of ordinary characters so that
4414 * it can turn them into a single node, which is smaller to store and
4415 * faster to run. Backslashed characters are exceptions, each becoming a
4416 * separate node; the code is simpler that way and it's not worth fixing.
4418 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4419 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4422 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4425 register regnode *ret = NULL;
4427 char *parse_start = RExC_parse;
4428 GET_RE_DEBUG_FLAGS_DECL;
4429 DEBUG_PARSE("atom");
4430 *flagp = WORST; /* Tentatively. */
4433 switch (*RExC_parse) {
4435 RExC_seen_zerolen++;
4436 nextchar(pRExC_state);
4437 if (RExC_flags & PMf_MULTILINE)
4438 ret = reg_node(pRExC_state, MBOL);
4439 else if (RExC_flags & PMf_SINGLELINE)
4440 ret = reg_node(pRExC_state, SBOL);
4442 ret = reg_node(pRExC_state, BOL);
4443 Set_Node_Length(ret, 1); /* MJD */
4446 nextchar(pRExC_state);
4448 RExC_seen_zerolen++;
4449 if (RExC_flags & PMf_MULTILINE)
4450 ret = reg_node(pRExC_state, MEOL);
4451 else if (RExC_flags & PMf_SINGLELINE)
4452 ret = reg_node(pRExC_state, SEOL);
4454 ret = reg_node(pRExC_state, EOL);
4455 Set_Node_Length(ret, 1); /* MJD */
4458 nextchar(pRExC_state);
4459 if (RExC_flags & PMf_SINGLELINE)
4460 ret = reg_node(pRExC_state, SANY);
4462 ret = reg_node(pRExC_state, REG_ANY);
4463 *flagp |= HASWIDTH|SIMPLE;
4465 Set_Node_Length(ret, 1); /* MJD */
4469 char * const oregcomp_parse = ++RExC_parse;
4470 ret = regclass(pRExC_state,depth+1);
4471 if (*RExC_parse != ']') {
4472 RExC_parse = oregcomp_parse;
4473 vFAIL("Unmatched [");
4475 nextchar(pRExC_state);
4476 *flagp |= HASWIDTH|SIMPLE;
4477 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4481 nextchar(pRExC_state);
4482 ret = reg(pRExC_state, 1, &flags,depth+1);
4484 if (flags & TRYAGAIN) {
4485 if (RExC_parse == RExC_end) {
4486 /* Make parent create an empty node if needed. */
4494 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4498 if (flags & TRYAGAIN) {
4502 vFAIL("Internal urp");
4503 /* Supposed to be caught earlier. */
4506 if (!regcurly(RExC_parse)) {
4515 vFAIL("Quantifier follows nothing");
4518 switch (*++RExC_parse) {
4520 RExC_seen_zerolen++;
4521 ret = reg_node(pRExC_state, SBOL);
4523 nextchar(pRExC_state);
4524 Set_Node_Length(ret, 2); /* MJD */
4527 ret = reg_node(pRExC_state, GPOS);
4528 RExC_seen |= REG_SEEN_GPOS;
4530 nextchar(pRExC_state);
4531 Set_Node_Length(ret, 2); /* MJD */
4534 ret = reg_node(pRExC_state, SEOL);
4536 RExC_seen_zerolen++; /* Do not optimize RE away */
4537 nextchar(pRExC_state);
4540 ret = reg_node(pRExC_state, EOS);
4542 RExC_seen_zerolen++; /* Do not optimize RE away */
4543 nextchar(pRExC_state);
4544 Set_Node_Length(ret, 2); /* MJD */
4547 ret = reg_node(pRExC_state, CANY);
4548 RExC_seen |= REG_SEEN_CANY;
4549 *flagp |= HASWIDTH|SIMPLE;
4550 nextchar(pRExC_state);
4551 Set_Node_Length(ret, 2); /* MJD */
4554 ret = reg_node(pRExC_state, CLUMP);
4556 nextchar(pRExC_state);
4557 Set_Node_Length(ret, 2); /* MJD */
4560 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4561 *flagp |= HASWIDTH|SIMPLE;
4562 nextchar(pRExC_state);
4563 Set_Node_Length(ret, 2); /* MJD */
4566 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4567 *flagp |= HASWIDTH|SIMPLE;
4568 nextchar(pRExC_state);
4569 Set_Node_Length(ret, 2); /* MJD */
4572 RExC_seen_zerolen++;
4573 RExC_seen |= REG_SEEN_LOOKBEHIND;
4574 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4576 nextchar(pRExC_state);
4577 Set_Node_Length(ret, 2); /* MJD */
4580 RExC_seen_zerolen++;
4581 RExC_seen |= REG_SEEN_LOOKBEHIND;
4582 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4584 nextchar(pRExC_state);
4585 Set_Node_Length(ret, 2); /* MJD */
4588 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4589 *flagp |= HASWIDTH|SIMPLE;
4590 nextchar(pRExC_state);
4591 Set_Node_Length(ret, 2); /* MJD */
4594 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4595 *flagp |= HASWIDTH|SIMPLE;
4596 nextchar(pRExC_state);
4597 Set_Node_Length(ret, 2); /* MJD */
4600 ret = reg_node(pRExC_state, DIGIT);
4601 *flagp |= HASWIDTH|SIMPLE;
4602 nextchar(pRExC_state);
4603 Set_Node_Length(ret, 2); /* MJD */
4606 ret = reg_node(pRExC_state, NDIGIT);
4607 *flagp |= HASWIDTH|SIMPLE;
4608 nextchar(pRExC_state);
4609 Set_Node_Length(ret, 2); /* MJD */
4614 char* const oldregxend = RExC_end;
4615 char* parse_start = RExC_parse - 2;
4617 if (RExC_parse[1] == '{') {
4618 /* a lovely hack--pretend we saw [\pX] instead */
4619 RExC_end = strchr(RExC_parse, '}');
4621 const U8 c = (U8)*RExC_parse;
4623 RExC_end = oldregxend;
4624 vFAIL2("Missing right brace on \\%c{}", c);
4629 RExC_end = RExC_parse + 2;
4630 if (RExC_end > oldregxend)
4631 RExC_end = oldregxend;
4635 ret = regclass(pRExC_state,depth+1);
4637 RExC_end = oldregxend;
4640 Set_Node_Offset(ret, parse_start + 2);
4641 Set_Node_Cur_Length(ret);
4642 nextchar(pRExC_state);
4643 *flagp |= HASWIDTH|SIMPLE;
4656 case '1': case '2': case '3': case '4':
4657 case '5': case '6': case '7': case '8': case '9':
4659 const I32 num = atoi(RExC_parse);
4661 if (num > 9 && num >= RExC_npar)
4664 char * const parse_start = RExC_parse - 1; /* MJD */
4665 while (isDIGIT(*RExC_parse))
4668 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4669 vFAIL("Reference to nonexistent group");
4671 ret = reganode(pRExC_state,
4672 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4676 /* override incorrect value set in reganode MJD */
4677 Set_Node_Offset(ret, parse_start+1);
4678 Set_Node_Cur_Length(ret); /* MJD */
4680 nextchar(pRExC_state);
4685 if (RExC_parse >= RExC_end)
4686 FAIL("Trailing \\");
4689 /* Do not generate "unrecognized" warnings here, we fall
4690 back into the quick-grab loop below */
4697 if (RExC_flags & PMf_EXTENDED) {
4698 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4700 if (RExC_parse < RExC_end)
4706 register STRLEN len;
4711 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4713 parse_start = RExC_parse - 1;
4719 ret = reg_node(pRExC_state,
4720 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4722 for (len = 0, p = RExC_parse - 1;
4723 len < 127 && p < RExC_end;
4726 char * const oldp = p;
4728 if (RExC_flags & PMf_EXTENDED)
4729 p = regwhite(p, RExC_end);
4776 ender = ASCII_TO_NATIVE('\033');
4780 ender = ASCII_TO_NATIVE('\007');
4785 char* const e = strchr(p, '}');
4789 vFAIL("Missing right brace on \\x{}");
4792 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4793 | PERL_SCAN_DISALLOW_PREFIX;
4794 STRLEN numlen = e - p - 1;
4795 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4802 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4804 ender = grok_hex(p, &numlen, &flags, NULL);
4810 ender = UCHARAT(p++);
4811 ender = toCTRL(ender);
4813 case '0': case '1': case '2': case '3':case '4':
4814 case '5': case '6': case '7': case '8':case '9':
4816 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4819 ender = grok_oct(p, &numlen, &flags, NULL);
4829 FAIL("Trailing \\");
4832 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4833 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4834 goto normal_default;
4839 if (UTF8_IS_START(*p) && UTF) {
4841 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4842 &numlen, UTF8_ALLOW_DEFAULT);
4849 if (RExC_flags & PMf_EXTENDED)
4850 p = regwhite(p, RExC_end);
4852 /* Prime the casefolded buffer. */
4853 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4855 if (ISMULT2(p)) { /* Back off on ?+*. */
4860 /* Emit all the Unicode characters. */
4862 for (foldbuf = tmpbuf;
4864 foldlen -= numlen) {
4865 ender = utf8_to_uvchr(foldbuf, &numlen);
4867 const STRLEN unilen = reguni(pRExC_state, ender, s);
4870 /* In EBCDIC the numlen
4871 * and unilen can differ. */
4873 if (numlen >= foldlen)
4877 break; /* "Can't happen." */
4881 const STRLEN unilen = reguni(pRExC_state, ender, s);
4890 REGC((char)ender, s++);
4896 /* Emit all the Unicode characters. */
4898 for (foldbuf = tmpbuf;
4900 foldlen -= numlen) {
4901 ender = utf8_to_uvchr(foldbuf, &numlen);
4903 const STRLEN unilen = reguni(pRExC_state, ender, s);
4906 /* In EBCDIC the numlen
4907 * and unilen can differ. */
4909 if (numlen >= foldlen)
4917 const STRLEN unilen = reguni(pRExC_state, ender, s);
4926 REGC((char)ender, s++);
4930 Set_Node_Cur_Length(ret); /* MJD */
4931 nextchar(pRExC_state);
4933 /* len is STRLEN which is unsigned, need to copy to signed */
4936 vFAIL("Internal disaster");
4940 if (len == 1 && UNI_IS_INVARIANT(ender))
4944 RExC_size += STR_SZ(len);
4947 RExC_emit += STR_SZ(len);
4953 /* If the encoding pragma is in effect recode the text of
4954 * any EXACT-kind nodes. */
4955 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4956 const STRLEN oldlen = STR_LEN(ret);
4957 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4961 if (sv_utf8_downgrade(sv, TRUE)) {
4962 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4963 const STRLEN newlen = SvCUR(sv);
4968 GET_RE_DEBUG_FLAGS_DECL;
4969 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4970 (int)oldlen, STRING(ret),
4972 Copy(s, STRING(ret), newlen, char);
4973 STR_LEN(ret) += newlen - oldlen;
4974 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4976 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4984 S_regwhite(char *p, const char *e)
4989 else if (*p == '#') {
4992 } while (p < e && *p != '\n');
5000 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5001 Character classes ([:foo:]) can also be negated ([:^foo:]).
5002 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5003 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
5004 but trigger failures because they are currently unimplemented. */
5006 #define POSIXCC_DONE(c) ((c) == ':')
5007 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5008 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5011 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5014 I32 namedclass = OOB_NAMEDCLASS;
5016 if (value == '[' && RExC_parse + 1 < RExC_end &&
5017 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5018 POSIXCC(UCHARAT(RExC_parse))) {
5019 const char c = UCHARAT(RExC_parse);
5020 char* const s = RExC_parse++;
5022 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5024 if (RExC_parse == RExC_end)
5025 /* Grandfather lone [:, [=, [. */
5028 const char* const t = RExC_parse++; /* skip over the c */
5031 if (UCHARAT(RExC_parse) == ']') {
5032 const char *posixcc = s + 1;
5033 RExC_parse++; /* skip over the ending ] */
5036 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5037 const I32 skip = t - posixcc;
5039 /* Initially switch on the length of the name. */
5042 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5043 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5046 /* Names all of length 5. */
5047 /* alnum alpha ascii blank cntrl digit graph lower
5048 print punct space upper */
5049 /* Offset 4 gives the best switch position. */
5050 switch (posixcc[4]) {
5052 if (memEQ(posixcc, "alph", 4)) /* alpha */
5053 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5056 if (memEQ(posixcc, "spac", 4)) /* space */
5057 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5060 if (memEQ(posixcc, "grap", 4)) /* graph */
5061 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5064 if (memEQ(posixcc, "asci", 4)) /* ascii */
5065 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5068 if (memEQ(posixcc, "blan", 4)) /* blank */
5069 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5072 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5073 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5076 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5077 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5080 if (memEQ(posixcc, "lowe", 4)) /* lower */
5081 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5082 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5083 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5086 if (memEQ(posixcc, "digi", 4)) /* digit */
5087 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5088 else if (memEQ(posixcc, "prin", 4)) /* print */
5089 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5090 else if (memEQ(posixcc, "punc", 4)) /* punct */
5091 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5096 if (memEQ(posixcc, "xdigit", 6))
5097 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5101 if (namedclass == OOB_NAMEDCLASS)
5102 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5104 assert (posixcc[skip] == ':');
5105 assert (posixcc[skip+1] == ']');
5106 } else if (!SIZE_ONLY) {
5107 /* [[=foo=]] and [[.foo.]] are still future. */
5109 /* adjust RExC_parse so the warning shows after
5111 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5113 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5116 /* Maternal grandfather:
5117 * "[:" ending in ":" but not in ":]" */
5127 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5130 if (POSIXCC(UCHARAT(RExC_parse))) {
5131 const char *s = RExC_parse;
5132 const char c = *s++;
5136 if (*s && c == *s && s[1] == ']') {
5137 if (ckWARN(WARN_REGEXP))
5139 "POSIX syntax [%c %c] belongs inside character classes",
5142 /* [[=foo=]] and [[.foo.]] are still future. */
5143 if (POSIXCC_NOTYET(c)) {
5144 /* adjust RExC_parse so the error shows after
5146 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5148 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5156 parse a class specification and produce either an ANYOF node that
5157 matches the pattern. If the pattern matches a single char only and
5158 that char is < 256 then we produce an EXACT node instead.
5161 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5165 register UV nextvalue;
5166 register IV prevvalue = OOB_UNICODE;
5167 register IV range = 0;
5168 register regnode *ret;
5171 char *rangebegin = NULL;
5172 bool need_class = 0;
5175 bool optimize_invert = TRUE;
5176 AV* unicode_alternate = NULL;
5178 UV literal_endpoint = 0;
5180 UV stored = 0; /* number of chars stored in the class */
5182 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5183 case we need to change the emitted regop to an EXACT. */
5184 const char * orig_parse = RExC_parse;
5186 PERL_UNUSED_ARG(depth);
5188 GET_RE_DEBUG_FLAGS_DECL;
5189 DEBUG_PARSE("clas");
5191 /* Assume we are going to generate an ANYOF node. */
5192 ret = reganode(pRExC_state, ANYOF, 0);
5195 ANYOF_FLAGS(ret) = 0;
5197 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5201 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5205 RExC_size += ANYOF_SKIP;
5206 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5209 RExC_emit += ANYOF_SKIP;
5211 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5213 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5214 ANYOF_BITMAP_ZERO(ret);
5215 listsv = newSVpvs("# comment\n");
5218 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5220 if (!SIZE_ONLY && POSIXCC(nextvalue))
5221 checkposixcc(pRExC_state);
5223 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5224 if (UCHARAT(RExC_parse) == ']')
5227 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5231 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5234 rangebegin = RExC_parse;
5236 value = utf8n_to_uvchr((U8*)RExC_parse,
5237 RExC_end - RExC_parse,
5238 &numlen, UTF8_ALLOW_DEFAULT);
5239 RExC_parse += numlen;
5242 value = UCHARAT(RExC_parse++);
5244 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5245 if (value == '[' && POSIXCC(nextvalue))
5246 namedclass = regpposixcc(pRExC_state, value);
5247 else if (value == '\\') {
5249 value = utf8n_to_uvchr((U8*)RExC_parse,
5250 RExC_end - RExC_parse,
5251 &numlen, UTF8_ALLOW_DEFAULT);
5252 RExC_parse += numlen;
5255 value = UCHARAT(RExC_parse++);
5256 /* Some compilers cannot handle switching on 64-bit integer
5257 * values, therefore value cannot be an UV. Yes, this will
5258 * be a problem later if we want switch on Unicode.
5259 * A similar issue a little bit later when switching on
5260 * namedclass. --jhi */
5261 switch ((I32)value) {
5262 case 'w': namedclass = ANYOF_ALNUM; break;
5263 case 'W': namedclass = ANYOF_NALNUM; break;
5264 case 's': namedclass = ANYOF_SPACE; break;
5265 case 'S': namedclass = ANYOF_NSPACE; break;
5266 case 'd': namedclass = ANYOF_DIGIT; break;
5267 case 'D': namedclass = ANYOF_NDIGIT; break;
5272 if (RExC_parse >= RExC_end)
5273 vFAIL2("Empty \\%c{}", (U8)value);
5274 if (*RExC_parse == '{') {
5275 const U8 c = (U8)value;
5276 e = strchr(RExC_parse++, '}');
5278 vFAIL2("Missing right brace on \\%c{}", c);
5279 while (isSPACE(UCHARAT(RExC_parse)))
5281 if (e == RExC_parse)
5282 vFAIL2("Empty \\%c{}", c);
5284 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5292 if (UCHARAT(RExC_parse) == '^') {
5295 value = value == 'p' ? 'P' : 'p'; /* toggle */
5296 while (isSPACE(UCHARAT(RExC_parse))) {
5301 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5302 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5305 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5306 namedclass = ANYOF_MAX; /* no official name, but it's named */
5309 case 'n': value = '\n'; break;
5310 case 'r': value = '\r'; break;
5311 case 't': value = '\t'; break;
5312 case 'f': value = '\f'; break;
5313 case 'b': value = '\b'; break;
5314 case 'e': value = ASCII_TO_NATIVE('\033');break;
5315 case 'a': value = ASCII_TO_NATIVE('\007');break;
5317 if (*RExC_parse == '{') {
5318 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5319 | PERL_SCAN_DISALLOW_PREFIX;
5320 char * const e = strchr(RExC_parse++, '}');
5322 vFAIL("Missing right brace on \\x{}");
5324 numlen = e - RExC_parse;
5325 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5329 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5331 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5332 RExC_parse += numlen;
5336 value = UCHARAT(RExC_parse++);
5337 value = toCTRL(value);
5339 case '0': case '1': case '2': case '3': case '4':
5340 case '5': case '6': case '7': case '8': case '9':
5344 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5345 RExC_parse += numlen;
5349 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5351 "Unrecognized escape \\%c in character class passed through",
5355 } /* end of \blah */
5361 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5363 if (!SIZE_ONLY && !need_class)
5364 ANYOF_CLASS_ZERO(ret);
5368 /* a bad range like a-\d, a-[:digit:] ? */
5371 if (ckWARN(WARN_REGEXP)) {
5373 RExC_parse >= rangebegin ?
5374 RExC_parse - rangebegin : 0;
5376 "False [] range \"%*.*s\"",
5379 if (prevvalue < 256) {
5380 ANYOF_BITMAP_SET(ret, prevvalue);
5381 ANYOF_BITMAP_SET(ret, '-');
5384 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5385 Perl_sv_catpvf(aTHX_ listsv,
5386 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5390 range = 0; /* this was not a true range */
5394 const char *what = NULL;
5397 if (namedclass > OOB_NAMEDCLASS)
5398 optimize_invert = FALSE;
5399 /* Possible truncation here but in some 64-bit environments
5400 * the compiler gets heartburn about switch on 64-bit values.
5401 * A similar issue a little earlier when switching on value.
5403 switch ((I32)namedclass) {
5406 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5408 for (value = 0; value < 256; value++)
5410 ANYOF_BITMAP_SET(ret, value);
5417 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5419 for (value = 0; value < 256; value++)
5420 if (!isALNUM(value))
5421 ANYOF_BITMAP_SET(ret, value);
5428 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5430 for (value = 0; value < 256; value++)
5431 if (isALNUMC(value))
5432 ANYOF_BITMAP_SET(ret, value);
5439 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5441 for (value = 0; value < 256; value++)
5442 if (!isALNUMC(value))
5443 ANYOF_BITMAP_SET(ret, value);
5450 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5452 for (value = 0; value < 256; value++)
5454 ANYOF_BITMAP_SET(ret, value);
5461 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5463 for (value = 0; value < 256; value++)
5464 if (!isALPHA(value))
5465 ANYOF_BITMAP_SET(ret, value);
5472 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5475 for (value = 0; value < 128; value++)
5476 ANYOF_BITMAP_SET(ret, value);
5478 for (value = 0; value < 256; value++) {
5480 ANYOF_BITMAP_SET(ret, value);
5489 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5492 for (value = 128; value < 256; value++)
5493 ANYOF_BITMAP_SET(ret, value);
5495 for (value = 0; value < 256; value++) {
5496 if (!isASCII(value))
5497 ANYOF_BITMAP_SET(ret, value);
5506 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5508 for (value = 0; value < 256; value++)
5510 ANYOF_BITMAP_SET(ret, value);
5517 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5519 for (value = 0; value < 256; value++)
5520 if (!isBLANK(value))
5521 ANYOF_BITMAP_SET(ret, value);
5528 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5530 for (value = 0; value < 256; value++)
5532 ANYOF_BITMAP_SET(ret, value);
5539 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5541 for (value = 0; value < 256; value++)
5542 if (!isCNTRL(value))
5543 ANYOF_BITMAP_SET(ret, value);
5550 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5552 /* consecutive digits assumed */
5553 for (value = '0'; value <= '9'; value++)
5554 ANYOF_BITMAP_SET(ret, value);
5561 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5563 /* consecutive digits assumed */
5564 for (value = 0; value < '0'; value++)
5565 ANYOF_BITMAP_SET(ret, value);
5566 for (value = '9' + 1; value < 256; value++)
5567 ANYOF_BITMAP_SET(ret, value);
5574 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5576 for (value = 0; value < 256; value++)
5578 ANYOF_BITMAP_SET(ret, value);
5585 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5587 for (value = 0; value < 256; value++)
5588 if (!isGRAPH(value))
5589 ANYOF_BITMAP_SET(ret, value);
5596 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5598 for (value = 0; value < 256; value++)
5600 ANYOF_BITMAP_SET(ret, value);
5607 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5609 for (value = 0; value < 256; value++)
5610 if (!isLOWER(value))
5611 ANYOF_BITMAP_SET(ret, value);
5618 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5620 for (value = 0; value < 256; value++)
5622 ANYOF_BITMAP_SET(ret, value);
5629 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5631 for (value = 0; value < 256; value++)
5632 if (!isPRINT(value))
5633 ANYOF_BITMAP_SET(ret, value);
5640 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5642 for (value = 0; value < 256; value++)
5643 if (isPSXSPC(value))
5644 ANYOF_BITMAP_SET(ret, value);
5651 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5653 for (value = 0; value < 256; value++)
5654 if (!isPSXSPC(value))
5655 ANYOF_BITMAP_SET(ret, value);
5662 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5664 for (value = 0; value < 256; value++)
5666 ANYOF_BITMAP_SET(ret, value);
5673 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5675 for (value = 0; value < 256; value++)
5676 if (!isPUNCT(value))
5677 ANYOF_BITMAP_SET(ret, value);
5684 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5686 for (value = 0; value < 256; value++)
5688 ANYOF_BITMAP_SET(ret, value);
5695 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5697 for (value = 0; value < 256; value++)
5698 if (!isSPACE(value))
5699 ANYOF_BITMAP_SET(ret, value);
5706 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5708 for (value = 0; value < 256; value++)
5710 ANYOF_BITMAP_SET(ret, value);
5717 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5719 for (value = 0; value < 256; value++)
5720 if (!isUPPER(value))
5721 ANYOF_BITMAP_SET(ret, value);
5728 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5730 for (value = 0; value < 256; value++)
5731 if (isXDIGIT(value))
5732 ANYOF_BITMAP_SET(ret, value);
5739 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5741 for (value = 0; value < 256; value++)
5742 if (!isXDIGIT(value))
5743 ANYOF_BITMAP_SET(ret, value);
5749 /* this is to handle \p and \P */
5752 vFAIL("Invalid [::] class");
5756 /* Strings such as "+utf8::isWord\n" */
5757 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5760 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5763 } /* end of namedclass \blah */
5766 if (prevvalue > (IV)value) /* b-a */ {
5767 const int w = RExC_parse - rangebegin;
5768 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5769 range = 0; /* not a valid range */
5773 prevvalue = value; /* save the beginning of the range */
5774 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5775 RExC_parse[1] != ']') {
5778 /* a bad range like \w-, [:word:]- ? */
5779 if (namedclass > OOB_NAMEDCLASS) {
5780 if (ckWARN(WARN_REGEXP)) {
5782 RExC_parse >= rangebegin ?
5783 RExC_parse - rangebegin : 0;
5785 "False [] range \"%*.*s\"",
5789 ANYOF_BITMAP_SET(ret, '-');
5791 range = 1; /* yeah, it's a range! */
5792 continue; /* but do it the next time */
5796 /* now is the next time */
5797 /*stored += (value - prevvalue + 1);*/
5799 if (prevvalue < 256) {
5800 const IV ceilvalue = value < 256 ? value : 255;
5803 /* In EBCDIC [\x89-\x91] should include
5804 * the \x8e but [i-j] should not. */
5805 if (literal_endpoint == 2 &&
5806 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5807 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5809 if (isLOWER(prevvalue)) {
5810 for (i = prevvalue; i <= ceilvalue; i++)
5812 ANYOF_BITMAP_SET(ret, i);
5814 for (i = prevvalue; i <= ceilvalue; i++)
5816 ANYOF_BITMAP_SET(ret, i);
5821 for (i = prevvalue; i <= ceilvalue; i++) {
5822 if (!ANYOF_BITMAP_TEST(ret,i)) {
5824 ANYOF_BITMAP_SET(ret, i);
5828 if (value > 255 || UTF) {
5829 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5830 const UV natvalue = NATIVE_TO_UNI(value);
5831 stored+=2; /* can't optimize this class */
5832 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5833 if (prevnatvalue < natvalue) { /* what about > ? */
5834 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5835 prevnatvalue, natvalue);
5837 else if (prevnatvalue == natvalue) {
5838 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5840 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5842 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5844 /* If folding and foldable and a single
5845 * character, insert also the folded version
5846 * to the charclass. */
5848 if (foldlen == (STRLEN)UNISKIP(f))
5849 Perl_sv_catpvf(aTHX_ listsv,
5852 /* Any multicharacter foldings
5853 * require the following transform:
5854 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5855 * where E folds into "pq" and F folds
5856 * into "rst", all other characters
5857 * fold to single characters. We save
5858 * away these multicharacter foldings,
5859 * to be later saved as part of the
5860 * additional "s" data. */
5863 if (!unicode_alternate)
5864 unicode_alternate = newAV();
5865 sv = newSVpvn((char*)foldbuf, foldlen);
5867 av_push(unicode_alternate, sv);
5871 /* If folding and the value is one of the Greek
5872 * sigmas insert a few more sigmas to make the
5873 * folding rules of the sigmas to work right.
5874 * Note that not all the possible combinations
5875 * are handled here: some of them are handled
5876 * by the standard folding rules, and some of
5877 * them (literal or EXACTF cases) are handled
5878 * during runtime in regexec.c:S_find_byclass(). */
5879 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5880 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5881 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5882 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5883 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5885 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5886 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5887 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5892 literal_endpoint = 0;
5896 range = 0; /* this range (if it was one) is done now */
5900 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5902 RExC_size += ANYOF_CLASS_ADD_SKIP;
5904 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5910 /****** !SIZE_ONLY AFTER HERE *********/
5912 if( stored == 1 && value < 256
5913 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5915 /* optimize single char class to an EXACT node
5916 but *only* when its not a UTF/high char */
5917 const char * cur_parse= RExC_parse;
5918 RExC_emit = (regnode *)orig_emit;
5919 RExC_parse = (char *)orig_parse;
5920 ret = reg_node(pRExC_state,
5921 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5922 RExC_parse = (char *)cur_parse;
5923 *STRING(ret)= (char)value;
5925 RExC_emit += STR_SZ(1);
5928 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5929 if ( /* If the only flag is folding (plus possibly inversion). */
5930 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5932 for (value = 0; value < 256; ++value) {
5933 if (ANYOF_BITMAP_TEST(ret, value)) {
5934 UV fold = PL_fold[value];
5937 ANYOF_BITMAP_SET(ret, fold);
5940 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5943 /* optimize inverted simple patterns (e.g. [^a-z]) */
5944 if (optimize_invert &&
5945 /* If the only flag is inversion. */
5946 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5947 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5948 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5949 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5952 AV * const av = newAV();
5954 /* The 0th element stores the character class description
5955 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5956 * to initialize the appropriate swash (which gets stored in
5957 * the 1st element), and also useful for dumping the regnode.
5958 * The 2nd element stores the multicharacter foldings,
5959 * used later (regexec.c:S_reginclass()). */
5960 av_store(av, 0, listsv);
5961 av_store(av, 1, NULL);
5962 av_store(av, 2, (SV*)unicode_alternate);
5963 rv = newRV_noinc((SV*)av);
5964 n = add_data(pRExC_state, 1, "s");
5965 RExC_rx->data->data[n] = (void*)rv;
5972 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5974 char* const retval = RExC_parse++;
5977 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5978 RExC_parse[2] == '#') {
5979 while (*RExC_parse != ')') {
5980 if (RExC_parse == RExC_end)
5981 FAIL("Sequence (?#... not terminated");
5987 if (RExC_flags & PMf_EXTENDED) {
5988 if (isSPACE(*RExC_parse)) {
5992 else if (*RExC_parse == '#') {
5993 while (RExC_parse < RExC_end)
5994 if (*RExC_parse++ == '\n') break;
6003 - reg_node - emit a node
6005 STATIC regnode * /* Location. */
6006 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
6009 register regnode *ptr;
6010 regnode * const ret = RExC_emit;
6011 GET_RE_DEBUG_FLAGS_DECL;
6014 SIZE_ALIGN(RExC_size);
6018 NODE_ALIGN_FILL(ret);
6020 FILL_ADVANCE_NODE(ptr, op);
6021 if (RExC_offsets) { /* MJD */
6022 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6023 "reg_node", __LINE__,
6025 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6026 ? "Overwriting end of array!\n" : "OK",
6027 (UV)(RExC_emit - RExC_emit_start),
6028 (UV)(RExC_parse - RExC_start),
6029 (UV)RExC_offsets[0]));
6030 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6039 - reganode - emit a node with an argument
6041 STATIC regnode * /* Location. */
6042 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6045 register regnode *ptr;
6046 regnode * const ret = RExC_emit;
6047 GET_RE_DEBUG_FLAGS_DECL;
6050 SIZE_ALIGN(RExC_size);
6055 NODE_ALIGN_FILL(ret);
6057 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6058 if (RExC_offsets) { /* MJD */
6059 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6063 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6064 "Overwriting end of array!\n" : "OK",
6065 (UV)(RExC_emit - RExC_emit_start),
6066 (UV)(RExC_parse - RExC_start),
6067 (UV)RExC_offsets[0]));
6068 Set_Cur_Node_Offset;
6077 - reguni - emit (if appropriate) a Unicode character
6080 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6083 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6087 - reginsert - insert an operator in front of already-emitted operand
6089 * Means relocating the operand.
6092 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6095 register regnode *src;
6096 register regnode *dst;
6097 register regnode *place;
6098 const int offset = regarglen[(U8)op];
6099 GET_RE_DEBUG_FLAGS_DECL;
6100 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6103 RExC_size += NODE_STEP_REGNODE + offset;
6108 RExC_emit += NODE_STEP_REGNODE + offset;
6110 while (src > opnd) {
6111 StructCopy(--src, --dst, regnode);
6112 if (RExC_offsets) { /* MJD 20010112 */
6113 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6117 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6118 ? "Overwriting end of array!\n" : "OK",
6119 (UV)(src - RExC_emit_start),
6120 (UV)(dst - RExC_emit_start),
6121 (UV)RExC_offsets[0]));
6122 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6123 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6128 place = opnd; /* Op node, where operand used to be. */
6129 if (RExC_offsets) { /* MJD */
6130 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6134 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6135 ? "Overwriting end of array!\n" : "OK",
6136 (UV)(place - RExC_emit_start),
6137 (UV)(RExC_parse - RExC_start),
6139 Set_Node_Offset(place, RExC_parse);
6140 Set_Node_Length(place, 1);
6142 src = NEXTOPER(place);
6143 FILL_ADVANCE_NODE(place, op);
6144 Zero(src, offset, regnode);
6148 - regtail - set the next-pointer at the end of a node chain of p to val.
6149 - SEE ALSO: regtail_study
6151 /* TODO: All three parms should be const */
6153 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6156 register regnode *scan;
6158 PERL_UNUSED_ARG(depth);
6160 GET_RE_DEBUG_FLAGS_DECL;
6165 /* Find last node. */
6168 regnode * const temp = regnext(scan);
6170 SV * const mysv=sv_newmortal();
6171 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6172 regprop(RExC_rx, mysv, scan);
6173 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6174 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6181 if (reg_off_by_arg[OP(scan)]) {
6182 ARG_SET(scan, val - scan);
6185 NEXT_OFF(scan) = val - scan;
6191 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6192 - Look for optimizable sequences at the same time.
6193 - currently only looks for EXACT chains.
6195 This is expermental code. The idea is to use this routine to perform
6196 in place optimizations on branches and groups as they are constructed,
6197 with the long term intention of removing optimization from study_chunk so
6198 that it is purely analytical.
6200 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6201 to control which is which.
6204 /* TODO: All four parms should be const */
6207 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6210 register regnode *scan;
6212 #ifdef EXPERIMENTAL_INPLACESCAN
6216 GET_RE_DEBUG_FLAGS_DECL;
6222 /* Find last node. */
6226 regnode * const temp = regnext(scan);
6227 #ifdef EXPERIMENTAL_INPLACESCAN
6228 if (PL_regkind[OP(scan)] == EXACT)
6229 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6237 if( exact == PSEUDO )
6239 else if ( exact != OP(scan) )
6248 SV * const mysv=sv_newmortal();
6249 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6250 regprop(RExC_rx, mysv, scan);
6251 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6252 SvPV_nolen_const(mysv),
6254 REG_NODE_NUM(scan));
6261 SV * const mysv_val=sv_newmortal();
6262 DEBUG_PARSE_MSG("");
6263 regprop(RExC_rx, mysv_val, val);
6264 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6265 SvPV_nolen_const(mysv_val),
6270 if (reg_off_by_arg[OP(scan)]) {
6271 ARG_SET(scan, val - scan);
6274 NEXT_OFF(scan) = val - scan;
6282 - regcurly - a little FSA that accepts {\d+,?\d*}
6285 S_regcurly(register const char *s)
6304 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6307 Perl_regdump(pTHX_ const regexp *r)
6311 SV * const sv = sv_newmortal();
6313 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6315 /* Header fields of interest. */
6316 if (r->anchored_substr)
6317 PerlIO_printf(Perl_debug_log,
6318 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
6320 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
6321 SvPVX_const(r->anchored_substr),
6323 SvTAIL(r->anchored_substr) ? "$" : "",
6324 (IV)r->anchored_offset);
6325 else if (r->anchored_utf8)
6326 PerlIO_printf(Perl_debug_log,
6327 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
6329 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
6330 SvPVX_const(r->anchored_utf8),
6332 SvTAIL(r->anchored_utf8) ? "$" : "",
6333 (IV)r->anchored_offset);
6334 if (r->float_substr)
6335 PerlIO_printf(Perl_debug_log,
6336 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6338 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6339 SvPVX_const(r->float_substr),
6341 SvTAIL(r->float_substr) ? "$" : "",
6342 (IV)r->float_min_offset, (UV)r->float_max_offset);
6343 else if (r->float_utf8)
6344 PerlIO_printf(Perl_debug_log,
6345 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6347 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6348 SvPVX_const(r->float_utf8),
6350 SvTAIL(r->float_utf8) ? "$" : "",
6351 (IV)r->float_min_offset, (UV)r->float_max_offset);
6352 if (r->check_substr || r->check_utf8)
6353 PerlIO_printf(Perl_debug_log,
6354 r->check_substr == r->float_substr
6355 && r->check_utf8 == r->float_utf8
6356 ? "(checking floating" : "(checking anchored");
6357 if (r->reganch & ROPT_NOSCAN)
6358 PerlIO_printf(Perl_debug_log, " noscan");
6359 if (r->reganch & ROPT_CHECK_ALL)
6360 PerlIO_printf(Perl_debug_log, " isall");
6361 if (r->check_substr || r->check_utf8)
6362 PerlIO_printf(Perl_debug_log, ") ");
6364 if (r->regstclass) {
6365 regprop(r, sv, r->regstclass);
6366 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6368 if (r->reganch & ROPT_ANCH) {
6369 PerlIO_printf(Perl_debug_log, "anchored");
6370 if (r->reganch & ROPT_ANCH_BOL)
6371 PerlIO_printf(Perl_debug_log, "(BOL)");
6372 if (r->reganch & ROPT_ANCH_MBOL)
6373 PerlIO_printf(Perl_debug_log, "(MBOL)");
6374 if (r->reganch & ROPT_ANCH_SBOL)
6375 PerlIO_printf(Perl_debug_log, "(SBOL)");
6376 if (r->reganch & ROPT_ANCH_GPOS)
6377 PerlIO_printf(Perl_debug_log, "(GPOS)");
6378 PerlIO_putc(Perl_debug_log, ' ');
6380 if (r->reganch & ROPT_GPOS_SEEN)
6381 PerlIO_printf(Perl_debug_log, "GPOS ");
6382 if (r->reganch & ROPT_SKIP)
6383 PerlIO_printf(Perl_debug_log, "plus ");
6384 if (r->reganch & ROPT_IMPLICIT)
6385 PerlIO_printf(Perl_debug_log, "implicit ");
6386 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6387 if (r->reganch & ROPT_EVAL_SEEN)
6388 PerlIO_printf(Perl_debug_log, "with eval ");
6389 PerlIO_printf(Perl_debug_log, "\n");
6391 PERL_UNUSED_CONTEXT;
6393 #endif /* DEBUGGING */
6397 - regprop - printable representation of opcode
6400 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6406 sv_setpvn(sv, "", 0);
6407 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6408 /* It would be nice to FAIL() here, but this may be called from
6409 regexec.c, and it would be hard to supply pRExC_state. */
6410 Perl_croak(aTHX_ "Corrupted regexp opcode");
6411 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6413 k = PL_regkind[OP(o)];
6416 SV * const dsv = sv_2mortal(newSVpvs(""));
6417 /* Using is_utf8_string() is a crude hack but it may
6418 * be the best for now since we have no flag "this EXACTish
6419 * node was UTF-8" --jhi */
6420 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6421 const char * const s = do_utf8 ?
6422 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6423 UNI_DISPLAY_REGEX) :
6425 const int len = do_utf8 ?
6428 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6432 } else if (k == TRIE) {
6433 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6434 /* print the details of the trie in dumpuntil instead, as
6435 * prog->data isn't available here */
6436 } else if (k == CURLY) {
6437 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6438 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6439 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6441 else if (k == WHILEM && o->flags) /* Ordinal/of */
6442 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6443 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6444 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6445 else if (k == LOGICAL)
6446 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6447 else if (k == ANYOF) {
6448 int i, rangestart = -1;
6449 const U8 flags = ANYOF_FLAGS(o);
6451 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6452 static const char * const anyofs[] = {
6485 if (flags & ANYOF_LOCALE)
6486 sv_catpvs(sv, "{loc}");
6487 if (flags & ANYOF_FOLD)
6488 sv_catpvs(sv, "{i}");
6489 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6490 if (flags & ANYOF_INVERT)
6492 for (i = 0; i <= 256; i++) {
6493 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6494 if (rangestart == -1)
6496 } else if (rangestart != -1) {
6497 if (i <= rangestart + 3)
6498 for (; rangestart < i; rangestart++)
6499 put_byte(sv, rangestart);
6501 put_byte(sv, rangestart);
6503 put_byte(sv, i - 1);
6509 if (o->flags & ANYOF_CLASS)
6510 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6511 if (ANYOF_CLASS_TEST(o,i))
6512 sv_catpv(sv, anyofs[i]);
6514 if (flags & ANYOF_UNICODE)
6515 sv_catpvs(sv, "{unicode}");
6516 else if (flags & ANYOF_UNICODE_ALL)
6517 sv_catpvs(sv, "{unicode_all}");
6521 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6525 U8 s[UTF8_MAXBYTES_CASE+1];
6527 for (i = 0; i <= 256; i++) { /* just the first 256 */
6528 uvchr_to_utf8(s, i);
6530 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6531 if (rangestart == -1)
6533 } else if (rangestart != -1) {
6534 if (i <= rangestart + 3)
6535 for (; rangestart < i; rangestart++) {
6536 const U8 * const e = uvchr_to_utf8(s,rangestart);
6538 for(p = s; p < e; p++)
6542 const U8 *e = uvchr_to_utf8(s,rangestart);
6544 for (p = s; p < e; p++)
6547 e = uvchr_to_utf8(s, i-1);
6548 for (p = s; p < e; p++)
6555 sv_catpvs(sv, "..."); /* et cetera */
6559 char *s = savesvpv(lv);
6560 char * const origs = s;
6562 while (*s && *s != '\n')
6566 const char * const t = ++s;
6584 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6586 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6587 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6589 PERL_UNUSED_CONTEXT;
6590 PERL_UNUSED_ARG(sv);
6592 PERL_UNUSED_ARG(prog);
6593 #endif /* DEBUGGING */
6597 Perl_re_intuit_string(pTHX_ regexp *prog)
6598 { /* Assume that RE_INTUIT is set */
6600 GET_RE_DEBUG_FLAGS_DECL;
6601 PERL_UNUSED_CONTEXT;
6605 const char * const s = SvPV_nolen_const(prog->check_substr
6606 ? prog->check_substr : prog->check_utf8);
6608 if (!PL_colorset) reginitcolors();
6609 PerlIO_printf(Perl_debug_log,
6610 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6612 prog->check_substr ? "" : "utf8 ",
6613 PL_colors[5],PL_colors[0],
6616 (strlen(s) > 60 ? "..." : ""));
6619 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6623 Perl_pregfree(pTHX_ struct regexp *r)
6627 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6629 GET_RE_DEBUG_FLAGS_DECL;
6631 if (!r || (--r->refcnt > 0))
6633 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
6634 const char * const s = (r->reganch & ROPT_UTF8)
6635 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6636 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6637 const int len = SvCUR(dsv);
6640 PerlIO_printf(Perl_debug_log,
6641 "%sFreeing REx:%s %s%*.*s%s%s\n",
6642 PL_colors[4],PL_colors[5],PL_colors[0],
6645 len > 60 ? "..." : "");
6648 /* gcov results gave these as non-null 100% of the time, so there's no
6649 optimisation in checking them before calling Safefree */
6650 Safefree(r->precomp);
6651 Safefree(r->offsets); /* 20010421 MJD */
6652 RX_MATCH_COPY_FREE(r);
6653 #ifdef PERL_OLD_COPY_ON_WRITE
6655 SvREFCNT_dec(r->saved_copy);
6658 if (r->anchored_substr)
6659 SvREFCNT_dec(r->anchored_substr);
6660 if (r->anchored_utf8)
6661 SvREFCNT_dec(r->anchored_utf8);
6662 if (r->float_substr)
6663 SvREFCNT_dec(r->float_substr);
6665 SvREFCNT_dec(r->float_utf8);
6666 Safefree(r->substrs);
6669 int n = r->data->count;
6670 PAD* new_comppad = NULL;
6675 /* If you add a ->what type here, update the comment in regcomp.h */
6676 switch (r->data->what[n]) {
6678 SvREFCNT_dec((SV*)r->data->data[n]);
6681 Safefree(r->data->data[n]);
6684 new_comppad = (AV*)r->data->data[n];
6687 if (new_comppad == NULL)
6688 Perl_croak(aTHX_ "panic: pregfree comppad");
6689 PAD_SAVE_LOCAL(old_comppad,
6690 /* Watch out for global destruction's random ordering. */
6691 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6694 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6697 op_free((OP_4tree*)r->data->data[n]);
6699 PAD_RESTORE_LOCAL(old_comppad);
6700 SvREFCNT_dec((SV*)new_comppad);
6706 { /* Aho Corasick add-on structure for a trie node.
6707 Used in stclass optimization only */
6709 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6711 refcount = --aho->refcount;
6714 Safefree(aho->states);
6715 Safefree(aho->fail);
6716 aho->trie=NULL; /* not necessary to free this as it is
6717 handled by the 't' case */
6718 Safefree(r->data->data[n]); /* do this last!!!! */
6719 Safefree(r->regstclass);
6725 /* trie structure. */
6727 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6729 refcount = --trie->refcount;
6732 Safefree(trie->charmap);
6733 if (trie->widecharmap)
6734 SvREFCNT_dec((SV*)trie->widecharmap);
6735 Safefree(trie->states);
6736 Safefree(trie->trans);
6738 Safefree(trie->bitmap);
6740 Safefree(trie->wordlen);
6744 SvREFCNT_dec((SV*)trie->words);
6745 if (trie->revcharmap)
6746 SvREFCNT_dec((SV*)trie->revcharmap);
6749 Safefree(r->data->data[n]); /* do this last!!!! */
6754 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6757 Safefree(r->data->what);
6760 Safefree(r->startp);
6765 #ifndef PERL_IN_XSUB_RE
6767 - regnext - dig the "next" pointer out of a node
6770 Perl_regnext(pTHX_ register regnode *p)
6773 register I32 offset;
6775 if (p == &PL_regdummy)
6778 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6787 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6790 STRLEN l1 = strlen(pat1);
6791 STRLEN l2 = strlen(pat2);
6794 const char *message;
6800 Copy(pat1, buf, l1 , char);
6801 Copy(pat2, buf + l1, l2 , char);
6802 buf[l1 + l2] = '\n';
6803 buf[l1 + l2 + 1] = '\0';
6805 /* ANSI variant takes additional second argument */
6806 va_start(args, pat2);
6810 msv = vmess(buf, &args);
6812 message = SvPV_const(msv,l1);
6815 Copy(message, buf, l1 , char);
6816 buf[l1-1] = '\0'; /* Overwrite \n */
6817 Perl_croak(aTHX_ "%s", buf);
6820 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6822 #ifndef PERL_IN_XSUB_RE
6824 Perl_save_re_context(pTHX)
6828 struct re_save_state *state;
6830 SAVEVPTR(PL_curcop);
6831 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6833 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6834 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6835 SSPUSHINT(SAVEt_RE_STATE);
6837 Copy(&PL_reg_state, state, 1, struct re_save_state);
6839 PL_reg_start_tmp = 0;
6840 PL_reg_start_tmpl = 0;
6841 PL_reg_oldsaved = NULL;
6842 PL_reg_oldsavedlen = 0;
6844 PL_reg_leftiter = 0;
6845 PL_reg_poscache = NULL;
6846 PL_reg_poscache_size = 0;
6847 #ifdef PERL_OLD_COPY_ON_WRITE
6851 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6853 const REGEXP * const rx = PM_GETRE(PL_curpm);
6856 for (i = 1; i <= rx->nparens; i++) {
6857 char digits[TYPE_CHARS(long)];
6858 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6859 GV *const *const gvp
6860 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6863 GV * const gv = *gvp;
6864 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6874 clear_re(pTHX_ void *r)
6877 ReREFCNT_dec((regexp *)r);
6883 S_put_byte(pTHX_ SV *sv, int c)
6885 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6886 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6887 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6888 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6890 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6893 #define CLEAR_OPTSTART \
6894 if (optstart) STMT_START { \
6895 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6899 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6901 STATIC const regnode *
6902 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6903 const regnode *last, SV* sv, I32 l)
6906 register U8 op = EXACT; /* Arbitrary non-END op. */
6907 register const regnode *next;
6908 const regnode *optstart= NULL;
6909 GET_RE_DEBUG_FLAGS_DECL;
6911 while (op != END && (!last || node < last)) {
6912 /* While that wasn't END last time... */
6918 next = regnext((regnode *)node);
6921 if (OP(node) == OPTIMIZED) {
6922 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
6929 regprop(r, sv, node);
6930 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6931 (int)(2*l + 1), "", SvPVX_const(sv));
6933 if (OP(node) != OPTIMIZED) {
6934 if (next == NULL) /* Next ptr. */
6935 PerlIO_printf(Perl_debug_log, "(0)");
6937 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6938 (void)PerlIO_putc(Perl_debug_log, '\n');
6942 if (PL_regkind[(U8)op] == BRANCHJ) {
6945 register const regnode *nnode = (OP(next) == LONGJMP
6946 ? regnext((regnode *)next)
6948 if (last && nnode > last)
6950 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6953 else if (PL_regkind[(U8)op] == BRANCH) {
6955 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6957 else if ( PL_regkind[(U8)op] == TRIE ) {
6958 const I32 n = ARG(node);
6959 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6960 const I32 arry_len = av_len(trie->words)+1;
6962 PerlIO_printf(Perl_debug_log,
6963 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
6967 TRIE_WORDCOUNT(trie),
6968 (int)TRIE_CHARCOUNT(trie),
6969 trie->uniquecharcount,
6970 (IV)TRIE_LASTSTATE(trie)-1,
6977 sv_setpvn(sv, "", 0);
6978 for (i = 0; i <= 256; i++) {
6979 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6980 if (rangestart == -1)
6982 } else if (rangestart != -1) {
6983 if (i <= rangestart + 3)
6984 for (; rangestart < i; rangestart++)
6985 put_byte(sv, rangestart);
6987 put_byte(sv, rangestart);
6989 put_byte(sv, i - 1);
6994 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
6996 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
6998 for (word_idx=0; word_idx < arry_len; word_idx++) {
6999 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
7001 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
7004 SvPV_nolen_const(*elem_ptr),
7010 node = NEXTOPER(node);
7011 node += regarglen[(U8)op];
7014 else if ( op == CURLY) { /* "next" might be very big: optimizer */
7015 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7016 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7018 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7020 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7023 else if ( op == PLUS || op == STAR) {
7024 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
7026 else if (op == ANYOF) {
7027 /* arglen 1 + class block */
7028 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7029 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7030 node = NEXTOPER(node);
7032 else if (PL_regkind[(U8)op] == EXACT) {
7033 /* Literal string, where present. */
7034 node += NODE_SZ_STR(node) - 1;
7035 node = NEXTOPER(node);
7038 node = NEXTOPER(node);
7039 node += regarglen[(U8)op];
7041 if (op == CURLYX || op == OPEN)
7043 else if (op == WHILEM)
7050 #endif /* DEBUGGING */
7054 * c-indentation-style: bsd
7056 * indent-tabs-mode: t
7059 * ex: set ts=8 sts=4 sw=4 noet: