5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
176 /* Length of a variant. */
178 typedef struct scan_data_t {
184 I32 last_end; /* min value, <0 unless valid. */
187 SV **longest; /* Either &l_fixed, or &l_float. */
191 I32 offset_float_min;
192 I32 offset_float_max;
196 struct regnode_charclass_class *start_class;
200 * Forward declarations for pregcomp()'s friends.
203 static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
207 #define SF_BEFORE_SEOL 0x0001
208 #define SF_BEFORE_MEOL 0x0002
209 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
213 # define SF_FIX_SHIFT_EOL (0+2)
214 # define SF_FL_SHIFT_EOL (0+4)
216 # define SF_FIX_SHIFT_EOL (+2)
217 # define SF_FL_SHIFT_EOL (+4)
220 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
225 #define SF_IS_INF 0x0040
226 #define SF_HAS_PAR 0x0080
227 #define SF_IN_PAR 0x0100
228 #define SF_HAS_EVAL 0x0200
229 #define SCF_DO_SUBSTR 0x0400
230 #define SCF_DO_STCLASS_AND 0x0800
231 #define SCF_DO_STCLASS_OR 0x1000
232 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
233 #define SCF_WHILEM_VISITED_POS 0x2000
235 #define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
237 #define UTF (RExC_utf8 != 0)
238 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
239 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
241 #define OOB_UNICODE 12345678
242 #define OOB_NAMEDCLASS -1
244 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248 /* length of regex to show in messages that don't mark a position within */
249 #define RegexLengthToShowInErrorMessages 127
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
256 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
257 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
259 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 #define FAIL(msg) STMT_START { \
267 const char *ellipses = ""; \
268 IV len = RExC_end - RExC_precomp; \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
284 #define Simple_vFAIL(m) STMT_START { \
285 const IV offset = RExC_parse - RExC_precomp; \
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
293 #define vFAIL(m) STMT_START { \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
300 * Like Simple_vFAIL(), but accepts two arguments.
302 #define Simple_vFAIL2(m,a1) STMT_START { \
303 const IV offset = RExC_parse - RExC_precomp; \
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
311 #define vFAIL2(m,a1) STMT_START { \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
319 * Like Simple_vFAIL(), but accepts three arguments.
321 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
322 const IV offset = RExC_parse - RExC_precomp; \
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
330 #define vFAIL3(m,a1,a2) STMT_START { \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
337 * Like Simple_vFAIL(), but accepts four arguments.
339 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
340 const IV offset = RExC_parse - RExC_precomp; \
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
345 #define vWARN(loc,m) STMT_START { \
346 const IV offset = loc - RExC_precomp; \
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351 #define vWARNdep(loc,m) STMT_START { \
352 const IV offset = loc - RExC_precomp; \
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359 #define vWARN2(loc, m, a1) STMT_START { \
360 const IV offset = loc - RExC_precomp; \
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365 #define vWARN3(loc, m, a1, a2) STMT_START { \
366 const IV offset = loc - RExC_precomp; \
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 /* Allow for side effects in s */
385 #define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389 /* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
393 * Position is 1 indexed.
396 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
399 __LINE__, (node), (int)(byte))); \
401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
403 RExC_offsets[2*(node)-1] = (byte); \
408 #define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
412 #define Set_Node_Length_To_R(node,len) STMT_START { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
415 __LINE__, (int)(node), (int)(len))); \
417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
419 RExC_offsets[2*(node)] = (len); \
424 #define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427 #define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
430 /* Get offsets and lengths */
431 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
434 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
440 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441 #define EXPERIMENTAL_INPLACESCAN
444 static void clear_re(pTHX_ void *r);
446 /* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
451 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
457 SvSetMagicSV(*data->longest, data->last_found);
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
464 data->flags &= ~SF_FIX_BEFORE_EOL;
467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
468 data->offset_float_max = (l
469 ? data->last_start_max
470 : data->pos_min + data->pos_delta);
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
473 if (data->flags & SF_BEFORE_EOL)
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
477 data->flags &= ~SF_FL_BEFORE_EOL;
480 SvCUR_set(data->last_found, 0);
482 SV * const sv = data->last_found;
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
490 data->flags &= ~SF_BEFORE_EOL;
493 /* Can match anything (initialization) */
495 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
497 ANYOF_CLASS_ZERO(cl);
498 ANYOF_BITMAP_SETALL(cl);
499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
501 cl->flags |= ANYOF_LOCALE;
504 /* Can match anything (initialization) */
506 S_cl_is_anything(const struct regnode_charclass_class *cl)
510 for (value = 0; value <= ANYOF_MAX; value += 2)
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
520 /* Can match anything (initialization) */
522 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 Zero(cl, 1, struct regnode_charclass_class);
526 cl_anything(pRExC_state, cl);
530 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 Zero(cl, 1, struct regnode_charclass_class);
534 cl_anything(pRExC_state, cl);
536 cl->flags |= ANYOF_LOCALE;
539 /* 'And' a given class with another one. Can create false positives */
540 /* We assume that cl is not inverted */
542 S_cl_and(struct regnode_charclass_class *cl,
543 const struct regnode_charclass_class *and_with)
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
570 cl->flags &= ~ANYOF_UNICODE_ALL;
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
573 cl->flags &= ~ANYOF_UNICODE;
576 /* 'OR' a given class with another one. Can create false positives */
577 /* We assume that cl is not inverted */
579 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
581 if (or_with->flags & ANYOF_INVERT) {
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
600 cl_anything(pRExC_state, cl);
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && (!(or_with->flags & ANYOF_FOLD)
606 || (cl->flags & ANYOF_FOLD)) ) {
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
619 cl_anything(pRExC_state, cl);
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
638 make_trie(startbranch,first,last,tail,flags,depth)
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
648 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
650 A trie is an N'ary tree where the branches are determined by digital
651 decomposition of the key. IE, at the root node you look up the 1st character and
652 follow that branch repeat until you find the end of the branches. Nodes can be
653 marked as "accepting" meaning they represent a complete word. Eg:
657 would convert into the following structure. Numbers represent states, letters
658 following numbers represent valid transitions on the letter from that state, if
659 the number is in square brackets it represents an accepting state, otherwise it
660 will be in parenthesis.
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
666 (1) +-i->(6)-+-s->[7]
668 +-s->(3)-+-h->(4)-+-e->[5]
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
672 This shows that when matching against the string 'hers' we will begin at state 1
673 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676 single traverse. We store a mapping from accepting to state to which word was
677 matched, and then when we have multiple possibilities we try to complete the
678 rest of the regex in the order in which they occured in the alternation.
680 The only prior NFA like behaviour that would be changed by the TRIE support is
681 the silent ignoring of duplicate alternations which are of the form:
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
685 Thus EVAL blocks follwing a trie may be called a different number of times with
686 and without the optimisation. With the optimisations dupes will be silently
687 ignored. This inconsistant behaviour of EVAL type nodes is well established as
688 the following demonstrates:
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
692 which prints out 'word' three times, but
694 'words'=~/(word|word|word)(?{ print $1 })S/
696 which doesnt print it out at all. This is due to other optimisations kicking in.
698 Example of what happens on a structural level:
700 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
702 1: CURLYM[1] {1,32767}(18)
713 This would be optimizable with startbranch=5, first=5, last=16, tail=16
714 and should turn into:
716 1: CURLYM[1] {1,32767}(18)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
726 Cases where tail != last would be like /(?foo|bar)baz/:
736 which would be optimizable with startbranch=1, first=1, last=7, tail=8
737 and would end up looking like:
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
747 d = uvuni_to_utf8_flags(d, uv, 0);
749 is the recommended Unicode-aware way of saying
754 #define TRIE_STORE_REVCHAR \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
760 #define TRIE_READ_CHAR STMT_START { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
785 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
790 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
801 #define TRIE_LIST_NEW(state) STMT_START { \
802 Newxz( trie->states[ state ].trans.list, \
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
808 #define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
812 trie->wordlen[ curword ] = wordlen; \
813 trie->states[ state ].wordnum = ++curword; \
815 /* store the word for dumping */ \
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
820 tmp = newSVpvn( "", 0 ); \
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
825 NOOP; /* It's a dupe. So ignore it. */ \
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
835 These routines dump out a trie in a somewhat readable format.
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
852 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
855 GET_RE_DEBUG_FLAGS_DECL;
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
875 const U32 base = trie->states[ state ].trans.base;
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
909 PerlIO_printf( Perl_debug_log, "]");
912 PerlIO_printf( Perl_debug_log, "\n" );
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
923 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
932 for( state=1 ; state < next_alloc ; state ++ ) {
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
964 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
968 GET_RE_DEBUG_FLAGS_DECL;
971 print out the table precompression so that we can do a visual check
972 that they are identical.
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
990 PerlIO_printf( Perl_debug_log, "\n" );
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1013 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1023 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1025 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
1051 U32 base = trie->states[ 1 ].trans.base;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1055 GET_RE_DEBUG_FLAGS_DECL;
1057 PERL_UNUSED_ARG(depth);
1061 ARG_SET( stclass, data_slot );
1062 Newxz( aho, 1, reg_ac_data );
1063 RExC_rx->data->data[ data_slot ] = (void*)aho;
1065 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1066 (trie->laststate+1)*sizeof(reg_trie_state));
1067 Newxz( q, numstates, U32);
1068 Newxz( aho->fail, numstates, U32 );
1071 fail[ 0 ] = fail[ 1 ] = 1;
1073 for ( charid = 0; charid < ucharcount ; charid++ ) {
1074 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1076 q[ q_write ] = newstate;
1077 /* set to point at the root */
1078 fail[ q[ q_write++ ] ]=1;
1081 while ( q_read < q_write) {
1082 const U32 cur = q[ q_read++ % numstates ];
1083 base = trie->states[ cur ].trans.base;
1085 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1086 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1088 U32 fail_state = cur;
1091 fail_state = fail[ fail_state ];
1092 fail_base = aho->states[ fail_state ].trans.base;
1093 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1095 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1096 fail[ ch_state ] = fail_state;
1097 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1099 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1101 q[ q_write++ % numstates] = ch_state;
1106 DEBUG_TRIE_COMPILE_MORE_r({
1107 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1108 for( q_read=2; q_read<numstates; q_read++ ) {
1109 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1111 PerlIO_printf(Perl_debug_log, "\n");
1114 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1120 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1123 /* first pass, loop through and scan words */
1124 reg_trie_data *trie;
1126 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1131 /* we just use folder as a flag in utf8 */
1132 const U8 * const folder = ( flags == EXACTF
1134 : ( flags == EXACTFL
1140 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1141 SV *re_trie_maxbuff;
1143 /* these are only used during construction but are useful during
1144 * debugging so we store them in the struct when debugging.
1145 * Wordcount is actually superfluous in debugging as we have
1146 * (AV*)trie->words to use for it, but that's not available when
1147 * not debugging... We could make the macro use the AV during
1148 * debugging though...
1150 U16 trie_wordcount=0;
1151 STRLEN trie_charcount=0;
1152 /*U32 trie_laststate=0;*/
1153 AV *trie_revcharmap;
1155 GET_RE_DEBUG_FLAGS_DECL;
1157 PERL_UNUSED_ARG(depth);
1160 Newxz( trie, 1, reg_trie_data );
1162 trie->startstate = 1;
1163 RExC_rx->data->data[ data_slot ] = (void*)trie;
1164 Newxz( trie->charmap, 256, U16 );
1165 if (!(UTF && folder))
1166 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1168 trie->words = newAV();
1170 TRIE_REVCHARMAP(trie) = newAV();
1172 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1173 if (!SvIOK(re_trie_maxbuff)) {
1174 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1177 PerlIO_printf( Perl_debug_log,
1178 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1179 (int)depth * 2 + 2, "",
1180 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1181 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1183 /* -- First loop and Setup --
1185 We first traverse the branches and scan each word to determine if it
1186 contains widechars, and how many unique chars there are, this is
1187 important as we have to build a table with at least as many columns as we
1190 We use an array of integers to represent the character codes 0..255
1191 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1192 native representation of the character value as the key and IV's for the
1195 *TODO* If we keep track of how many times each character is used we can
1196 remap the columns so that the table compression later on is more
1197 efficient in terms of memory by ensuring most common value is in the
1198 middle and the least common are on the outside. IMO this would be better
1199 than a most to least common mapping as theres a decent chance the most
1200 common letter will share a node with the least common, meaning the node
1201 will not be compressable. With a middle is most common approach the worst
1202 case is when we have the least common nodes twice.
1206 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1207 regnode * const noper = NEXTOPER( cur );
1208 const U8 *uc = (U8*)STRING( noper );
1209 const U8 * const e = uc + STR_LEN( noper );
1211 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1212 const U8 *scan = (U8*)NULL;
1213 U32 wordlen = 0; /* required init */
1216 TRIE_WORDCOUNT(trie)++;
1217 if (OP(noper) == NOTHING) {
1222 TRIE_BITMAP_SET(trie,*uc);
1223 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1225 for ( ; uc < e ; uc += len ) {
1226 TRIE_CHARCOUNT(trie)++;
1230 if ( !trie->charmap[ uvc ] ) {
1231 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1233 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1238 if ( !trie->widecharmap )
1239 trie->widecharmap = newHV();
1241 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1244 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1246 if ( !SvTRUE( *svpp ) ) {
1247 sv_setiv( *svpp, ++trie->uniquecharcount );
1252 if( cur == first ) {
1255 } else if (chars < trie->minlen) {
1257 } else if (chars > trie->maxlen) {
1261 } /* end first pass */
1262 DEBUG_TRIE_COMPILE_r(
1263 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1264 (int)depth * 2 + 2,"",
1265 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1266 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1267 (int)trie->minlen, (int)trie->maxlen )
1269 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
1272 We now know what we are dealing with in terms of unique chars and
1273 string sizes so we can calculate how much memory a naive
1274 representation using a flat table will take. If it's over a reasonable
1275 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1276 conservative but potentially much slower representation using an array
1279 At the end we convert both representations into the same compressed
1280 form that will be used in regexec.c for matching with. The latter
1281 is a form that cannot be used to construct with but has memory
1282 properties similar to the list form and access properties similar
1283 to the table form making it both suitable for fast searches and
1284 small enough that its feasable to store for the duration of a program.
1286 See the comment in the code where the compressed table is produced
1287 inplace from the flat tabe representation for an explanation of how
1288 the compression works.
1293 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1295 Second Pass -- Array Of Lists Representation
1297 Each state will be represented by a list of charid:state records
1298 (reg_trie_trans_le) the first such element holds the CUR and LEN
1299 points of the allocated array. (See defines above).
1301 We build the initial structure using the lists, and then convert
1302 it into the compressed table form which allows faster lookups
1303 (but cant be modified once converted).
1306 STRLEN transcount = 1;
1308 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1312 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1314 regnode * const noper = NEXTOPER( cur );
1315 U8 *uc = (U8*)STRING( noper );
1316 const U8 * const e = uc + STR_LEN( noper );
1317 U32 state = 1; /* required init */
1318 U16 charid = 0; /* sanity init */
1319 U8 *scan = (U8*)NULL; /* sanity init */
1320 STRLEN foldlen = 0; /* required init */
1321 U32 wordlen = 0; /* required init */
1322 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1324 if (OP(noper) != NOTHING) {
1325 for ( ; uc < e ; uc += len ) {
1330 charid = trie->charmap[ uvc ];
1332 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1336 charid=(U16)SvIV( *svpp );
1345 if ( !trie->states[ state ].trans.list ) {
1346 TRIE_LIST_NEW( state );
1348 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1349 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1350 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1355 newstate = next_alloc++;
1356 TRIE_LIST_PUSH( state, charid, newstate );
1361 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1363 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1366 TRIE_HANDLE_WORD(state);
1368 } /* end second pass */
1370 TRIE_LASTSTATE(trie) = next_alloc;
1371 Renew( trie->states, next_alloc, reg_trie_state );
1373 /* and now dump it out before we compress it */
1374 DEBUG_TRIE_COMPILE_MORE_r(
1375 dump_trie_interim_list(trie,next_alloc,depth+1)
1378 Newxz( trie->trans, transcount ,reg_trie_trans );
1385 for( state=1 ; state < next_alloc ; state ++ ) {
1389 DEBUG_TRIE_COMPILE_MORE_r(
1390 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1394 if (trie->states[state].trans.list) {
1395 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1399 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1400 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1401 if ( forid < minid ) {
1403 } else if ( forid > maxid ) {
1407 if ( transcount < tp + maxid - minid + 1) {
1409 Renew( trie->trans, transcount, reg_trie_trans );
1410 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1412 base = trie->uniquecharcount + tp - minid;
1413 if ( maxid == minid ) {
1415 for ( ; zp < tp ; zp++ ) {
1416 if ( ! trie->trans[ zp ].next ) {
1417 base = trie->uniquecharcount + zp - minid;
1418 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1419 trie->trans[ zp ].check = state;
1425 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1426 trie->trans[ tp ].check = state;
1431 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1432 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1433 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1434 trie->trans[ tid ].check = state;
1436 tp += ( maxid - minid + 1 );
1438 Safefree(trie->states[ state ].trans.list);
1441 DEBUG_TRIE_COMPILE_MORE_r(
1442 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1445 trie->states[ state ].trans.base=base;
1447 trie->lasttrans = tp + 1;
1451 Second Pass -- Flat Table Representation.
1453 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1454 We know that we will need Charcount+1 trans at most to store the data
1455 (one row per char at worst case) So we preallocate both structures
1456 assuming worst case.
1458 We then construct the trie using only the .next slots of the entry
1461 We use the .check field of the first entry of the node temporarily to
1462 make compression both faster and easier by keeping track of how many non
1463 zero fields are in the node.
1465 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1468 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1469 number representing the first entry of the node, and state as a
1470 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1471 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1472 are 2 entrys per node. eg:
1480 The table is internally in the right hand, idx form. However as we also
1481 have to deal with the states array which is indexed by nodenum we have to
1482 use TRIE_NODENUM() to convert.
1487 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1489 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1490 next_alloc = trie->uniquecharcount + 1;
1493 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1495 regnode * const noper = NEXTOPER( cur );
1496 const U8 *uc = (U8*)STRING( noper );
1497 const U8 * const e = uc + STR_LEN( noper );
1499 U32 state = 1; /* required init */
1501 U16 charid = 0; /* sanity init */
1502 U32 accept_state = 0; /* sanity init */
1503 U8 *scan = (U8*)NULL; /* sanity init */
1505 STRLEN foldlen = 0; /* required init */
1506 U32 wordlen = 0; /* required init */
1507 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1509 if ( OP(noper) != NOTHING ) {
1510 for ( ; uc < e ; uc += len ) {
1515 charid = trie->charmap[ uvc ];
1517 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1518 charid = svpp ? (U16)SvIV(*svpp) : 0;
1522 if ( !trie->trans[ state + charid ].next ) {
1523 trie->trans[ state + charid ].next = next_alloc;
1524 trie->trans[ state ].check++;
1525 next_alloc += trie->uniquecharcount;
1527 state = trie->trans[ state + charid ].next;
1529 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1531 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1534 accept_state = TRIE_NODENUM( state );
1535 TRIE_HANDLE_WORD(accept_state);
1537 } /* end second pass */
1539 /* and now dump it out before we compress it */
1540 DEBUG_TRIE_COMPILE_MORE_r(
1541 dump_trie_interim_table(trie,next_alloc,depth+1)
1546 * Inplace compress the table.*
1548 For sparse data sets the table constructed by the trie algorithm will
1549 be mostly 0/FAIL transitions or to put it another way mostly empty.
1550 (Note that leaf nodes will not contain any transitions.)
1552 This algorithm compresses the tables by eliminating most such
1553 transitions, at the cost of a modest bit of extra work during lookup:
1555 - Each states[] entry contains a .base field which indicates the
1556 index in the state[] array wheres its transition data is stored.
1558 - If .base is 0 there are no valid transitions from that node.
1560 - If .base is nonzero then charid is added to it to find an entry in
1563 -If trans[states[state].base+charid].check!=state then the
1564 transition is taken to be a 0/Fail transition. Thus if there are fail
1565 transitions at the front of the node then the .base offset will point
1566 somewhere inside the previous nodes data (or maybe even into a node
1567 even earlier), but the .check field determines if the transition is
1570 The following process inplace converts the table to the compressed
1571 table: We first do not compress the root node 1,and mark its all its
1572 .check pointers as 1 and set its .base pointer as 1 as well. This
1573 allows to do a DFA construction from the compressed table later, and
1574 ensures that any .base pointers we calculate later are greater than
1577 - We set 'pos' to indicate the first entry of the second node.
1579 - We then iterate over the columns of the node, finding the first and
1580 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1581 and set the .check pointers accordingly, and advance pos
1582 appropriately and repreat for the next node. Note that when we copy
1583 the next pointers we have to convert them from the original
1584 NODEIDX form to NODENUM form as the former is not valid post
1587 - If a node has no transitions used we mark its base as 0 and do not
1588 advance the pos pointer.
1590 - If a node only has one transition we use a second pointer into the
1591 structure to fill in allocated fail transitions from other states.
1592 This pointer is independent of the main pointer and scans forward
1593 looking for null transitions that are allocated to a state. When it
1594 finds one it writes the single transition into the "hole". If the
1595 pointer doesnt find one the single transition is appeneded as normal.
1597 - Once compressed we can Renew/realloc the structures to release the
1600 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1601 specifically Fig 3.47 and the associated pseudocode.
1605 const U32 laststate = TRIE_NODENUM( next_alloc );
1608 TRIE_LASTSTATE(trie) = laststate;
1610 for ( state = 1 ; state < laststate ; state++ ) {
1612 const U32 stateidx = TRIE_NODEIDX( state );
1613 const U32 o_used = trie->trans[ stateidx ].check;
1614 U32 used = trie->trans[ stateidx ].check;
1615 trie->trans[ stateidx ].check = 0;
1617 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1618 if ( flag || trie->trans[ stateidx + charid ].next ) {
1619 if ( trie->trans[ stateidx + charid ].next ) {
1621 for ( ; zp < pos ; zp++ ) {
1622 if ( ! trie->trans[ zp ].next ) {
1626 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1627 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1628 trie->trans[ zp ].check = state;
1629 if ( ++zp > pos ) pos = zp;
1636 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1638 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1639 trie->trans[ pos ].check = state;
1644 trie->lasttrans = pos + 1;
1645 Renew( trie->states, laststate + 1, reg_trie_state);
1646 DEBUG_TRIE_COMPILE_MORE_r(
1647 PerlIO_printf( Perl_debug_log,
1648 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1649 (int)depth * 2 + 2,"",
1650 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1653 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1656 } /* end table compress */
1658 /* resize the trans array to remove unused space */
1659 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1661 /* and now dump out the compressed format */
1662 DEBUG_TRIE_COMPILE_r(
1663 dump_trie(trie,depth+1)
1666 { /* Modify the program and insert the new TRIE node*/
1668 U8 nodetype =(U8)(flags & 0xFF);
1675 This means we convert either the first branch or the first Exact,
1676 depending on whether the thing following (in 'last') is a branch
1677 or not and whther first is the startbranch (ie is it a sub part of
1678 the alternation or is it the whole thing.)
1679 Assuming its a sub part we conver the EXACT otherwise we convert
1680 the whole branch sequence, including the first.
1682 /* Find the node we are going to overwrite */
1683 if ( first == startbranch && OP( last ) != BRANCH ) {
1684 /* whole branch chain */
1687 const regnode *nop = NEXTOPER( convert );
1688 mjd_offset= Node_Offset((nop));
1689 mjd_nodelen= Node_Length((nop));
1692 /* branch sub-chain */
1693 convert = NEXTOPER( first );
1694 NEXT_OFF( first ) = (U16)(last - first);
1696 mjd_offset= Node_Offset((convert));
1697 mjd_nodelen= Node_Length((convert));
1701 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1702 (int)depth * 2 + 2, "",
1703 mjd_offset,mjd_nodelen)
1706 /* But first we check to see if there is a common prefix we can
1707 split out as an EXACT and put in front of the TRIE node. */
1708 trie->startstate= 1;
1709 if ( trie->bitmap && !trie->widecharmap ) {
1712 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1713 (int)depth * 2 + 2, "",
1714 TRIE_LASTSTATE(trie))
1716 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1720 const U32 base = trie->states[ state ].trans.base;
1722 if ( trie->states[state].wordnum )
1725 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1726 if ( ( base + ofs >= trie->uniquecharcount ) &&
1727 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1728 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1730 if ( ++count > 1 ) {
1731 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1732 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1733 if ( state == 1 ) break;
1735 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1737 PerlIO_printf(Perl_debug_log,
1738 "%*sNew Start State=%"UVuf" Class: [",
1739 (int)depth * 2 + 2, "",
1742 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1743 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1745 TRIE_BITMAP_SET(trie,*ch);
1747 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1749 PerlIO_printf(Perl_debug_log, (char*)ch)
1753 TRIE_BITMAP_SET(trie,*ch);
1755 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1756 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1762 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1763 const char *ch = SvPV_nolen_const( *tmp );
1765 PerlIO_printf( Perl_debug_log,
1766 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1767 (int)depth * 2 + 2, "",
1771 OP( convert ) = nodetype;
1772 str=STRING(convert);
1781 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1787 regnode *n = convert+NODE_SZ_STR(convert);
1788 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1789 trie->startstate = state;
1790 trie->minlen -= (state - 1);
1791 trie->maxlen -= (state - 1);
1793 regnode *fix = convert;
1795 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1796 while( ++fix < n ) {
1797 Set_Node_Offset_Length(fix, 0, 0);
1803 NEXT_OFF(convert) = (U16)(tail - convert);
1807 if ( trie->maxlen ) {
1808 OP( convert ) = TRIE;
1809 NEXT_OFF( convert ) = (U16)(tail - convert);
1810 ARG_SET( convert, data_slot );
1812 /* store the type in the flags */
1813 convert->flags = nodetype;
1814 /* XXX We really should free up the resource in trie now, as we wont use them */
1816 /* needed for dumping*/
1818 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1819 regnode *opt = convert;
1820 while (++opt<optimize) {
1821 Set_Node_Offset_Length(opt,0,0);
1823 /* We now need to mark all of the space originally used by the
1824 branches as optimized away. This keeps the dumpuntil from
1825 throwing a wobbly as it doesnt use regnext() to traverse the
1827 We also "fix" the offsets
1829 while( optimize < last ) {
1830 mjd_nodelen += Node_Length((optimize));
1831 OP( optimize ) = OPTIMIZED;
1832 Set_Node_Offset_Length(optimize,0,0);
1835 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1837 } /* end node insert */
1839 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1845 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1846 * These need to be revisited when a newer toolchain becomes available.
1848 #if defined(__sparc64__) && defined(__GNUC__)
1849 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1850 # undef SPARC64_GCC_WORKAROUND
1851 # define SPARC64_GCC_WORKAROUND 1
1855 #define DEBUG_PEEP(str,scan,depth) \
1856 DEBUG_OPTIMISE_r({ \
1857 SV * const mysv=sv_newmortal(); \
1858 regnode *Next = regnext(scan); \
1859 regprop(RExC_rx, mysv, scan); \
1860 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1861 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1862 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1865 #define JOIN_EXACT(scan,min,flags) \
1866 if (PL_regkind[OP(scan)] == EXACT) \
1867 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1870 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1871 /* Merge several consecutive EXACTish nodes into one. */
1872 regnode *n = regnext(scan);
1874 regnode *next = scan + NODE_SZ_STR(scan);
1878 regnode *stop = scan;
1879 GET_RE_DEBUG_FLAGS_DECL;
1881 PERL_UNUSED_ARG(flags);
1882 PERL_UNUSED_ARG(val);
1883 PERL_UNUSED_ARG(depth);
1885 DEBUG_PEEP("join",scan,depth);
1887 /* Skip NOTHING, merge EXACT*. */
1889 ( PL_regkind[OP(n)] == NOTHING ||
1890 (stringok && (OP(n) == OP(scan))))
1892 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1894 if (OP(n) == TAIL || n > next)
1896 if (PL_regkind[OP(n)] == NOTHING) {
1898 DEBUG_PEEP("skip:",n,depth);
1899 NEXT_OFF(scan) += NEXT_OFF(n);
1900 next = n + NODE_STEP_REGNODE;
1907 else if (stringok) {
1908 const int oldl = STR_LEN(scan);
1909 regnode * const nnext = regnext(n);
1911 DEBUG_PEEP("merg",n,depth);
1914 if (oldl + STR_LEN(n) > U8_MAX)
1916 NEXT_OFF(scan) += NEXT_OFF(n);
1917 STR_LEN(scan) += STR_LEN(n);
1918 next = n + NODE_SZ_STR(n);
1919 /* Now we can overwrite *n : */
1920 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1928 #ifdef EXPERIMENTAL_INPLACESCAN
1929 if (flags && !NEXT_OFF(n)) {
1930 DEBUG_PEEP("atch",val,depth);
1931 if (reg_off_by_arg[OP(n)]) {
1932 ARG_SET(n, val - n);
1935 NEXT_OFF(n) = val - n;
1942 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1944 Two problematic code points in Unicode casefolding of EXACT nodes:
1946 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1947 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1953 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1954 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1956 This means that in case-insensitive matching (or "loose matching",
1957 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1958 length of the above casefolded versions) can match a target string
1959 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1960 This would rather mess up the minimum length computation.
1962 What we'll do is to look for the tail four bytes, and then peek
1963 at the preceding two bytes to see whether we need to decrease
1964 the minimum length by four (six minus two).
1966 Thanks to the design of UTF-8, there cannot be false matches:
1967 A sequence of valid UTF-8 bytes cannot be a subsequence of
1968 another valid sequence of UTF-8 bytes.
1971 char * const s0 = STRING(scan), *s, *t;
1972 char * const s1 = s0 + STR_LEN(scan) - 1;
1973 char * const s2 = s1 - 4;
1974 const char t0[] = "\xcc\x88\xcc\x81";
1975 const char * const t1 = t0 + 3;
1978 s < s2 && (t = ninstr(s, s1, t0, t1));
1980 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1981 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1988 n = scan + NODE_SZ_STR(scan);
1990 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1997 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2001 /* REx optimizer. Converts nodes into quickier variants "in place".
2002 Finds fixed substrings. */
2004 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2005 to the position after last scanned or to NULL. */
2010 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2011 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2012 /* scanp: Start here (read-write). */
2013 /* deltap: Write maxlen-minlen here. */
2014 /* last: Stop before this one. */
2017 I32 min = 0, pars = 0, code;
2018 regnode *scan = *scanp, *next;
2020 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2021 int is_inf_internal = 0; /* The studied chunk is infinite */
2022 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2023 scan_data_t data_fake;
2024 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2025 SV *re_trie_maxbuff = NULL;
2027 GET_RE_DEBUG_FLAGS_DECL;
2029 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2032 while (scan && OP(scan) != END && scan < last) {
2033 /* Peephole optimizer: */
2034 DEBUG_PEEP("Peep",scan,depth);
2036 JOIN_EXACT(scan,&min,0);
2038 /* Follow the next-chain of the current node and optimize
2039 away all the NOTHINGs from it. */
2040 if (OP(scan) != CURLYX) {
2041 const int max = (reg_off_by_arg[OP(scan)]
2043 /* I32 may be smaller than U16 on CRAYs! */
2044 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2045 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2049 /* Skip NOTHING and LONGJMP. */
2050 while ((n = regnext(n))
2051 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2052 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2053 && off + noff < max)
2055 if (reg_off_by_arg[OP(scan)])
2058 NEXT_OFF(scan) = off;
2063 /* The principal pseudo-switch. Cannot be a switch, since we
2064 look into several different things. */
2065 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2066 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2067 next = regnext(scan);
2069 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2071 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2072 I32 max1 = 0, min1 = I32_MAX, num = 0;
2073 struct regnode_charclass_class accum;
2074 regnode * const startbranch=scan;
2076 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2077 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2078 if (flags & SCF_DO_STCLASS)
2079 cl_init_zero(pRExC_state, &accum);
2081 while (OP(scan) == code) {
2082 I32 deltanext, minnext, f = 0, fake;
2083 struct regnode_charclass_class this_class;
2086 data_fake.flags = 0;
2088 data_fake.whilem_c = data->whilem_c;
2089 data_fake.last_closep = data->last_closep;
2092 data_fake.last_closep = &fake;
2093 next = regnext(scan);
2094 scan = NEXTOPER(scan);
2096 scan = NEXTOPER(scan);
2097 if (flags & SCF_DO_STCLASS) {
2098 cl_init(pRExC_state, &this_class);
2099 data_fake.start_class = &this_class;
2100 f = SCF_DO_STCLASS_AND;
2102 if (flags & SCF_WHILEM_VISITED_POS)
2103 f |= SCF_WHILEM_VISITED_POS;
2105 /* we suppose the run is continuous, last=next...*/
2106 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2107 next, &data_fake, f,depth+1);
2110 if (max1 < minnext + deltanext)
2111 max1 = minnext + deltanext;
2112 if (deltanext == I32_MAX)
2113 is_inf = is_inf_internal = 1;
2115 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2118 if (data_fake.flags & SF_HAS_EVAL)
2119 data->flags |= SF_HAS_EVAL;
2120 data->whilem_c = data_fake.whilem_c;
2122 if (flags & SCF_DO_STCLASS)
2123 cl_or(pRExC_state, &accum, &this_class);
2124 if (code == SUSPEND)
2127 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2129 if (flags & SCF_DO_SUBSTR) {
2130 data->pos_min += min1;
2131 data->pos_delta += max1 - min1;
2132 if (max1 != min1 || is_inf)
2133 data->longest = &(data->longest_float);
2136 delta += max1 - min1;
2137 if (flags & SCF_DO_STCLASS_OR) {
2138 cl_or(pRExC_state, data->start_class, &accum);
2140 cl_and(data->start_class, &and_with);
2141 flags &= ~SCF_DO_STCLASS;
2144 else if (flags & SCF_DO_STCLASS_AND) {
2146 cl_and(data->start_class, &accum);
2147 flags &= ~SCF_DO_STCLASS;
2150 /* Switch to OR mode: cache the old value of
2151 * data->start_class */
2152 StructCopy(data->start_class, &and_with,
2153 struct regnode_charclass_class);
2154 flags &= ~SCF_DO_STCLASS_AND;
2155 StructCopy(&accum, data->start_class,
2156 struct regnode_charclass_class);
2157 flags |= SCF_DO_STCLASS_OR;
2158 data->start_class->flags |= ANYOF_EOS;
2164 Assuming this was/is a branch we are dealing with: 'scan' now
2165 points at the item that follows the branch sequence, whatever
2166 it is. We now start at the beginning of the sequence and look
2172 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2174 If we can find such a subseqence we need to turn the first
2175 element into a trie and then add the subsequent branch exact
2176 strings to the trie.
2180 1. patterns where the whole set of branch can be converted to a trie,
2182 2. patterns where only a subset of the alternations can be
2183 converted to a trie.
2185 In case 1 we can replace the whole set with a single regop
2186 for the trie. In case 2 we need to keep the start and end
2189 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2190 becomes BRANCH TRIE; BRANCH X;
2192 Hypthetically when we know the regex isnt anchored we can
2193 turn a case 1 into a DFA and let it rip... Every time it finds a match
2194 it would just call its tail, no WHILEM/CURLY needed.
2197 if (PERL_ENABLE_TRIE_OPTIMISATION) {
2199 if (!re_trie_maxbuff) {
2200 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2201 if (!SvIOK(re_trie_maxbuff))
2202 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2204 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2206 regnode *first = (regnode *)NULL;
2207 regnode *last = (regnode *)NULL;
2208 regnode *tail = scan;
2213 SV * const mysv = sv_newmortal(); /* for dumping */
2215 /* var tail is used because there may be a TAIL
2216 regop in the way. Ie, the exacts will point to the
2217 thing following the TAIL, but the last branch will
2218 point at the TAIL. So we advance tail. If we
2219 have nested (?:) we may have to move through several
2223 while ( OP( tail ) == TAIL ) {
2224 /* this is the TAIL generated by (?:) */
2225 tail = regnext( tail );
2230 regprop(RExC_rx, mysv, tail );
2231 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2232 (int)depth * 2 + 2, "",
2233 "Looking for TRIE'able sequences. Tail node is: ",
2234 SvPV_nolen_const( mysv )
2240 step through the branches, cur represents each
2241 branch, noper is the first thing to be matched
2242 as part of that branch and noper_next is the
2243 regnext() of that node. if noper is an EXACT
2244 and noper_next is the same as scan (our current
2245 position in the regex) then the EXACT branch is
2246 a possible optimization target. Once we have
2247 two or more consequetive such branches we can
2248 create a trie of the EXACT's contents and stich
2249 it in place. If the sequence represents all of
2250 the branches we eliminate the whole thing and
2251 replace it with a single TRIE. If it is a
2252 subsequence then we need to stitch it in. This
2253 means the first branch has to remain, and needs
2254 to be repointed at the item on the branch chain
2255 following the last branch optimized. This could
2256 be either a BRANCH, in which case the
2257 subsequence is internal, or it could be the
2258 item following the branch sequence in which
2259 case the subsequence is at the end.
2263 /* dont use tail as the end marker for this traverse */
2264 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2265 regnode * const noper = NEXTOPER( cur );
2266 regnode * const noper_next = regnext( noper );
2269 regprop(RExC_rx, mysv, cur);
2270 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2271 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2273 regprop(RExC_rx, mysv, noper);
2274 PerlIO_printf( Perl_debug_log, " -> %s",
2275 SvPV_nolen_const(mysv));
2278 regprop(RExC_rx, mysv, noper_next );
2279 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2280 SvPV_nolen_const(mysv));
2282 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2283 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2285 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2286 : PL_regkind[ OP( noper ) ] == EXACT )
2287 || OP(noper) == NOTHING )
2288 && noper_next == tail && count<U16_MAX)
2291 if ( !first || optype == NOTHING ) {
2292 if (!first) first = cur;
2293 optype = OP( noper );
2299 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2301 if ( PL_regkind[ OP( noper ) ] == EXACT
2302 && noper_next == tail )
2306 optype = OP( noper );
2316 regprop(RExC_rx, mysv, cur);
2317 PerlIO_printf( Perl_debug_log,
2318 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2319 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2323 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2324 #ifdef TRIE_STUDY_OPT
2325 if ( made && startbranch == first ) {
2326 if ( OP(first)!=TRIE )
2327 flags |= SCF_EXACT_TRIE;
2329 regnode *chk=*scanp;
2330 while ( OP( chk ) == OPEN )
2331 chk = regnext( chk );
2333 flags |= SCF_EXACT_TRIE;
2342 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2343 scan = NEXTOPER(NEXTOPER(scan));
2344 } else /* single branch is optimized. */
2345 scan = NEXTOPER(scan);
2348 else if (OP(scan) == EXACT) {
2349 I32 l = STR_LEN(scan);
2352 const U8 * const s = (U8*)STRING(scan);
2353 l = utf8_length(s, s + l);
2354 uc = utf8_to_uvchr(s, NULL);
2356 uc = *((U8*)STRING(scan));
2359 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2360 /* The code below prefers earlier match for fixed
2361 offset, later match for variable offset. */
2362 if (data->last_end == -1) { /* Update the start info. */
2363 data->last_start_min = data->pos_min;
2364 data->last_start_max = is_inf
2365 ? I32_MAX : data->pos_min + data->pos_delta;
2367 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2369 SvUTF8_on(data->last_found);
2371 SV * const sv = data->last_found;
2372 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2373 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2374 if (mg && mg->mg_len >= 0)
2375 mg->mg_len += utf8_length((U8*)STRING(scan),
2376 (U8*)STRING(scan)+STR_LEN(scan));
2378 data->last_end = data->pos_min + l;
2379 data->pos_min += l; /* As in the first entry. */
2380 data->flags &= ~SF_BEFORE_EOL;
2382 if (flags & SCF_DO_STCLASS_AND) {
2383 /* Check whether it is compatible with what we know already! */
2387 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2388 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2389 && (!(data->start_class->flags & ANYOF_FOLD)
2390 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2393 ANYOF_CLASS_ZERO(data->start_class);
2394 ANYOF_BITMAP_ZERO(data->start_class);
2396 ANYOF_BITMAP_SET(data->start_class, uc);
2397 data->start_class->flags &= ~ANYOF_EOS;
2399 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2401 else if (flags & SCF_DO_STCLASS_OR) {
2402 /* false positive possible if the class is case-folded */
2404 ANYOF_BITMAP_SET(data->start_class, uc);
2406 data->start_class->flags |= ANYOF_UNICODE_ALL;
2407 data->start_class->flags &= ~ANYOF_EOS;
2408 cl_and(data->start_class, &and_with);
2410 flags &= ~SCF_DO_STCLASS;
2412 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2413 I32 l = STR_LEN(scan);
2414 UV uc = *((U8*)STRING(scan));
2416 /* Search for fixed substrings supports EXACT only. */
2417 if (flags & SCF_DO_SUBSTR) {
2419 scan_commit(pRExC_state, data);
2422 const U8 * const s = (U8 *)STRING(scan);
2423 l = utf8_length(s, s + l);
2424 uc = utf8_to_uvchr(s, NULL);
2427 if (flags & SCF_DO_SUBSTR)
2429 if (flags & SCF_DO_STCLASS_AND) {
2430 /* Check whether it is compatible with what we know already! */
2434 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2435 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2436 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2438 ANYOF_CLASS_ZERO(data->start_class);
2439 ANYOF_BITMAP_ZERO(data->start_class);
2441 ANYOF_BITMAP_SET(data->start_class, uc);
2442 data->start_class->flags &= ~ANYOF_EOS;
2443 data->start_class->flags |= ANYOF_FOLD;
2444 if (OP(scan) == EXACTFL)
2445 data->start_class->flags |= ANYOF_LOCALE;
2448 else if (flags & SCF_DO_STCLASS_OR) {
2449 if (data->start_class->flags & ANYOF_FOLD) {
2450 /* false positive possible if the class is case-folded.
2451 Assume that the locale settings are the same... */
2453 ANYOF_BITMAP_SET(data->start_class, uc);
2454 data->start_class->flags &= ~ANYOF_EOS;
2456 cl_and(data->start_class, &and_with);
2458 flags &= ~SCF_DO_STCLASS;
2460 #ifdef TRIE_STUDY_OPT
2461 else if (OP(scan) == TRIE) {
2462 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2463 min += trie->minlen;
2464 delta += (trie->maxlen - trie->minlen);
2465 flags &= ~SCF_DO_STCLASS; /* xxx */
2466 if (flags & SCF_DO_SUBSTR) {
2467 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2468 data->pos_min += trie->minlen;
2469 data->pos_delta += (trie->maxlen - trie->minlen);
2470 if (trie->maxlen != trie->minlen)
2471 data->longest = &(data->longest_float);
2475 else if (strchr((const char*)PL_varies,OP(scan))) {
2476 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2477 I32 f = flags, pos_before = 0;
2478 regnode * const oscan = scan;
2479 struct regnode_charclass_class this_class;
2480 struct regnode_charclass_class *oclass = NULL;
2481 I32 next_is_eval = 0;
2483 switch (PL_regkind[OP(scan)]) {
2484 case WHILEM: /* End of (?:...)* . */
2485 scan = NEXTOPER(scan);
2488 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2489 next = NEXTOPER(scan);
2490 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2492 maxcount = REG_INFTY;
2493 next = regnext(scan);
2494 scan = NEXTOPER(scan);
2498 if (flags & SCF_DO_SUBSTR)
2503 if (flags & SCF_DO_STCLASS) {
2505 maxcount = REG_INFTY;
2506 next = regnext(scan);
2507 scan = NEXTOPER(scan);
2510 is_inf = is_inf_internal = 1;
2511 scan = regnext(scan);
2512 if (flags & SCF_DO_SUBSTR) {
2513 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2514 data->longest = &(data->longest_float);
2516 goto optimize_curly_tail;
2518 mincount = ARG1(scan);
2519 maxcount = ARG2(scan);
2520 next = regnext(scan);
2521 if (OP(scan) == CURLYX) {
2522 I32 lp = (data ? *(data->last_closep) : 0);
2523 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2525 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2526 next_is_eval = (OP(scan) == EVAL);
2528 if (flags & SCF_DO_SUBSTR) {
2529 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2530 pos_before = data->pos_min;
2534 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2536 data->flags |= SF_IS_INF;
2538 if (flags & SCF_DO_STCLASS) {
2539 cl_init(pRExC_state, &this_class);
2540 oclass = data->start_class;
2541 data->start_class = &this_class;
2542 f |= SCF_DO_STCLASS_AND;
2543 f &= ~SCF_DO_STCLASS_OR;
2545 /* These are the cases when once a subexpression
2546 fails at a particular position, it cannot succeed
2547 even after backtracking at the enclosing scope.
2549 XXXX what if minimal match and we are at the
2550 initial run of {n,m}? */
2551 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2552 f &= ~SCF_WHILEM_VISITED_POS;
2554 /* This will finish on WHILEM, setting scan, or on NULL: */
2555 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2557 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2559 if (flags & SCF_DO_STCLASS)
2560 data->start_class = oclass;
2561 if (mincount == 0 || minnext == 0) {
2562 if (flags & SCF_DO_STCLASS_OR) {
2563 cl_or(pRExC_state, data->start_class, &this_class);
2565 else if (flags & SCF_DO_STCLASS_AND) {
2566 /* Switch to OR mode: cache the old value of
2567 * data->start_class */
2568 StructCopy(data->start_class, &and_with,
2569 struct regnode_charclass_class);
2570 flags &= ~SCF_DO_STCLASS_AND;
2571 StructCopy(&this_class, data->start_class,
2572 struct regnode_charclass_class);
2573 flags |= SCF_DO_STCLASS_OR;
2574 data->start_class->flags |= ANYOF_EOS;
2576 } else { /* Non-zero len */
2577 if (flags & SCF_DO_STCLASS_OR) {
2578 cl_or(pRExC_state, data->start_class, &this_class);
2579 cl_and(data->start_class, &and_with);
2581 else if (flags & SCF_DO_STCLASS_AND)
2582 cl_and(data->start_class, &this_class);
2583 flags &= ~SCF_DO_STCLASS;
2585 if (!scan) /* It was not CURLYX, but CURLY. */
2587 if ( /* ? quantifier ok, except for (?{ ... }) */
2588 (next_is_eval || !(mincount == 0 && maxcount == 1))
2589 && (minnext == 0) && (deltanext == 0)
2590 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2591 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2592 && ckWARN(WARN_REGEXP))
2595 "Quantifier unexpected on zero-length expression");
2598 min += minnext * mincount;
2599 is_inf_internal |= ((maxcount == REG_INFTY
2600 && (minnext + deltanext) > 0)
2601 || deltanext == I32_MAX);
2602 is_inf |= is_inf_internal;
2603 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2605 /* Try powerful optimization CURLYX => CURLYN. */
2606 if ( OP(oscan) == CURLYX && data
2607 && data->flags & SF_IN_PAR
2608 && !(data->flags & SF_HAS_EVAL)
2609 && !deltanext && minnext == 1 ) {
2610 /* Try to optimize to CURLYN. */
2611 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2612 regnode * const nxt1 = nxt;
2619 if (!strchr((const char*)PL_simple,OP(nxt))
2620 && !(PL_regkind[OP(nxt)] == EXACT
2621 && STR_LEN(nxt) == 1))
2627 if (OP(nxt) != CLOSE)
2629 /* Now we know that nxt2 is the only contents: */
2630 oscan->flags = (U8)ARG(nxt);
2632 OP(nxt1) = NOTHING; /* was OPEN. */
2634 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2635 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2636 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2637 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2638 OP(nxt + 1) = OPTIMIZED; /* was count. */
2639 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2644 /* Try optimization CURLYX => CURLYM. */
2645 if ( OP(oscan) == CURLYX && data
2646 && !(data->flags & SF_HAS_PAR)
2647 && !(data->flags & SF_HAS_EVAL)
2648 && !deltanext /* atom is fixed width */
2649 && minnext != 0 /* CURLYM can't handle zero width */
2651 /* XXXX How to optimize if data == 0? */
2652 /* Optimize to a simpler form. */
2653 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2657 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2658 && (OP(nxt2) != WHILEM))
2660 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2661 /* Need to optimize away parenths. */
2662 if (data->flags & SF_IN_PAR) {
2663 /* Set the parenth number. */
2664 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2666 if (OP(nxt) != CLOSE)
2667 FAIL("Panic opt close");
2668 oscan->flags = (U8)ARG(nxt);
2669 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2670 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2672 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2673 OP(nxt + 1) = OPTIMIZED; /* was count. */
2674 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2675 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2678 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2679 regnode *nnxt = regnext(nxt1);
2682 if (reg_off_by_arg[OP(nxt1)])
2683 ARG_SET(nxt1, nxt2 - nxt1);
2684 else if (nxt2 - nxt1 < U16_MAX)
2685 NEXT_OFF(nxt1) = nxt2 - nxt1;
2687 OP(nxt) = NOTHING; /* Cannot beautify */
2692 /* Optimize again: */
2693 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2699 else if ((OP(oscan) == CURLYX)
2700 && (flags & SCF_WHILEM_VISITED_POS)
2701 /* See the comment on a similar expression above.
2702 However, this time it not a subexpression
2703 we care about, but the expression itself. */
2704 && (maxcount == REG_INFTY)
2705 && data && ++data->whilem_c < 16) {
2706 /* This stays as CURLYX, we can put the count/of pair. */
2707 /* Find WHILEM (as in regexec.c) */
2708 regnode *nxt = oscan + NEXT_OFF(oscan);
2710 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2712 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2713 | (RExC_whilem_seen << 4)); /* On WHILEM */
2715 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2717 if (flags & SCF_DO_SUBSTR) {
2718 SV *last_str = NULL;
2719 int counted = mincount != 0;
2721 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2722 #if defined(SPARC64_GCC_WORKAROUND)
2725 const char *s = NULL;
2728 if (pos_before >= data->last_start_min)
2731 b = data->last_start_min;
2734 s = SvPV_const(data->last_found, l);
2735 old = b - data->last_start_min;
2738 I32 b = pos_before >= data->last_start_min
2739 ? pos_before : data->last_start_min;
2741 const char * const s = SvPV_const(data->last_found, l);
2742 I32 old = b - data->last_start_min;
2746 old = utf8_hop((U8*)s, old) - (U8*)s;
2749 /* Get the added string: */
2750 last_str = newSVpvn(s + old, l);
2752 SvUTF8_on(last_str);
2753 if (deltanext == 0 && pos_before == b) {
2754 /* What was added is a constant string */
2756 SvGROW(last_str, (mincount * l) + 1);
2757 repeatcpy(SvPVX(last_str) + l,
2758 SvPVX_const(last_str), l, mincount - 1);
2759 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2760 /* Add additional parts. */
2761 SvCUR_set(data->last_found,
2762 SvCUR(data->last_found) - l);
2763 sv_catsv(data->last_found, last_str);
2765 SV * sv = data->last_found;
2767 SvUTF8(sv) && SvMAGICAL(sv) ?
2768 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2769 if (mg && mg->mg_len >= 0)
2770 mg->mg_len += CHR_SVLEN(last_str);
2772 data->last_end += l * (mincount - 1);
2775 /* start offset must point into the last copy */
2776 data->last_start_min += minnext * (mincount - 1);
2777 data->last_start_max += is_inf ? I32_MAX
2778 : (maxcount - 1) * (minnext + data->pos_delta);
2781 /* It is counted once already... */
2782 data->pos_min += minnext * (mincount - counted);
2783 data->pos_delta += - counted * deltanext +
2784 (minnext + deltanext) * maxcount - minnext * mincount;
2785 if (mincount != maxcount) {
2786 /* Cannot extend fixed substrings found inside
2788 scan_commit(pRExC_state,data);
2789 if (mincount && last_str) {
2790 SV * const sv = data->last_found;
2791 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2792 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2796 sv_setsv(sv, last_str);
2797 data->last_end = data->pos_min;
2798 data->last_start_min =
2799 data->pos_min - CHR_SVLEN(last_str);
2800 data->last_start_max = is_inf
2802 : data->pos_min + data->pos_delta
2803 - CHR_SVLEN(last_str);
2805 data->longest = &(data->longest_float);
2807 SvREFCNT_dec(last_str);
2809 if (data && (fl & SF_HAS_EVAL))
2810 data->flags |= SF_HAS_EVAL;
2811 optimize_curly_tail:
2812 if (OP(oscan) != CURLYX) {
2813 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2815 NEXT_OFF(oscan) += NEXT_OFF(next);
2818 default: /* REF and CLUMP only? */
2819 if (flags & SCF_DO_SUBSTR) {
2820 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2821 data->longest = &(data->longest_float);
2823 is_inf = is_inf_internal = 1;
2824 if (flags & SCF_DO_STCLASS_OR)
2825 cl_anything(pRExC_state, data->start_class);
2826 flags &= ~SCF_DO_STCLASS;
2830 else if (strchr((const char*)PL_simple,OP(scan))) {
2833 if (flags & SCF_DO_SUBSTR) {
2834 scan_commit(pRExC_state,data);
2838 if (flags & SCF_DO_STCLASS) {
2839 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2841 /* Some of the logic below assumes that switching
2842 locale on will only add false positives. */
2843 switch (PL_regkind[OP(scan)]) {
2847 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2848 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2849 cl_anything(pRExC_state, data->start_class);
2852 if (OP(scan) == SANY)
2854 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2855 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2856 || (data->start_class->flags & ANYOF_CLASS));
2857 cl_anything(pRExC_state, data->start_class);
2859 if (flags & SCF_DO_STCLASS_AND || !value)
2860 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2863 if (flags & SCF_DO_STCLASS_AND)
2864 cl_and(data->start_class,
2865 (struct regnode_charclass_class*)scan);
2867 cl_or(pRExC_state, data->start_class,
2868 (struct regnode_charclass_class*)scan);
2871 if (flags & SCF_DO_STCLASS_AND) {
2872 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2873 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2874 for (value = 0; value < 256; value++)
2875 if (!isALNUM(value))
2876 ANYOF_BITMAP_CLEAR(data->start_class, value);
2880 if (data->start_class->flags & ANYOF_LOCALE)
2881 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2883 for (value = 0; value < 256; value++)
2885 ANYOF_BITMAP_SET(data->start_class, value);
2890 if (flags & SCF_DO_STCLASS_AND) {
2891 if (data->start_class->flags & ANYOF_LOCALE)
2892 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2895 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2896 data->start_class->flags |= ANYOF_LOCALE;
2900 if (flags & SCF_DO_STCLASS_AND) {
2901 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2902 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2903 for (value = 0; value < 256; value++)
2905 ANYOF_BITMAP_CLEAR(data->start_class, value);
2909 if (data->start_class->flags & ANYOF_LOCALE)
2910 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2912 for (value = 0; value < 256; value++)
2913 if (!isALNUM(value))
2914 ANYOF_BITMAP_SET(data->start_class, value);
2919 if (flags & SCF_DO_STCLASS_AND) {
2920 if (data->start_class->flags & ANYOF_LOCALE)
2921 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2924 data->start_class->flags |= ANYOF_LOCALE;
2925 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2929 if (flags & SCF_DO_STCLASS_AND) {
2930 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2931 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2932 for (value = 0; value < 256; value++)
2933 if (!isSPACE(value))
2934 ANYOF_BITMAP_CLEAR(data->start_class, value);
2938 if (data->start_class->flags & ANYOF_LOCALE)
2939 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2941 for (value = 0; value < 256; value++)
2943 ANYOF_BITMAP_SET(data->start_class, value);
2948 if (flags & SCF_DO_STCLASS_AND) {
2949 if (data->start_class->flags & ANYOF_LOCALE)
2950 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2953 data->start_class->flags |= ANYOF_LOCALE;
2954 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2958 if (flags & SCF_DO_STCLASS_AND) {
2959 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2960 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2961 for (value = 0; value < 256; value++)
2963 ANYOF_BITMAP_CLEAR(data->start_class, value);
2967 if (data->start_class->flags & ANYOF_LOCALE)
2968 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2970 for (value = 0; value < 256; value++)
2971 if (!isSPACE(value))
2972 ANYOF_BITMAP_SET(data->start_class, value);
2977 if (flags & SCF_DO_STCLASS_AND) {
2978 if (data->start_class->flags & ANYOF_LOCALE) {
2979 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2980 for (value = 0; value < 256; value++)
2981 if (!isSPACE(value))
2982 ANYOF_BITMAP_CLEAR(data->start_class, value);
2986 data->start_class->flags |= ANYOF_LOCALE;
2987 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2991 if (flags & SCF_DO_STCLASS_AND) {
2992 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2993 for (value = 0; value < 256; value++)
2994 if (!isDIGIT(value))
2995 ANYOF_BITMAP_CLEAR(data->start_class, value);
2998 if (data->start_class->flags & ANYOF_LOCALE)
2999 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3001 for (value = 0; value < 256; value++)
3003 ANYOF_BITMAP_SET(data->start_class, value);
3008 if (flags & SCF_DO_STCLASS_AND) {
3009 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3010 for (value = 0; value < 256; value++)
3012 ANYOF_BITMAP_CLEAR(data->start_class, value);
3015 if (data->start_class->flags & ANYOF_LOCALE)
3016 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3018 for (value = 0; value < 256; value++)
3019 if (!isDIGIT(value))
3020 ANYOF_BITMAP_SET(data->start_class, value);
3025 if (flags & SCF_DO_STCLASS_OR)
3026 cl_and(data->start_class, &and_with);
3027 flags &= ~SCF_DO_STCLASS;
3030 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3031 data->flags |= (OP(scan) == MEOL
3035 else if ( PL_regkind[OP(scan)] == BRANCHJ
3036 /* Lookbehind, or need to calculate parens/evals/stclass: */
3037 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3038 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3039 /* Lookahead/lookbehind */
3040 I32 deltanext, minnext, fake = 0;
3042 struct regnode_charclass_class intrnl;
3045 data_fake.flags = 0;
3047 data_fake.whilem_c = data->whilem_c;
3048 data_fake.last_closep = data->last_closep;
3051 data_fake.last_closep = &fake;
3052 if ( flags & SCF_DO_STCLASS && !scan->flags
3053 && OP(scan) == IFMATCH ) { /* Lookahead */
3054 cl_init(pRExC_state, &intrnl);
3055 data_fake.start_class = &intrnl;
3056 f |= SCF_DO_STCLASS_AND;
3058 if (flags & SCF_WHILEM_VISITED_POS)
3059 f |= SCF_WHILEM_VISITED_POS;
3060 next = regnext(scan);
3061 nscan = NEXTOPER(NEXTOPER(scan));
3062 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3065 vFAIL("Variable length lookbehind not implemented");
3067 else if (minnext > U8_MAX) {
3068 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3070 scan->flags = (U8)minnext;
3073 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3075 if (data_fake.flags & SF_HAS_EVAL)
3076 data->flags |= SF_HAS_EVAL;
3077 data->whilem_c = data_fake.whilem_c;
3079 if (f & SCF_DO_STCLASS_AND) {
3080 const int was = (data->start_class->flags & ANYOF_EOS);
3082 cl_and(data->start_class, &intrnl);
3084 data->start_class->flags |= ANYOF_EOS;
3087 else if (OP(scan) == OPEN) {
3090 else if (OP(scan) == CLOSE) {
3091 if ((I32)ARG(scan) == is_par) {
3092 next = regnext(scan);
3094 if ( next && (OP(next) != WHILEM) && next < last)
3095 is_par = 0; /* Disable optimization */
3098 *(data->last_closep) = ARG(scan);
3100 else if (OP(scan) == EVAL) {
3102 data->flags |= SF_HAS_EVAL;
3104 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3105 if (flags & SCF_DO_SUBSTR) {
3106 scan_commit(pRExC_state,data);
3107 data->longest = &(data->longest_float);
3109 is_inf = is_inf_internal = 1;
3110 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3111 cl_anything(pRExC_state, data->start_class);
3112 flags &= ~SCF_DO_STCLASS;
3114 /* Else: zero-length, ignore. */
3115 scan = regnext(scan);
3120 *deltap = is_inf_internal ? I32_MAX : delta;
3121 if (flags & SCF_DO_SUBSTR && is_inf)
3122 data->pos_delta = I32_MAX - data->pos_min;
3123 if (is_par > U8_MAX)
3125 if (is_par && pars==1 && data) {
3126 data->flags |= SF_IN_PAR;
3127 data->flags &= ~SF_HAS_PAR;
3129 else if (pars && data) {
3130 data->flags |= SF_HAS_PAR;
3131 data->flags &= ~SF_IN_PAR;
3133 if (flags & SCF_DO_STCLASS_OR)
3134 cl_and(data->start_class, &and_with);
3135 if (flags & SCF_EXACT_TRIE)
3136 data->flags |= SCF_EXACT_TRIE;
3141 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3143 if (RExC_rx->data) {
3144 Renewc(RExC_rx->data,
3145 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3146 char, struct reg_data);
3147 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3148 RExC_rx->data->count += n;
3151 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3152 char, struct reg_data);
3153 Newx(RExC_rx->data->what, n, U8);
3154 RExC_rx->data->count = n;
3156 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3157 return RExC_rx->data->count - n;
3160 #ifndef PERL_IN_XSUB_RE
3162 Perl_reginitcolors(pTHX)
3165 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3167 char *t = savepv(s);
3171 t = strchr(t, '\t');
3177 PL_colors[i] = t = (char *)"";
3182 PL_colors[i++] = (char *)"";
3190 - pregcomp - compile a regular expression into internal code
3192 * We can't allocate space until we know how big the compiled form will be,
3193 * but we can't compile it (and thus know how big it is) until we've got a
3194 * place to put the code. So we cheat: we compile it twice, once with code
3195 * generation turned off and size counting turned on, and once "for real".
3196 * This also means that we don't allocate space until we are sure that the
3197 * thing really will compile successfully, and we never have to move the
3198 * code and thus invalidate pointers into it. (Note that it has to be in
3199 * one piece because free() must be able to free it all.) [NB: not true in perl]
3201 * Beware that the optimization-preparation code in here knows about some
3202 * of the structure of the compiled regexp. [I'll say.]
3205 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3216 RExC_state_t RExC_state;
3217 RExC_state_t * const pRExC_state = &RExC_state;
3218 #ifdef TRIE_STUDY_OPT
3220 RExC_state_t copyRExC_state;
3223 GET_RE_DEBUG_FLAGS_DECL;
3226 FAIL("NULL regexp argument");
3228 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3231 DEBUG_r(if (!PL_colorset) reginitcolors());
3233 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3234 PL_colors[4],PL_colors[5],PL_colors[0],
3235 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3237 RExC_flags = pm->op_pmflags;
3241 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3242 RExC_seen_evals = 0;
3245 /* First pass: determine size, legality. */
3252 RExC_emit = &PL_regdummy;
3253 RExC_whilem_seen = 0;
3254 #if 0 /* REGC() is (currently) a NOP at the first pass.
3255 * Clever compilers notice this and complain. --jhi */
3256 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3258 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3259 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3260 RExC_precomp = NULL;
3263 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3264 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3265 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3268 RExC_lastparse=NULL;
3272 /* Small enough for pointer-storage convention?
3273 If extralen==0, this means that we will not need long jumps. */
3274 if (RExC_size >= 0x10000L && RExC_extralen)
3275 RExC_size += RExC_extralen;
3278 if (RExC_whilem_seen > 15)
3279 RExC_whilem_seen = 15;
3281 /* Allocate space and initialize. */
3282 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3285 FAIL("Regexp out of space");
3288 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3289 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3292 r->prelen = xend - exp;
3293 r->precomp = savepvn(RExC_precomp, r->prelen);
3295 #ifdef PERL_OLD_COPY_ON_WRITE
3296 r->saved_copy = NULL;
3298 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3299 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3300 r->lastparen = 0; /* mg.c reads this. */
3302 r->substrs = 0; /* Useful during FAIL. */
3303 r->startp = 0; /* Useful during FAIL. */
3304 r->endp = 0; /* Useful during FAIL. */
3306 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3308 r->offsets[0] = RExC_size;
3310 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3311 "%s %"UVuf" bytes for offset annotations.\n",
3312 r->offsets ? "Got" : "Couldn't get",
3313 (UV)((2*RExC_size+1) * sizeof(U32))));
3317 /* Second pass: emit code. */
3318 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3323 RExC_emit_start = r->program;
3324 RExC_emit = r->program;
3325 /* Store the count of eval-groups for security checks: */
3326 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3327 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3329 if (reg(pRExC_state, 0, &flags,1) == NULL)
3331 /* XXXX To minimize changes to RE engine we always allocate
3332 3-units-long substrs field. */
3333 Newx(r->substrs, 1, struct reg_substr_data);
3336 Zero(r->substrs, 1, struct reg_substr_data);
3337 StructCopy(&zero_scan_data, &data, scan_data_t);
3339 #ifdef TRIE_STUDY_OPT
3341 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3342 RExC_state=copyRExC_state;
3343 if (data.longest_fixed)
3344 SvREFCNT_dec(data.longest_fixed);
3345 if (data.longest_float)
3346 SvREFCNT_dec(data.longest_float);
3347 if (data.last_found)
3348 SvREFCNT_dec(data.last_found);
3350 copyRExC_state=RExC_state;
3353 /* Dig out information for optimizations. */
3354 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3355 pm->op_pmflags = RExC_flags;
3357 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3358 r->regstclass = NULL;
3359 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3360 r->reganch |= ROPT_NAUGHTY;
3361 scan = r->program + 1; /* First BRANCH. */
3363 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3364 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3366 STRLEN longest_float_length, longest_fixed_length;
3367 struct regnode_charclass_class ch_class; /* pointed to by data */
3369 I32 last_close = 0; /* pointed to by data */
3372 /* Skip introductions and multiplicators >= 1. */
3373 while ((OP(first) == OPEN && (sawopen = 1)) ||
3374 /* An OR of *one* alternative - should not happen now. */
3375 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3376 /* for now we can't handle lookbehind IFMATCH*/
3377 (OP(first) == IFMATCH && !first->flags) ||
3378 (OP(first) == PLUS) ||
3379 (OP(first) == MINMOD) ||
3380 /* An {n,m} with n>0 */
3381 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3383 DEBUG_PEEP("first:",first,0);
3384 if (OP(first) == PLUS)
3387 first += regarglen[OP(first)];
3388 if (OP(first) == IFMATCH) {
3389 first = NEXTOPER(first);
3390 first += EXTRA_STEP_2ARGS;
3391 } else /*xxx possible optimisation for /(?=)/*/
3392 first = NEXTOPER(first);
3395 /* Starting-point info. */
3397 /* Ignore EXACT as we deal with it later. */
3398 if (PL_regkind[OP(first)] == EXACT) {
3399 if (OP(first) == EXACT)
3400 NOOP; /* Empty, get anchored substr later. */
3401 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3402 r->regstclass = first;
3405 else if (OP(first) == TRIE &&
3406 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3408 /* this can happen only on restudy */
3409 struct regnode_1 *trie_op;
3410 Newxz(trie_op,1,struct regnode_1);
3411 StructCopy(first,trie_op,struct regnode_1);
3412 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3413 r->regstclass = (regnode *)trie_op;
3416 else if (strchr((const char*)PL_simple,OP(first)))
3417 r->regstclass = first;
3418 else if (PL_regkind[OP(first)] == BOUND ||
3419 PL_regkind[OP(first)] == NBOUND)
3420 r->regstclass = first;
3421 else if (PL_regkind[OP(first)] == BOL) {
3422 r->reganch |= (OP(first) == MBOL
3424 : (OP(first) == SBOL
3427 first = NEXTOPER(first);
3430 else if (OP(first) == GPOS) {
3431 r->reganch |= ROPT_ANCH_GPOS;
3432 first = NEXTOPER(first);
3435 else if (!sawopen && (OP(first) == STAR &&
3436 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3437 !(r->reganch & ROPT_ANCH) )
3439 /* turn .* into ^.* with an implied $*=1 */
3441 (OP(NEXTOPER(first)) == REG_ANY)
3444 r->reganch |= type | ROPT_IMPLICIT;
3445 first = NEXTOPER(first);
3448 if (sawplus && (!sawopen || !RExC_sawback)
3449 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3450 /* x+ must match at the 1st pos of run of x's */
3451 r->reganch |= ROPT_SKIP;
3453 /* Scan is after the zeroth branch, first is atomic matcher. */
3454 #ifdef TRIE_STUDY_OPT
3457 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3458 (IV)(first - scan + 1))
3462 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3463 (IV)(first - scan + 1))
3469 * If there's something expensive in the r.e., find the
3470 * longest literal string that must appear and make it the
3471 * regmust. Resolve ties in favor of later strings, since
3472 * the regstart check works with the beginning of the r.e.
3473 * and avoiding duplication strengthens checking. Not a
3474 * strong reason, but sufficient in the absence of others.
3475 * [Now we resolve ties in favor of the earlier string if
3476 * it happens that c_offset_min has been invalidated, since the
3477 * earlier string may buy us something the later one won't.]
3481 data.longest_fixed = newSVpvs("");
3482 data.longest_float = newSVpvs("");
3483 data.last_found = newSVpvs("");
3484 data.longest = &(data.longest_fixed);
3486 if (!r->regstclass) {
3487 cl_init(pRExC_state, &ch_class);
3488 data.start_class = &ch_class;
3489 stclass_flag = SCF_DO_STCLASS_AND;
3490 } else /* XXXX Check for BOUND? */
3492 data.last_closep = &last_close;
3494 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3495 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3497 #ifdef TRIE_STUDY_OPT
3498 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3503 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3504 && data.last_start_min == 0 && data.last_end > 0
3505 && !RExC_seen_zerolen
3506 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3507 r->reganch |= ROPT_CHECK_ALL;
3508 scan_commit(pRExC_state, &data);
3509 SvREFCNT_dec(data.last_found);
3511 longest_float_length = CHR_SVLEN(data.longest_float);
3512 if (longest_float_length
3513 || (data.flags & SF_FL_BEFORE_EOL
3514 && (!(data.flags & SF_FL_BEFORE_MEOL)
3515 || (RExC_flags & PMf_MULTILINE)))) {
3518 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3519 && data.offset_fixed == data.offset_float_min
3520 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3521 goto remove_float; /* As in (a)+. */
3523 if (SvUTF8(data.longest_float)) {
3524 r->float_utf8 = data.longest_float;
3525 r->float_substr = NULL;
3527 r->float_substr = data.longest_float;
3528 r->float_utf8 = NULL;
3530 r->float_min_offset = data.offset_float_min;
3531 r->float_max_offset = data.offset_float_max;
3532 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3533 && (!(data.flags & SF_FL_BEFORE_MEOL)
3534 || (RExC_flags & PMf_MULTILINE)));
3535 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3539 r->float_substr = r->float_utf8 = NULL;
3540 SvREFCNT_dec(data.longest_float);
3541 longest_float_length = 0;
3544 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3545 if (longest_fixed_length
3546 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3547 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3548 || (RExC_flags & PMf_MULTILINE)))) {
3551 if (SvUTF8(data.longest_fixed)) {
3552 r->anchored_utf8 = data.longest_fixed;
3553 r->anchored_substr = NULL;
3555 r->anchored_substr = data.longest_fixed;
3556 r->anchored_utf8 = NULL;
3558 r->anchored_offset = data.offset_fixed;
3559 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3560 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3561 || (RExC_flags & PMf_MULTILINE)));
3562 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3565 r->anchored_substr = r->anchored_utf8 = NULL;
3566 SvREFCNT_dec(data.longest_fixed);
3567 longest_fixed_length = 0;
3570 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3571 r->regstclass = NULL;
3572 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3574 && !(data.start_class->flags & ANYOF_EOS)
3575 && !cl_is_anything(data.start_class))
3577 const I32 n = add_data(pRExC_state, 1, "f");
3579 Newx(RExC_rx->data->data[n], 1,
3580 struct regnode_charclass_class);
3581 StructCopy(data.start_class,
3582 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3583 struct regnode_charclass_class);
3584 r->regstclass = (regnode*)RExC_rx->data->data[n];
3585 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3586 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3587 regprop(r, sv, (regnode*)data.start_class);
3588 PerlIO_printf(Perl_debug_log,
3589 "synthetic stclass \"%s\".\n",
3590 SvPVX_const(sv));});
3593 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3594 if (longest_fixed_length > longest_float_length) {
3595 r->check_substr = r->anchored_substr;
3596 r->check_utf8 = r->anchored_utf8;
3597 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3598 if (r->reganch & ROPT_ANCH_SINGLE)
3599 r->reganch |= ROPT_NOSCAN;
3602 r->check_substr = r->float_substr;
3603 r->check_utf8 = r->float_utf8;
3604 r->check_offset_min = data.offset_float_min;
3605 r->check_offset_max = data.offset_float_max;
3607 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3608 This should be changed ASAP! */
3609 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3610 r->reganch |= RE_USE_INTUIT;
3611 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3612 r->reganch |= RE_INTUIT_TAIL;
3616 /* Several toplevels. Best we can is to set minlen. */
3618 struct regnode_charclass_class ch_class;
3621 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3623 scan = r->program + 1;
3624 cl_init(pRExC_state, &ch_class);
3625 data.start_class = &ch_class;
3626 data.last_closep = &last_close;
3628 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3629 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3631 #ifdef TRIE_STUDY_OPT
3632 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3637 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3638 = r->float_substr = r->float_utf8 = NULL;
3639 if (!(data.start_class->flags & ANYOF_EOS)
3640 && !cl_is_anything(data.start_class))
3642 const I32 n = add_data(pRExC_state, 1, "f");
3644 Newx(RExC_rx->data->data[n], 1,
3645 struct regnode_charclass_class);
3646 StructCopy(data.start_class,
3647 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3648 struct regnode_charclass_class);
3649 r->regstclass = (regnode*)RExC_rx->data->data[n];
3650 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3651 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3652 regprop(r, sv, (regnode*)data.start_class);
3653 PerlIO_printf(Perl_debug_log,
3654 "synthetic stclass \"%s\".\n",
3655 SvPVX_const(sv));});
3660 if (RExC_seen & REG_SEEN_GPOS)
3661 r->reganch |= ROPT_GPOS_SEEN;
3662 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3663 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3664 if (RExC_seen & REG_SEEN_EVAL)
3665 r->reganch |= ROPT_EVAL_SEEN;
3666 if (RExC_seen & REG_SEEN_CANY)
3667 r->reganch |= ROPT_CANY_SEEN;
3668 Newxz(r->startp, RExC_npar, I32);
3669 Newxz(r->endp, RExC_npar, I32);
3671 DEBUG_r( RX_DEBUG_on(r) );
3673 PerlIO_printf(Perl_debug_log,"Final program:\n");
3676 DEBUG_OFFSETS_r(if (r->offsets) {
3677 const U32 len = r->offsets[0];
3679 GET_RE_DEBUG_FLAGS_DECL;
3680 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3681 for (i = 1; i <= len; i++) {
3682 if (r->offsets[i*2-1] || r->offsets[i*2])
3683 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3684 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3686 PerlIO_printf(Perl_debug_log, "\n");
3692 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3693 int rem=(int)(RExC_end - RExC_parse); \
3702 if (RExC_lastparse!=RExC_parse) \
3703 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3706 iscut ? "..." : "<" \
3709 PerlIO_printf(Perl_debug_log,"%16s",""); \
3714 num=REG_NODE_NUM(RExC_emit); \
3715 if (RExC_lastnum!=num) \
3716 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3718 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3719 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3720 (int)((depth*2)), "", \
3724 RExC_lastparse=RExC_parse; \
3729 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3730 DEBUG_PARSE_MSG((funcname)); \
3731 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3734 - reg - regular expression, i.e. main body or parenthesized thing
3736 * Caller must absorb opening parenthesis.
3738 * Combining parenthesis handling with the base level of regular expression
3739 * is a trifle forced, but the need to tie the tails of the branches to what
3740 * follows makes it hard to avoid.
3742 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3744 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3746 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3750 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3751 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3754 register regnode *ret; /* Will be the head of the group. */
3755 register regnode *br;
3756 register regnode *lastbr;
3757 register regnode *ender = NULL;
3758 register I32 parno = 0;
3760 const I32 oregflags = RExC_flags;
3761 bool have_branch = 0;
3764 /* for (?g), (?gc), and (?o) warnings; warning
3765 about (?c) will warn about (?g) -- japhy */
3767 #define WASTED_O 0x01
3768 #define WASTED_G 0x02
3769 #define WASTED_C 0x04
3770 #define WASTED_GC (0x02|0x04)
3771 I32 wastedflags = 0x00;
3773 char * parse_start = RExC_parse; /* MJD */
3774 char * const oregcomp_parse = RExC_parse;
3776 GET_RE_DEBUG_FLAGS_DECL;
3777 DEBUG_PARSE("reg ");
3780 *flagp = 0; /* Tentatively. */
3783 /* Make an OPEN node, if parenthesized. */
3785 if (*RExC_parse == '?') { /* (?...) */
3786 U32 posflags = 0, negflags = 0;
3787 U32 *flagsp = &posflags;
3788 bool is_logical = 0;
3789 const char * const seqstart = RExC_parse;
3792 paren = *RExC_parse++;
3793 ret = NULL; /* For look-ahead/behind. */
3795 case '<': /* (?<...) */
3796 RExC_seen |= REG_SEEN_LOOKBEHIND;
3797 if (*RExC_parse == '!')
3799 if (*RExC_parse != '=' && *RExC_parse != '!')
3802 case '=': /* (?=...) */
3803 case '!': /* (?!...) */
3804 RExC_seen_zerolen++;
3805 case ':': /* (?:...) */
3806 case '>': /* (?>...) */
3808 case '$': /* (?$...) */
3809 case '@': /* (?@...) */
3810 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3812 case '#': /* (?#...) */
3813 while (*RExC_parse && *RExC_parse != ')')
3815 if (*RExC_parse != ')')
3816 FAIL("Sequence (?#... not terminated");
3817 nextchar(pRExC_state);
3820 case 'p': /* (?p...) */
3821 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3822 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3824 case '?': /* (??...) */
3826 if (*RExC_parse != '{')
3828 paren = *RExC_parse++;
3830 case '{': /* (?{...}) */
3832 I32 count = 1, n = 0;
3834 char *s = RExC_parse;
3836 RExC_seen_zerolen++;
3837 RExC_seen |= REG_SEEN_EVAL;
3838 while (count && (c = *RExC_parse)) {
3849 if (*RExC_parse != ')') {
3851 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3855 OP_4tree *sop, *rop;
3856 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3859 Perl_save_re_context(aTHX);
3860 rop = sv_compile_2op(sv, &sop, "re", &pad);
3861 sop->op_private |= OPpREFCOUNTED;
3862 /* re_dup will OpREFCNT_inc */
3863 OpREFCNT_set(sop, 1);
3866 n = add_data(pRExC_state, 3, "nop");
3867 RExC_rx->data->data[n] = (void*)rop;
3868 RExC_rx->data->data[n+1] = (void*)sop;
3869 RExC_rx->data->data[n+2] = (void*)pad;
3872 else { /* First pass */
3873 if (PL_reginterp_cnt < ++RExC_seen_evals
3875 /* No compiled RE interpolated, has runtime
3876 components ===> unsafe. */
3877 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3878 if (PL_tainting && PL_tainted)
3879 FAIL("Eval-group in insecure regular expression");
3880 #if PERL_VERSION > 8
3881 if (IN_PERL_COMPILETIME)
3886 nextchar(pRExC_state);
3888 ret = reg_node(pRExC_state, LOGICAL);
3891 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3892 /* deal with the length of this later - MJD */
3895 ret = reganode(pRExC_state, EVAL, n);
3896 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3897 Set_Node_Offset(ret, parse_start);
3900 case '(': /* (?(?{...})...) and (?(?=...)...) */
3902 if (RExC_parse[0] == '?') { /* (?(?...)) */
3903 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3904 || RExC_parse[1] == '<'
3905 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3908 ret = reg_node(pRExC_state, LOGICAL);
3911 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3915 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3918 parno = atoi(RExC_parse++);
3920 while (isDIGIT(*RExC_parse))
3922 ret = reganode(pRExC_state, GROUPP, parno);
3924 if ((c = *nextchar(pRExC_state)) != ')')
3925 vFAIL("Switch condition not recognized");
3927 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3928 br = regbranch(pRExC_state, &flags, 1,depth+1);
3930 br = reganode(pRExC_state, LONGJMP, 0);
3932 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3933 c = *nextchar(pRExC_state);
3937 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3938 regbranch(pRExC_state, &flags, 1,depth+1);
3939 REGTAIL(pRExC_state, ret, lastbr);
3942 c = *nextchar(pRExC_state);
3947 vFAIL("Switch (?(condition)... contains too many branches");
3948 ender = reg_node(pRExC_state, TAIL);
3949 REGTAIL(pRExC_state, br, ender);
3951 REGTAIL(pRExC_state, lastbr, ender);
3952 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3955 REGTAIL(pRExC_state, ret, ender);
3959 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3963 RExC_parse--; /* for vFAIL to print correctly */
3964 vFAIL("Sequence (? incomplete");
3968 parse_flags: /* (?i) */
3969 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3970 /* (?g), (?gc) and (?o) are useless here
3971 and must be globally applied -- japhy */
3973 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3974 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3975 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3976 if (! (wastedflags & wflagbit) ) {
3977 wastedflags |= wflagbit;
3980 "Useless (%s%c) - %suse /%c modifier",
3981 flagsp == &negflags ? "?-" : "?",
3983 flagsp == &negflags ? "don't " : "",
3989 else if (*RExC_parse == 'c') {
3990 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3991 if (! (wastedflags & WASTED_C) ) {
3992 wastedflags |= WASTED_GC;
3995 "Useless (%sc) - %suse /gc modifier",
3996 flagsp == &negflags ? "?-" : "?",
3997 flagsp == &negflags ? "don't " : ""
4002 else { pmflag(flagsp, *RExC_parse); }
4006 if (*RExC_parse == '-') {
4008 wastedflags = 0; /* reset so (?g-c) warns twice */
4012 RExC_flags |= posflags;
4013 RExC_flags &= ~negflags;
4014 if (*RExC_parse == ':') {
4020 if (*RExC_parse != ')') {
4022 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4024 nextchar(pRExC_state);
4032 ret = reganode(pRExC_state, OPEN, parno);
4033 Set_Node_Length(ret, 1); /* MJD */
4034 Set_Node_Offset(ret, RExC_parse); /* MJD */
4041 /* Pick up the branches, linking them together. */
4042 parse_start = RExC_parse; /* MJD */
4043 br = regbranch(pRExC_state, &flags, 1,depth+1);
4044 /* branch_len = (paren != 0); */
4048 if (*RExC_parse == '|') {
4049 if (!SIZE_ONLY && RExC_extralen) {
4050 reginsert(pRExC_state, BRANCHJ, br);
4053 reginsert(pRExC_state, BRANCH, br);
4054 Set_Node_Length(br, paren != 0);
4055 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4059 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4061 else if (paren == ':') {
4062 *flagp |= flags&SIMPLE;
4064 if (is_open) { /* Starts with OPEN. */
4065 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4067 else if (paren != '?') /* Not Conditional */
4069 *flagp |= flags & (SPSTART | HASWIDTH);
4071 while (*RExC_parse == '|') {
4072 if (!SIZE_ONLY && RExC_extralen) {
4073 ender = reganode(pRExC_state, LONGJMP,0);
4074 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4077 RExC_extralen += 2; /* Account for LONGJMP. */
4078 nextchar(pRExC_state);
4079 br = regbranch(pRExC_state, &flags, 0, depth+1);
4083 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4087 *flagp |= flags&SPSTART;
4090 if (have_branch || paren != ':') {
4091 /* Make a closing node, and hook it on the end. */
4094 ender = reg_node(pRExC_state, TAIL);
4097 ender = reganode(pRExC_state, CLOSE, parno);
4098 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4099 Set_Node_Length(ender,1); /* MJD */
4105 *flagp &= ~HASWIDTH;
4108 ender = reg_node(pRExC_state, SUCCEED);
4111 ender = reg_node(pRExC_state, END);
4114 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4116 if (have_branch && !SIZE_ONLY) {
4117 /* Hook the tails of the branches to the closing node. */
4118 for (br = ret; br; br = regnext(br)) {
4119 const U8 op = PL_regkind[OP(br)];
4121 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4123 else if (op == BRANCHJ) {
4124 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4132 static const char parens[] = "=!<,>";
4134 if (paren && (p = strchr(parens, paren))) {
4135 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4136 int flag = (p - parens) > 1;
4139 node = SUSPEND, flag = 0;
4140 reginsert(pRExC_state, node,ret);
4141 Set_Node_Cur_Length(ret);
4142 Set_Node_Offset(ret, parse_start + 1);
4144 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4148 /* Check for proper termination. */
4150 RExC_flags = oregflags;
4151 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4152 RExC_parse = oregcomp_parse;
4153 vFAIL("Unmatched (");
4156 else if (!paren && RExC_parse < RExC_end) {
4157 if (*RExC_parse == ')') {
4159 vFAIL("Unmatched )");
4162 FAIL("Junk on end of regexp"); /* "Can't happen". */
4170 - regbranch - one alternative of an | operator
4172 * Implements the concatenation operator.
4175 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4178 register regnode *ret;
4179 register regnode *chain = NULL;
4180 register regnode *latest;
4181 I32 flags = 0, c = 0;
4182 GET_RE_DEBUG_FLAGS_DECL;
4183 DEBUG_PARSE("brnc");
4187 if (!SIZE_ONLY && RExC_extralen)
4188 ret = reganode(pRExC_state, BRANCHJ,0);
4190 ret = reg_node(pRExC_state, BRANCH);
4191 Set_Node_Length(ret, 1);
4195 if (!first && SIZE_ONLY)
4196 RExC_extralen += 1; /* BRANCHJ */
4198 *flagp = WORST; /* Tentatively. */
4201 nextchar(pRExC_state);
4202 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4204 latest = regpiece(pRExC_state, &flags,depth+1);
4205 if (latest == NULL) {
4206 if (flags & TRYAGAIN)
4210 else if (ret == NULL)
4212 *flagp |= flags&HASWIDTH;
4213 if (chain == NULL) /* First piece. */
4214 *flagp |= flags&SPSTART;
4217 REGTAIL(pRExC_state, chain, latest);
4222 if (chain == NULL) { /* Loop ran zero times. */
4223 chain = reg_node(pRExC_state, NOTHING);
4228 *flagp |= flags&SIMPLE;
4235 - regpiece - something followed by possible [*+?]
4237 * Note that the branching code sequences used for ? and the general cases
4238 * of * and + are somewhat optimized: they use the same NOTHING node as
4239 * both the endmarker for their branch list and the body of the last branch.
4240 * It might seem that this node could be dispensed with entirely, but the
4241 * endmarker role is not redundant.
4244 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4247 register regnode *ret;
4249 register char *next;
4251 const char * const origparse = RExC_parse;
4253 I32 max = REG_INFTY;
4255 GET_RE_DEBUG_FLAGS_DECL;
4256 DEBUG_PARSE("piec");
4258 ret = regatom(pRExC_state, &flags,depth+1);
4260 if (flags & TRYAGAIN)
4267 if (op == '{' && regcurly(RExC_parse)) {
4268 const char *maxpos = NULL;
4269 parse_start = RExC_parse; /* MJD */
4270 next = RExC_parse + 1;
4271 while (isDIGIT(*next) || *next == ',') {
4280 if (*next == '}') { /* got one */
4284 min = atoi(RExC_parse);
4288 maxpos = RExC_parse;
4290 if (!max && *maxpos != '0')
4291 max = REG_INFTY; /* meaning "infinity" */
4292 else if (max >= REG_INFTY)
4293 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4295 nextchar(pRExC_state);
4298 if ((flags&SIMPLE)) {
4299 RExC_naughty += 2 + RExC_naughty / 2;
4300 reginsert(pRExC_state, CURLY, ret);
4301 Set_Node_Offset(ret, parse_start+1); /* MJD */
4302 Set_Node_Cur_Length(ret);
4305 regnode * const w = reg_node(pRExC_state, WHILEM);
4308 REGTAIL(pRExC_state, ret, w);
4309 if (!SIZE_ONLY && RExC_extralen) {
4310 reginsert(pRExC_state, LONGJMP,ret);
4311 reginsert(pRExC_state, NOTHING,ret);
4312 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4314 reginsert(pRExC_state, CURLYX,ret);
4316 Set_Node_Offset(ret, parse_start+1);
4317 Set_Node_Length(ret,
4318 op == '{' ? (RExC_parse - parse_start) : 1);
4320 if (!SIZE_ONLY && RExC_extralen)
4321 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4322 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4324 RExC_whilem_seen++, RExC_extralen += 3;
4325 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4333 if (max && max < min)
4334 vFAIL("Can't do {n,m} with n > m");
4336 ARG1_SET(ret, (U16)min);
4337 ARG2_SET(ret, (U16)max);
4349 #if 0 /* Now runtime fix should be reliable. */
4351 /* if this is reinstated, don't forget to put this back into perldiag:
4353 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4355 (F) The part of the regexp subject to either the * or + quantifier
4356 could match an empty string. The {#} shows in the regular
4357 expression about where the problem was discovered.
4361 if (!(flags&HASWIDTH) && op != '?')
4362 vFAIL("Regexp *+ operand could be empty");
4365 parse_start = RExC_parse;
4366 nextchar(pRExC_state);
4368 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4370 if (op == '*' && (flags&SIMPLE)) {
4371 reginsert(pRExC_state, STAR, ret);
4375 else if (op == '*') {
4379 else if (op == '+' && (flags&SIMPLE)) {
4380 reginsert(pRExC_state, PLUS, ret);
4384 else if (op == '+') {
4388 else if (op == '?') {
4393 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4395 "%.*s matches null string many times",
4396 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4400 if (*RExC_parse == '?') {
4401 nextchar(pRExC_state);
4402 reginsert(pRExC_state, MINMOD, ret);
4403 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4405 if (ISMULT2(RExC_parse)) {
4407 vFAIL("Nested quantifiers");
4414 - regatom - the lowest level
4416 * Optimization: gobbles an entire sequence of ordinary characters so that
4417 * it can turn them into a single node, which is smaller to store and
4418 * faster to run. Backslashed characters are exceptions, each becoming a
4419 * separate node; the code is simpler that way and it's not worth fixing.
4421 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4422 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4425 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4428 register regnode *ret = NULL;
4430 char *parse_start = RExC_parse;
4431 GET_RE_DEBUG_FLAGS_DECL;
4432 DEBUG_PARSE("atom");
4433 *flagp = WORST; /* Tentatively. */
4436 switch (*RExC_parse) {
4438 RExC_seen_zerolen++;
4439 nextchar(pRExC_state);
4440 if (RExC_flags & PMf_MULTILINE)
4441 ret = reg_node(pRExC_state, MBOL);
4442 else if (RExC_flags & PMf_SINGLELINE)
4443 ret = reg_node(pRExC_state, SBOL);
4445 ret = reg_node(pRExC_state, BOL);
4446 Set_Node_Length(ret, 1); /* MJD */
4449 nextchar(pRExC_state);
4451 RExC_seen_zerolen++;
4452 if (RExC_flags & PMf_MULTILINE)
4453 ret = reg_node(pRExC_state, MEOL);
4454 else if (RExC_flags & PMf_SINGLELINE)
4455 ret = reg_node(pRExC_state, SEOL);
4457 ret = reg_node(pRExC_state, EOL);
4458 Set_Node_Length(ret, 1); /* MJD */
4461 nextchar(pRExC_state);
4462 if (RExC_flags & PMf_SINGLELINE)
4463 ret = reg_node(pRExC_state, SANY);
4465 ret = reg_node(pRExC_state, REG_ANY);
4466 *flagp |= HASWIDTH|SIMPLE;
4468 Set_Node_Length(ret, 1); /* MJD */
4472 char * const oregcomp_parse = ++RExC_parse;
4473 ret = regclass(pRExC_state,depth+1);
4474 if (*RExC_parse != ']') {
4475 RExC_parse = oregcomp_parse;
4476 vFAIL("Unmatched [");
4478 nextchar(pRExC_state);
4479 *flagp |= HASWIDTH|SIMPLE;
4480 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4484 nextchar(pRExC_state);
4485 ret = reg(pRExC_state, 1, &flags,depth+1);
4487 if (flags & TRYAGAIN) {
4488 if (RExC_parse == RExC_end) {
4489 /* Make parent create an empty node if needed. */
4497 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4501 if (flags & TRYAGAIN) {
4505 vFAIL("Internal urp");
4506 /* Supposed to be caught earlier. */
4509 if (!regcurly(RExC_parse)) {
4518 vFAIL("Quantifier follows nothing");
4521 switch (*++RExC_parse) {
4523 RExC_seen_zerolen++;
4524 ret = reg_node(pRExC_state, SBOL);
4526 nextchar(pRExC_state);
4527 Set_Node_Length(ret, 2); /* MJD */
4530 ret = reg_node(pRExC_state, GPOS);
4531 RExC_seen |= REG_SEEN_GPOS;
4533 nextchar(pRExC_state);
4534 Set_Node_Length(ret, 2); /* MJD */
4537 ret = reg_node(pRExC_state, SEOL);
4539 RExC_seen_zerolen++; /* Do not optimize RE away */
4540 nextchar(pRExC_state);
4543 ret = reg_node(pRExC_state, EOS);
4545 RExC_seen_zerolen++; /* Do not optimize RE away */
4546 nextchar(pRExC_state);
4547 Set_Node_Length(ret, 2); /* MJD */
4550 ret = reg_node(pRExC_state, CANY);
4551 RExC_seen |= REG_SEEN_CANY;
4552 *flagp |= HASWIDTH|SIMPLE;
4553 nextchar(pRExC_state);
4554 Set_Node_Length(ret, 2); /* MJD */
4557 ret = reg_node(pRExC_state, CLUMP);
4559 nextchar(pRExC_state);
4560 Set_Node_Length(ret, 2); /* MJD */
4563 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4564 *flagp |= HASWIDTH|SIMPLE;
4565 nextchar(pRExC_state);
4566 Set_Node_Length(ret, 2); /* MJD */
4569 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4570 *flagp |= HASWIDTH|SIMPLE;
4571 nextchar(pRExC_state);
4572 Set_Node_Length(ret, 2); /* MJD */
4575 RExC_seen_zerolen++;
4576 RExC_seen |= REG_SEEN_LOOKBEHIND;
4577 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4579 nextchar(pRExC_state);
4580 Set_Node_Length(ret, 2); /* MJD */
4583 RExC_seen_zerolen++;
4584 RExC_seen |= REG_SEEN_LOOKBEHIND;
4585 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4587 nextchar(pRExC_state);
4588 Set_Node_Length(ret, 2); /* MJD */
4591 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4592 *flagp |= HASWIDTH|SIMPLE;
4593 nextchar(pRExC_state);
4594 Set_Node_Length(ret, 2); /* MJD */
4597 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4598 *flagp |= HASWIDTH|SIMPLE;
4599 nextchar(pRExC_state);
4600 Set_Node_Length(ret, 2); /* MJD */
4603 ret = reg_node(pRExC_state, DIGIT);
4604 *flagp |= HASWIDTH|SIMPLE;
4605 nextchar(pRExC_state);
4606 Set_Node_Length(ret, 2); /* MJD */
4609 ret = reg_node(pRExC_state, NDIGIT);
4610 *flagp |= HASWIDTH|SIMPLE;
4611 nextchar(pRExC_state);
4612 Set_Node_Length(ret, 2); /* MJD */
4617 char* const oldregxend = RExC_end;
4618 char* parse_start = RExC_parse - 2;
4620 if (RExC_parse[1] == '{') {
4621 /* a lovely hack--pretend we saw [\pX] instead */
4622 RExC_end = strchr(RExC_parse, '}');
4624 const U8 c = (U8)*RExC_parse;
4626 RExC_end = oldregxend;
4627 vFAIL2("Missing right brace on \\%c{}", c);
4632 RExC_end = RExC_parse + 2;
4633 if (RExC_end > oldregxend)
4634 RExC_end = oldregxend;
4638 ret = regclass(pRExC_state,depth+1);
4640 RExC_end = oldregxend;
4643 Set_Node_Offset(ret, parse_start + 2);
4644 Set_Node_Cur_Length(ret);
4645 nextchar(pRExC_state);
4646 *flagp |= HASWIDTH|SIMPLE;
4659 case '1': case '2': case '3': case '4':
4660 case '5': case '6': case '7': case '8': case '9':
4662 const I32 num = atoi(RExC_parse);
4664 if (num > 9 && num >= RExC_npar)
4667 char * const parse_start = RExC_parse - 1; /* MJD */
4668 while (isDIGIT(*RExC_parse))
4671 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4672 vFAIL("Reference to nonexistent group");
4674 ret = reganode(pRExC_state,
4675 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4679 /* override incorrect value set in reganode MJD */
4680 Set_Node_Offset(ret, parse_start+1);
4681 Set_Node_Cur_Length(ret); /* MJD */
4683 nextchar(pRExC_state);
4688 if (RExC_parse >= RExC_end)
4689 FAIL("Trailing \\");
4692 /* Do not generate "unrecognized" warnings here, we fall
4693 back into the quick-grab loop below */
4700 if (RExC_flags & PMf_EXTENDED) {
4701 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4703 if (RExC_parse < RExC_end)
4709 register STRLEN len;
4714 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4716 parse_start = RExC_parse - 1;
4722 ret = reg_node(pRExC_state,
4723 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4725 for (len = 0, p = RExC_parse - 1;
4726 len < 127 && p < RExC_end;
4729 char * const oldp = p;
4731 if (RExC_flags & PMf_EXTENDED)
4732 p = regwhite(p, RExC_end);
4779 ender = ASCII_TO_NATIVE('\033');
4783 ender = ASCII_TO_NATIVE('\007');
4788 char* const e = strchr(p, '}');
4792 vFAIL("Missing right brace on \\x{}");
4795 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4796 | PERL_SCAN_DISALLOW_PREFIX;
4797 STRLEN numlen = e - p - 1;
4798 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4805 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4807 ender = grok_hex(p, &numlen, &flags, NULL);
4813 ender = UCHARAT(p++);
4814 ender = toCTRL(ender);
4816 case '0': case '1': case '2': case '3':case '4':
4817 case '5': case '6': case '7': case '8':case '9':
4819 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4822 ender = grok_oct(p, &numlen, &flags, NULL);
4832 FAIL("Trailing \\");
4835 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4836 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4837 goto normal_default;
4842 if (UTF8_IS_START(*p) && UTF) {
4844 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4845 &numlen, UTF8_ALLOW_DEFAULT);
4852 if (RExC_flags & PMf_EXTENDED)
4853 p = regwhite(p, RExC_end);
4855 /* Prime the casefolded buffer. */
4856 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4858 if (ISMULT2(p)) { /* Back off on ?+*. */
4863 /* Emit all the Unicode characters. */
4865 for (foldbuf = tmpbuf;
4867 foldlen -= numlen) {
4868 ender = utf8_to_uvchr(foldbuf, &numlen);
4870 const STRLEN unilen = reguni(pRExC_state, ender, s);
4873 /* In EBCDIC the numlen
4874 * and unilen can differ. */
4876 if (numlen >= foldlen)
4880 break; /* "Can't happen." */
4884 const STRLEN unilen = reguni(pRExC_state, ender, s);
4893 REGC((char)ender, s++);
4899 /* Emit all the Unicode characters. */
4901 for (foldbuf = tmpbuf;
4903 foldlen -= numlen) {
4904 ender = utf8_to_uvchr(foldbuf, &numlen);
4906 const STRLEN unilen = reguni(pRExC_state, ender, s);
4909 /* In EBCDIC the numlen
4910 * and unilen can differ. */
4912 if (numlen >= foldlen)
4920 const STRLEN unilen = reguni(pRExC_state, ender, s);
4929 REGC((char)ender, s++);
4933 Set_Node_Cur_Length(ret); /* MJD */
4934 nextchar(pRExC_state);
4936 /* len is STRLEN which is unsigned, need to copy to signed */
4939 vFAIL("Internal disaster");
4943 if (len == 1 && UNI_IS_INVARIANT(ender))
4947 RExC_size += STR_SZ(len);
4950 RExC_emit += STR_SZ(len);
4956 /* If the encoding pragma is in effect recode the text of
4957 * any EXACT-kind nodes. */
4958 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4959 const STRLEN oldlen = STR_LEN(ret);
4960 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4964 if (sv_utf8_downgrade(sv, TRUE)) {
4965 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4966 const STRLEN newlen = SvCUR(sv);
4971 GET_RE_DEBUG_FLAGS_DECL;
4972 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4973 (int)oldlen, STRING(ret),
4975 Copy(s, STRING(ret), newlen, char);
4976 STR_LEN(ret) += newlen - oldlen;
4977 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4979 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4987 S_regwhite(char *p, const char *e)
4992 else if (*p == '#') {
4995 } while (p < e && *p != '\n');
5003 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5004 Character classes ([:foo:]) can also be negated ([:^foo:]).
5005 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5006 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
5007 but trigger failures because they are currently unimplemented. */
5009 #define POSIXCC_DONE(c) ((c) == ':')
5010 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5011 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5014 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5017 I32 namedclass = OOB_NAMEDCLASS;
5019 if (value == '[' && RExC_parse + 1 < RExC_end &&
5020 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5021 POSIXCC(UCHARAT(RExC_parse))) {
5022 const char c = UCHARAT(RExC_parse);
5023 char* const s = RExC_parse++;
5025 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5027 if (RExC_parse == RExC_end)
5028 /* Grandfather lone [:, [=, [. */
5031 const char* const t = RExC_parse++; /* skip over the c */
5034 if (UCHARAT(RExC_parse) == ']') {
5035 const char *posixcc = s + 1;
5036 RExC_parse++; /* skip over the ending ] */
5039 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5040 const I32 skip = t - posixcc;
5042 /* Initially switch on the length of the name. */
5045 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5046 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5049 /* Names all of length 5. */
5050 /* alnum alpha ascii blank cntrl digit graph lower
5051 print punct space upper */
5052 /* Offset 4 gives the best switch position. */
5053 switch (posixcc[4]) {
5055 if (memEQ(posixcc, "alph", 4)) /* alpha */
5056 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5059 if (memEQ(posixcc, "spac", 4)) /* space */
5060 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5063 if (memEQ(posixcc, "grap", 4)) /* graph */
5064 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5067 if (memEQ(posixcc, "asci", 4)) /* ascii */
5068 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5071 if (memEQ(posixcc, "blan", 4)) /* blank */
5072 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5075 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5076 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5079 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5080 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5083 if (memEQ(posixcc, "lowe", 4)) /* lower */
5084 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5085 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5086 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5089 if (memEQ(posixcc, "digi", 4)) /* digit */
5090 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5091 else if (memEQ(posixcc, "prin", 4)) /* print */
5092 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5093 else if (memEQ(posixcc, "punc", 4)) /* punct */
5094 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5099 if (memEQ(posixcc, "xdigit", 6))
5100 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5104 if (namedclass == OOB_NAMEDCLASS)
5105 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5107 assert (posixcc[skip] == ':');
5108 assert (posixcc[skip+1] == ']');
5109 } else if (!SIZE_ONLY) {
5110 /* [[=foo=]] and [[.foo.]] are still future. */
5112 /* adjust RExC_parse so the warning shows after
5114 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5116 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5119 /* Maternal grandfather:
5120 * "[:" ending in ":" but not in ":]" */
5130 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5133 if (POSIXCC(UCHARAT(RExC_parse))) {
5134 const char *s = RExC_parse;
5135 const char c = *s++;
5139 if (*s && c == *s && s[1] == ']') {
5140 if (ckWARN(WARN_REGEXP))
5142 "POSIX syntax [%c %c] belongs inside character classes",
5145 /* [[=foo=]] and [[.foo.]] are still future. */
5146 if (POSIXCC_NOTYET(c)) {
5147 /* adjust RExC_parse so the error shows after
5149 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5151 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5159 parse a class specification and produce either an ANYOF node that
5160 matches the pattern. If the pattern matches a single char only and
5161 that char is < 256 then we produce an EXACT node instead.
5164 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5168 register UV nextvalue;
5169 register IV prevvalue = OOB_UNICODE;
5170 register IV range = 0;
5171 register regnode *ret;
5174 char *rangebegin = NULL;
5175 bool need_class = 0;
5178 bool optimize_invert = TRUE;
5179 AV* unicode_alternate = NULL;
5181 UV literal_endpoint = 0;
5183 UV stored = 0; /* number of chars stored in the class */
5185 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5186 case we need to change the emitted regop to an EXACT. */
5187 const char * orig_parse = RExC_parse;
5188 GET_RE_DEBUG_FLAGS_DECL;
5190 PERL_UNUSED_ARG(depth);
5193 DEBUG_PARSE("clas");
5195 /* Assume we are going to generate an ANYOF node. */
5196 ret = reganode(pRExC_state, ANYOF, 0);
5199 ANYOF_FLAGS(ret) = 0;
5201 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5205 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5209 RExC_size += ANYOF_SKIP;
5210 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5213 RExC_emit += ANYOF_SKIP;
5215 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5217 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5218 ANYOF_BITMAP_ZERO(ret);
5219 listsv = newSVpvs("# comment\n");
5222 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5224 if (!SIZE_ONLY && POSIXCC(nextvalue))
5225 checkposixcc(pRExC_state);
5227 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5228 if (UCHARAT(RExC_parse) == ']')
5231 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5235 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5238 rangebegin = RExC_parse;
5240 value = utf8n_to_uvchr((U8*)RExC_parse,
5241 RExC_end - RExC_parse,
5242 &numlen, UTF8_ALLOW_DEFAULT);
5243 RExC_parse += numlen;
5246 value = UCHARAT(RExC_parse++);
5248 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5249 if (value == '[' && POSIXCC(nextvalue))
5250 namedclass = regpposixcc(pRExC_state, value);
5251 else if (value == '\\') {
5253 value = utf8n_to_uvchr((U8*)RExC_parse,
5254 RExC_end - RExC_parse,
5255 &numlen, UTF8_ALLOW_DEFAULT);
5256 RExC_parse += numlen;
5259 value = UCHARAT(RExC_parse++);
5260 /* Some compilers cannot handle switching on 64-bit integer
5261 * values, therefore value cannot be an UV. Yes, this will
5262 * be a problem later if we want switch on Unicode.
5263 * A similar issue a little bit later when switching on
5264 * namedclass. --jhi */
5265 switch ((I32)value) {
5266 case 'w': namedclass = ANYOF_ALNUM; break;
5267 case 'W': namedclass = ANYOF_NALNUM; break;
5268 case 's': namedclass = ANYOF_SPACE; break;
5269 case 'S': namedclass = ANYOF_NSPACE; break;
5270 case 'd': namedclass = ANYOF_DIGIT; break;
5271 case 'D': namedclass = ANYOF_NDIGIT; break;
5276 if (RExC_parse >= RExC_end)
5277 vFAIL2("Empty \\%c{}", (U8)value);
5278 if (*RExC_parse == '{') {
5279 const U8 c = (U8)value;
5280 e = strchr(RExC_parse++, '}');
5282 vFAIL2("Missing right brace on \\%c{}", c);
5283 while (isSPACE(UCHARAT(RExC_parse)))
5285 if (e == RExC_parse)
5286 vFAIL2("Empty \\%c{}", c);
5288 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5296 if (UCHARAT(RExC_parse) == '^') {
5299 value = value == 'p' ? 'P' : 'p'; /* toggle */
5300 while (isSPACE(UCHARAT(RExC_parse))) {
5305 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5306 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5309 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5310 namedclass = ANYOF_MAX; /* no official name, but it's named */
5313 case 'n': value = '\n'; break;
5314 case 'r': value = '\r'; break;
5315 case 't': value = '\t'; break;
5316 case 'f': value = '\f'; break;
5317 case 'b': value = '\b'; break;
5318 case 'e': value = ASCII_TO_NATIVE('\033');break;
5319 case 'a': value = ASCII_TO_NATIVE('\007');break;
5321 if (*RExC_parse == '{') {
5322 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5323 | PERL_SCAN_DISALLOW_PREFIX;
5324 char * const e = strchr(RExC_parse++, '}');
5326 vFAIL("Missing right brace on \\x{}");
5328 numlen = e - RExC_parse;
5329 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5333 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5335 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5336 RExC_parse += numlen;
5340 value = UCHARAT(RExC_parse++);
5341 value = toCTRL(value);
5343 case '0': case '1': case '2': case '3': case '4':
5344 case '5': case '6': case '7': case '8': case '9':
5348 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5349 RExC_parse += numlen;
5353 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5355 "Unrecognized escape \\%c in character class passed through",
5359 } /* end of \blah */
5365 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5367 if (!SIZE_ONLY && !need_class)
5368 ANYOF_CLASS_ZERO(ret);
5372 /* a bad range like a-\d, a-[:digit:] ? */
5375 if (ckWARN(WARN_REGEXP)) {
5377 RExC_parse >= rangebegin ?
5378 RExC_parse - rangebegin : 0;
5380 "False [] range \"%*.*s\"",
5383 if (prevvalue < 256) {
5384 ANYOF_BITMAP_SET(ret, prevvalue);
5385 ANYOF_BITMAP_SET(ret, '-');
5388 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5389 Perl_sv_catpvf(aTHX_ listsv,
5390 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5394 range = 0; /* this was not a true range */
5398 const char *what = NULL;
5401 if (namedclass > OOB_NAMEDCLASS)
5402 optimize_invert = FALSE;
5403 /* Possible truncation here but in some 64-bit environments
5404 * the compiler gets heartburn about switch on 64-bit values.
5405 * A similar issue a little earlier when switching on value.
5407 switch ((I32)namedclass) {
5410 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5412 for (value = 0; value < 256; value++)
5414 ANYOF_BITMAP_SET(ret, value);
5421 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5423 for (value = 0; value < 256; value++)
5424 if (!isALNUM(value))
5425 ANYOF_BITMAP_SET(ret, value);
5432 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5434 for (value = 0; value < 256; value++)
5435 if (isALNUMC(value))
5436 ANYOF_BITMAP_SET(ret, value);
5443 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5445 for (value = 0; value < 256; value++)
5446 if (!isALNUMC(value))
5447 ANYOF_BITMAP_SET(ret, value);
5454 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5456 for (value = 0; value < 256; value++)
5458 ANYOF_BITMAP_SET(ret, value);
5465 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5467 for (value = 0; value < 256; value++)
5468 if (!isALPHA(value))
5469 ANYOF_BITMAP_SET(ret, value);
5476 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5479 for (value = 0; value < 128; value++)
5480 ANYOF_BITMAP_SET(ret, value);
5482 for (value = 0; value < 256; value++) {
5484 ANYOF_BITMAP_SET(ret, value);
5493 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5496 for (value = 128; value < 256; value++)
5497 ANYOF_BITMAP_SET(ret, value);
5499 for (value = 0; value < 256; value++) {
5500 if (!isASCII(value))
5501 ANYOF_BITMAP_SET(ret, value);
5510 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5512 for (value = 0; value < 256; value++)
5514 ANYOF_BITMAP_SET(ret, value);
5521 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5523 for (value = 0; value < 256; value++)
5524 if (!isBLANK(value))
5525 ANYOF_BITMAP_SET(ret, value);
5532 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5534 for (value = 0; value < 256; value++)
5536 ANYOF_BITMAP_SET(ret, value);
5543 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5545 for (value = 0; value < 256; value++)
5546 if (!isCNTRL(value))
5547 ANYOF_BITMAP_SET(ret, value);
5554 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5556 /* consecutive digits assumed */
5557 for (value = '0'; value <= '9'; value++)
5558 ANYOF_BITMAP_SET(ret, value);
5565 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5567 /* consecutive digits assumed */
5568 for (value = 0; value < '0'; value++)
5569 ANYOF_BITMAP_SET(ret, value);
5570 for (value = '9' + 1; value < 256; value++)
5571 ANYOF_BITMAP_SET(ret, value);
5578 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5580 for (value = 0; value < 256; value++)
5582 ANYOF_BITMAP_SET(ret, value);
5589 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5591 for (value = 0; value < 256; value++)
5592 if (!isGRAPH(value))
5593 ANYOF_BITMAP_SET(ret, value);
5600 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5602 for (value = 0; value < 256; value++)
5604 ANYOF_BITMAP_SET(ret, value);
5611 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5613 for (value = 0; value < 256; value++)
5614 if (!isLOWER(value))
5615 ANYOF_BITMAP_SET(ret, value);
5622 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5624 for (value = 0; value < 256; value++)
5626 ANYOF_BITMAP_SET(ret, value);
5633 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5635 for (value = 0; value < 256; value++)
5636 if (!isPRINT(value))
5637 ANYOF_BITMAP_SET(ret, value);
5644 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5646 for (value = 0; value < 256; value++)
5647 if (isPSXSPC(value))
5648 ANYOF_BITMAP_SET(ret, value);
5655 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5657 for (value = 0; value < 256; value++)
5658 if (!isPSXSPC(value))
5659 ANYOF_BITMAP_SET(ret, value);
5666 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5668 for (value = 0; value < 256; value++)
5670 ANYOF_BITMAP_SET(ret, value);
5677 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5679 for (value = 0; value < 256; value++)
5680 if (!isPUNCT(value))
5681 ANYOF_BITMAP_SET(ret, value);
5688 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5690 for (value = 0; value < 256; value++)
5692 ANYOF_BITMAP_SET(ret, value);
5699 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5701 for (value = 0; value < 256; value++)
5702 if (!isSPACE(value))
5703 ANYOF_BITMAP_SET(ret, value);
5710 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5712 for (value = 0; value < 256; value++)
5714 ANYOF_BITMAP_SET(ret, value);
5721 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5723 for (value = 0; value < 256; value++)
5724 if (!isUPPER(value))
5725 ANYOF_BITMAP_SET(ret, value);
5732 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5734 for (value = 0; value < 256; value++)
5735 if (isXDIGIT(value))
5736 ANYOF_BITMAP_SET(ret, value);
5743 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5745 for (value = 0; value < 256; value++)
5746 if (!isXDIGIT(value))
5747 ANYOF_BITMAP_SET(ret, value);
5753 /* this is to handle \p and \P */
5756 vFAIL("Invalid [::] class");
5760 /* Strings such as "+utf8::isWord\n" */
5761 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5764 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5767 } /* end of namedclass \blah */
5770 if (prevvalue > (IV)value) /* b-a */ {
5771 const int w = RExC_parse - rangebegin;
5772 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5773 range = 0; /* not a valid range */
5777 prevvalue = value; /* save the beginning of the range */
5778 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5779 RExC_parse[1] != ']') {
5782 /* a bad range like \w-, [:word:]- ? */
5783 if (namedclass > OOB_NAMEDCLASS) {
5784 if (ckWARN(WARN_REGEXP)) {
5786 RExC_parse >= rangebegin ?
5787 RExC_parse - rangebegin : 0;
5789 "False [] range \"%*.*s\"",
5793 ANYOF_BITMAP_SET(ret, '-');
5795 range = 1; /* yeah, it's a range! */
5796 continue; /* but do it the next time */
5800 /* now is the next time */
5801 /*stored += (value - prevvalue + 1);*/
5803 if (prevvalue < 256) {
5804 const IV ceilvalue = value < 256 ? value : 255;
5807 /* In EBCDIC [\x89-\x91] should include
5808 * the \x8e but [i-j] should not. */
5809 if (literal_endpoint == 2 &&
5810 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5811 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5813 if (isLOWER(prevvalue)) {
5814 for (i = prevvalue; i <= ceilvalue; i++)
5816 ANYOF_BITMAP_SET(ret, i);
5818 for (i = prevvalue; i <= ceilvalue; i++)
5820 ANYOF_BITMAP_SET(ret, i);
5825 for (i = prevvalue; i <= ceilvalue; i++) {
5826 if (!ANYOF_BITMAP_TEST(ret,i)) {
5828 ANYOF_BITMAP_SET(ret, i);
5832 if (value > 255 || UTF) {
5833 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5834 const UV natvalue = NATIVE_TO_UNI(value);
5835 stored+=2; /* can't optimize this class */
5836 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5837 if (prevnatvalue < natvalue) { /* what about > ? */
5838 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5839 prevnatvalue, natvalue);
5841 else if (prevnatvalue == natvalue) {
5842 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5844 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5846 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5848 /* If folding and foldable and a single
5849 * character, insert also the folded version
5850 * to the charclass. */
5852 if (foldlen == (STRLEN)UNISKIP(f))
5853 Perl_sv_catpvf(aTHX_ listsv,
5856 /* Any multicharacter foldings
5857 * require the following transform:
5858 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5859 * where E folds into "pq" and F folds
5860 * into "rst", all other characters
5861 * fold to single characters. We save
5862 * away these multicharacter foldings,
5863 * to be later saved as part of the
5864 * additional "s" data. */
5867 if (!unicode_alternate)
5868 unicode_alternate = newAV();
5869 sv = newSVpvn((char*)foldbuf, foldlen);
5871 av_push(unicode_alternate, sv);
5875 /* If folding and the value is one of the Greek
5876 * sigmas insert a few more sigmas to make the
5877 * folding rules of the sigmas to work right.
5878 * Note that not all the possible combinations
5879 * are handled here: some of them are handled
5880 * by the standard folding rules, and some of
5881 * them (literal or EXACTF cases) are handled
5882 * during runtime in regexec.c:S_find_byclass(). */
5883 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5884 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5885 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5886 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5887 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5889 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5890 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5891 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5896 literal_endpoint = 0;
5900 range = 0; /* this range (if it was one) is done now */
5904 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5906 RExC_size += ANYOF_CLASS_ADD_SKIP;
5908 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5914 /****** !SIZE_ONLY AFTER HERE *********/
5916 if( stored == 1 && value < 256
5917 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5919 /* optimize single char class to an EXACT node
5920 but *only* when its not a UTF/high char */
5921 const char * cur_parse= RExC_parse;
5922 RExC_emit = (regnode *)orig_emit;
5923 RExC_parse = (char *)orig_parse;
5924 ret = reg_node(pRExC_state,
5925 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5926 RExC_parse = (char *)cur_parse;
5927 *STRING(ret)= (char)value;
5929 RExC_emit += STR_SZ(1);
5932 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5933 if ( /* If the only flag is folding (plus possibly inversion). */
5934 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5936 for (value = 0; value < 256; ++value) {
5937 if (ANYOF_BITMAP_TEST(ret, value)) {
5938 UV fold = PL_fold[value];
5941 ANYOF_BITMAP_SET(ret, fold);
5944 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5947 /* optimize inverted simple patterns (e.g. [^a-z]) */
5948 if (optimize_invert &&
5949 /* If the only flag is inversion. */
5950 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5951 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5952 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5953 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5956 AV * const av = newAV();
5958 /* The 0th element stores the character class description
5959 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5960 * to initialize the appropriate swash (which gets stored in
5961 * the 1st element), and also useful for dumping the regnode.
5962 * The 2nd element stores the multicharacter foldings,
5963 * used later (regexec.c:S_reginclass()). */
5964 av_store(av, 0, listsv);
5965 av_store(av, 1, NULL);
5966 av_store(av, 2, (SV*)unicode_alternate);
5967 rv = newRV_noinc((SV*)av);
5968 n = add_data(pRExC_state, 1, "s");
5969 RExC_rx->data->data[n] = (void*)rv;
5976 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5978 char* const retval = RExC_parse++;
5981 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5982 RExC_parse[2] == '#') {
5983 while (*RExC_parse != ')') {
5984 if (RExC_parse == RExC_end)
5985 FAIL("Sequence (?#... not terminated");
5991 if (RExC_flags & PMf_EXTENDED) {
5992 if (isSPACE(*RExC_parse)) {
5996 else if (*RExC_parse == '#') {
5997 while (RExC_parse < RExC_end)
5998 if (*RExC_parse++ == '\n') break;
6007 - reg_node - emit a node
6009 STATIC regnode * /* Location. */
6010 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
6013 register regnode *ptr;
6014 regnode * const ret = RExC_emit;
6015 GET_RE_DEBUG_FLAGS_DECL;
6018 SIZE_ALIGN(RExC_size);
6022 NODE_ALIGN_FILL(ret);
6024 FILL_ADVANCE_NODE(ptr, op);
6025 if (RExC_offsets) { /* MJD */
6026 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6027 "reg_node", __LINE__,
6029 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6030 ? "Overwriting end of array!\n" : "OK",
6031 (UV)(RExC_emit - RExC_emit_start),
6032 (UV)(RExC_parse - RExC_start),
6033 (UV)RExC_offsets[0]));
6034 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6043 - reganode - emit a node with an argument
6045 STATIC regnode * /* Location. */
6046 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6049 register regnode *ptr;
6050 regnode * const ret = RExC_emit;
6051 GET_RE_DEBUG_FLAGS_DECL;
6054 SIZE_ALIGN(RExC_size);
6059 NODE_ALIGN_FILL(ret);
6061 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6062 if (RExC_offsets) { /* MJD */
6063 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6067 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6068 "Overwriting end of array!\n" : "OK",
6069 (UV)(RExC_emit - RExC_emit_start),
6070 (UV)(RExC_parse - RExC_start),
6071 (UV)RExC_offsets[0]));
6072 Set_Cur_Node_Offset;
6081 - reguni - emit (if appropriate) a Unicode character
6084 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6087 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6091 - reginsert - insert an operator in front of already-emitted operand
6093 * Means relocating the operand.
6096 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6099 register regnode *src;
6100 register regnode *dst;
6101 register regnode *place;
6102 const int offset = regarglen[(U8)op];
6103 GET_RE_DEBUG_FLAGS_DECL;
6104 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6107 RExC_size += NODE_STEP_REGNODE + offset;
6112 RExC_emit += NODE_STEP_REGNODE + offset;
6114 while (src > opnd) {
6115 StructCopy(--src, --dst, regnode);
6116 if (RExC_offsets) { /* MJD 20010112 */
6117 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6121 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6122 ? "Overwriting end of array!\n" : "OK",
6123 (UV)(src - RExC_emit_start),
6124 (UV)(dst - RExC_emit_start),
6125 (UV)RExC_offsets[0]));
6126 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6127 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6132 place = opnd; /* Op node, where operand used to be. */
6133 if (RExC_offsets) { /* MJD */
6134 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6138 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6139 ? "Overwriting end of array!\n" : "OK",
6140 (UV)(place - RExC_emit_start),
6141 (UV)(RExC_parse - RExC_start),
6143 Set_Node_Offset(place, RExC_parse);
6144 Set_Node_Length(place, 1);
6146 src = NEXTOPER(place);
6147 FILL_ADVANCE_NODE(place, op);
6148 Zero(src, offset, regnode);
6152 - regtail - set the next-pointer at the end of a node chain of p to val.
6153 - SEE ALSO: regtail_study
6155 /* TODO: All three parms should be const */
6157 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6160 register regnode *scan;
6161 GET_RE_DEBUG_FLAGS_DECL;
6163 PERL_UNUSED_ARG(depth);
6169 /* Find last node. */
6172 regnode * const temp = regnext(scan);
6174 SV * const mysv=sv_newmortal();
6175 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6176 regprop(RExC_rx, mysv, scan);
6177 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6178 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6185 if (reg_off_by_arg[OP(scan)]) {
6186 ARG_SET(scan, val - scan);
6189 NEXT_OFF(scan) = val - scan;
6195 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6196 - Look for optimizable sequences at the same time.
6197 - currently only looks for EXACT chains.
6199 This is expermental code. The idea is to use this routine to perform
6200 in place optimizations on branches and groups as they are constructed,
6201 with the long term intention of removing optimization from study_chunk so
6202 that it is purely analytical.
6204 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6205 to control which is which.
6208 /* TODO: All four parms should be const */
6211 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6214 register regnode *scan;
6216 #ifdef EXPERIMENTAL_INPLACESCAN
6220 GET_RE_DEBUG_FLAGS_DECL;
6226 /* Find last node. */
6230 regnode * const temp = regnext(scan);
6231 #ifdef EXPERIMENTAL_INPLACESCAN
6232 if (PL_regkind[OP(scan)] == EXACT)
6233 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6241 if( exact == PSEUDO )
6243 else if ( exact != OP(scan) )
6252 SV * const mysv=sv_newmortal();
6253 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6254 regprop(RExC_rx, mysv, scan);
6255 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6256 SvPV_nolen_const(mysv),
6258 REG_NODE_NUM(scan));
6265 SV * const mysv_val=sv_newmortal();
6266 DEBUG_PARSE_MSG("");
6267 regprop(RExC_rx, mysv_val, val);
6268 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6269 SvPV_nolen_const(mysv_val),
6274 if (reg_off_by_arg[OP(scan)]) {
6275 ARG_SET(scan, val - scan);
6278 NEXT_OFF(scan) = val - scan;
6286 - regcurly - a little FSA that accepts {\d+,?\d*}
6289 S_regcurly(register const char *s)
6308 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6311 Perl_regdump(pTHX_ const regexp *r)
6315 SV * const sv = sv_newmortal();
6317 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
6319 /* Header fields of interest. */
6320 if (r->anchored_substr)
6321 PerlIO_printf(Perl_debug_log,
6322 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
6324 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
6325 SvPVX_const(r->anchored_substr),
6327 SvTAIL(r->anchored_substr) ? "$" : "",
6328 (IV)r->anchored_offset);
6329 else if (r->anchored_utf8)
6330 PerlIO_printf(Perl_debug_log,
6331 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
6333 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
6334 SvPVX_const(r->anchored_utf8),
6336 SvTAIL(r->anchored_utf8) ? "$" : "",
6337 (IV)r->anchored_offset);
6338 if (r->float_substr)
6339 PerlIO_printf(Perl_debug_log,
6340 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6342 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6343 SvPVX_const(r->float_substr),
6345 SvTAIL(r->float_substr) ? "$" : "",
6346 (IV)r->float_min_offset, (UV)r->float_max_offset);
6347 else if (r->float_utf8)
6348 PerlIO_printf(Perl_debug_log,
6349 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6351 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6352 SvPVX_const(r->float_utf8),
6354 SvTAIL(r->float_utf8) ? "$" : "",
6355 (IV)r->float_min_offset, (UV)r->float_max_offset);
6356 if (r->check_substr || r->check_utf8)
6357 PerlIO_printf(Perl_debug_log,
6358 r->check_substr == r->float_substr
6359 && r->check_utf8 == r->float_utf8
6360 ? "(checking floating" : "(checking anchored");
6361 if (r->reganch & ROPT_NOSCAN)
6362 PerlIO_printf(Perl_debug_log, " noscan");
6363 if (r->reganch & ROPT_CHECK_ALL)
6364 PerlIO_printf(Perl_debug_log, " isall");
6365 if (r->check_substr || r->check_utf8)
6366 PerlIO_printf(Perl_debug_log, ") ");
6368 if (r->regstclass) {
6369 regprop(r, sv, r->regstclass);
6370 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6372 if (r->reganch & ROPT_ANCH) {
6373 PerlIO_printf(Perl_debug_log, "anchored");
6374 if (r->reganch & ROPT_ANCH_BOL)
6375 PerlIO_printf(Perl_debug_log, "(BOL)");
6376 if (r->reganch & ROPT_ANCH_MBOL)
6377 PerlIO_printf(Perl_debug_log, "(MBOL)");
6378 if (r->reganch & ROPT_ANCH_SBOL)
6379 PerlIO_printf(Perl_debug_log, "(SBOL)");
6380 if (r->reganch & ROPT_ANCH_GPOS)
6381 PerlIO_printf(Perl_debug_log, "(GPOS)");
6382 PerlIO_putc(Perl_debug_log, ' ');
6384 if (r->reganch & ROPT_GPOS_SEEN)
6385 PerlIO_printf(Perl_debug_log, "GPOS ");
6386 if (r->reganch & ROPT_SKIP)
6387 PerlIO_printf(Perl_debug_log, "plus ");
6388 if (r->reganch & ROPT_IMPLICIT)
6389 PerlIO_printf(Perl_debug_log, "implicit ");
6390 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6391 if (r->reganch & ROPT_EVAL_SEEN)
6392 PerlIO_printf(Perl_debug_log, "with eval ");
6393 PerlIO_printf(Perl_debug_log, "\n");
6395 PERL_UNUSED_CONTEXT;
6397 #endif /* DEBUGGING */
6401 - regprop - printable representation of opcode
6404 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6410 sv_setpvn(sv, "", 0);
6411 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6412 /* It would be nice to FAIL() here, but this may be called from
6413 regexec.c, and it would be hard to supply pRExC_state. */
6414 Perl_croak(aTHX_ "Corrupted regexp opcode");
6415 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6417 k = PL_regkind[OP(o)];
6420 SV * const dsv = sv_2mortal(newSVpvs(""));
6421 /* Using is_utf8_string() is a crude hack but it may
6422 * be the best for now since we have no flag "this EXACTish
6423 * node was UTF-8" --jhi */
6424 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6425 const char * const s = do_utf8 ?
6426 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6427 UNI_DISPLAY_REGEX) :
6429 const int len = do_utf8 ?
6432 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6436 } else if (k == TRIE) {
6437 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6438 /* print the details of the trie in dumpuntil instead, as
6439 * prog->data isn't available here */
6440 } else if (k == CURLY) {
6441 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6442 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6443 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6445 else if (k == WHILEM && o->flags) /* Ordinal/of */
6446 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6447 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6448 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6449 else if (k == LOGICAL)
6450 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6451 else if (k == ANYOF) {
6452 int i, rangestart = -1;
6453 const U8 flags = ANYOF_FLAGS(o);
6455 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6456 static const char * const anyofs[] = {
6489 if (flags & ANYOF_LOCALE)
6490 sv_catpvs(sv, "{loc}");
6491 if (flags & ANYOF_FOLD)
6492 sv_catpvs(sv, "{i}");
6493 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6494 if (flags & ANYOF_INVERT)
6496 for (i = 0; i <= 256; i++) {
6497 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6498 if (rangestart == -1)
6500 } else if (rangestart != -1) {
6501 if (i <= rangestart + 3)
6502 for (; rangestart < i; rangestart++)
6503 put_byte(sv, rangestart);
6505 put_byte(sv, rangestart);
6507 put_byte(sv, i - 1);
6513 if (o->flags & ANYOF_CLASS)
6514 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6515 if (ANYOF_CLASS_TEST(o,i))
6516 sv_catpv(sv, anyofs[i]);
6518 if (flags & ANYOF_UNICODE)
6519 sv_catpvs(sv, "{unicode}");
6520 else if (flags & ANYOF_UNICODE_ALL)
6521 sv_catpvs(sv, "{unicode_all}");
6525 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6529 U8 s[UTF8_MAXBYTES_CASE+1];
6531 for (i = 0; i <= 256; i++) { /* just the first 256 */
6532 uvchr_to_utf8(s, i);
6534 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6535 if (rangestart == -1)
6537 } else if (rangestart != -1) {
6538 if (i <= rangestart + 3)
6539 for (; rangestart < i; rangestart++) {
6540 const U8 * const e = uvchr_to_utf8(s,rangestart);
6542 for(p = s; p < e; p++)
6546 const U8 *e = uvchr_to_utf8(s,rangestart);
6548 for (p = s; p < e; p++)
6551 e = uvchr_to_utf8(s, i-1);
6552 for (p = s; p < e; p++)
6559 sv_catpvs(sv, "..."); /* et cetera */
6563 char *s = savesvpv(lv);
6564 char * const origs = s;
6566 while (*s && *s != '\n')
6570 const char * const t = ++s;
6588 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6590 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6591 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6593 PERL_UNUSED_CONTEXT;
6594 PERL_UNUSED_ARG(sv);
6596 PERL_UNUSED_ARG(prog);
6597 #endif /* DEBUGGING */
6601 Perl_re_intuit_string(pTHX_ regexp *prog)
6602 { /* Assume that RE_INTUIT is set */
6604 GET_RE_DEBUG_FLAGS_DECL;
6605 PERL_UNUSED_CONTEXT;
6609 const char * const s = SvPV_nolen_const(prog->check_substr
6610 ? prog->check_substr : prog->check_utf8);
6612 if (!PL_colorset) reginitcolors();
6613 PerlIO_printf(Perl_debug_log,
6614 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6616 prog->check_substr ? "" : "utf8 ",
6617 PL_colors[5],PL_colors[0],
6620 (strlen(s) > 60 ? "..." : ""));
6623 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6627 Perl_pregfree(pTHX_ struct regexp *r)
6631 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6633 GET_RE_DEBUG_FLAGS_DECL;
6635 if (!r || (--r->refcnt > 0))
6637 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
6638 const char * const s = (r->reganch & ROPT_UTF8)
6639 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6640 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6641 const int len = SvCUR(dsv);
6644 PerlIO_printf(Perl_debug_log,
6645 "%sFreeing REx:%s %s%*.*s%s%s\n",
6646 PL_colors[4],PL_colors[5],PL_colors[0],
6649 len > 60 ? "..." : "");
6652 /* gcov results gave these as non-null 100% of the time, so there's no
6653 optimisation in checking them before calling Safefree */
6654 Safefree(r->precomp);
6655 Safefree(r->offsets); /* 20010421 MJD */
6656 RX_MATCH_COPY_FREE(r);
6657 #ifdef PERL_OLD_COPY_ON_WRITE
6659 SvREFCNT_dec(r->saved_copy);
6662 if (r->anchored_substr)
6663 SvREFCNT_dec(r->anchored_substr);
6664 if (r->anchored_utf8)
6665 SvREFCNT_dec(r->anchored_utf8);
6666 if (r->float_substr)
6667 SvREFCNT_dec(r->float_substr);
6669 SvREFCNT_dec(r->float_utf8);
6670 Safefree(r->substrs);
6673 int n = r->data->count;
6674 PAD* new_comppad = NULL;
6679 /* If you add a ->what type here, update the comment in regcomp.h */
6680 switch (r->data->what[n]) {
6682 SvREFCNT_dec((SV*)r->data->data[n]);
6685 Safefree(r->data->data[n]);
6688 new_comppad = (AV*)r->data->data[n];
6691 if (new_comppad == NULL)
6692 Perl_croak(aTHX_ "panic: pregfree comppad");
6693 PAD_SAVE_LOCAL(old_comppad,
6694 /* Watch out for global destruction's random ordering. */
6695 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6698 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6701 op_free((OP_4tree*)r->data->data[n]);
6703 PAD_RESTORE_LOCAL(old_comppad);
6704 SvREFCNT_dec((SV*)new_comppad);
6710 { /* Aho Corasick add-on structure for a trie node.
6711 Used in stclass optimization only */
6713 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6715 refcount = --aho->refcount;
6718 Safefree(aho->states);
6719 Safefree(aho->fail);
6720 aho->trie=NULL; /* not necessary to free this as it is
6721 handled by the 't' case */
6722 Safefree(r->data->data[n]); /* do this last!!!! */
6723 Safefree(r->regstclass);
6729 /* trie structure. */
6731 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6733 refcount = --trie->refcount;
6736 Safefree(trie->charmap);
6737 if (trie->widecharmap)
6738 SvREFCNT_dec((SV*)trie->widecharmap);
6739 Safefree(trie->states);
6740 Safefree(trie->trans);
6742 Safefree(trie->bitmap);
6744 Safefree(trie->wordlen);
6748 SvREFCNT_dec((SV*)trie->words);
6749 if (trie->revcharmap)
6750 SvREFCNT_dec((SV*)trie->revcharmap);
6753 Safefree(r->data->data[n]); /* do this last!!!! */
6758 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6761 Safefree(r->data->what);
6764 Safefree(r->startp);
6769 #ifndef PERL_IN_XSUB_RE
6771 - regnext - dig the "next" pointer out of a node
6774 Perl_regnext(pTHX_ register regnode *p)
6777 register I32 offset;
6779 if (p == &PL_regdummy)
6782 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6791 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6794 STRLEN l1 = strlen(pat1);
6795 STRLEN l2 = strlen(pat2);
6798 const char *message;
6804 Copy(pat1, buf, l1 , char);
6805 Copy(pat2, buf + l1, l2 , char);
6806 buf[l1 + l2] = '\n';
6807 buf[l1 + l2 + 1] = '\0';
6809 /* ANSI variant takes additional second argument */
6810 va_start(args, pat2);
6814 msv = vmess(buf, &args);
6816 message = SvPV_const(msv,l1);
6819 Copy(message, buf, l1 , char);
6820 buf[l1-1] = '\0'; /* Overwrite \n */
6821 Perl_croak(aTHX_ "%s", buf);
6824 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6826 #ifndef PERL_IN_XSUB_RE
6828 Perl_save_re_context(pTHX)
6832 struct re_save_state *state;
6834 SAVEVPTR(PL_curcop);
6835 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6837 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6838 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6839 SSPUSHINT(SAVEt_RE_STATE);
6841 Copy(&PL_reg_state, state, 1, struct re_save_state);
6843 PL_reg_start_tmp = 0;
6844 PL_reg_start_tmpl = 0;
6845 PL_reg_oldsaved = NULL;
6846 PL_reg_oldsavedlen = 0;
6848 PL_reg_leftiter = 0;
6849 PL_reg_poscache = NULL;
6850 PL_reg_poscache_size = 0;
6851 #ifdef PERL_OLD_COPY_ON_WRITE
6855 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6857 const REGEXP * const rx = PM_GETRE(PL_curpm);
6860 for (i = 1; i <= rx->nparens; i++) {
6861 char digits[TYPE_CHARS(long)];
6862 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6863 GV *const *const gvp
6864 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6867 GV * const gv = *gvp;
6868 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6878 clear_re(pTHX_ void *r)
6881 ReREFCNT_dec((regexp *)r);
6887 S_put_byte(pTHX_ SV *sv, int c)
6889 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6890 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6891 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6892 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6894 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6897 #define CLEAR_OPTSTART \
6898 if (optstart) STMT_START { \
6899 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
6903 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6905 STATIC const regnode *
6906 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6907 const regnode *last, SV* sv, I32 l)
6910 register U8 op = EXACT; /* Arbitrary non-END op. */
6911 register const regnode *next;
6912 const regnode *optstart= NULL;
6913 GET_RE_DEBUG_FLAGS_DECL;
6915 while (op != END && (!last || node < last)) {
6916 /* While that wasn't END last time... */
6922 next = regnext((regnode *)node);
6925 if (OP(node) == OPTIMIZED) {
6926 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
6933 regprop(r, sv, node);
6934 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6935 (int)(2*l + 1), "", SvPVX_const(sv));
6937 if (OP(node) != OPTIMIZED) {
6938 if (next == NULL) /* Next ptr. */
6939 PerlIO_printf(Perl_debug_log, "(0)");
6941 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6942 (void)PerlIO_putc(Perl_debug_log, '\n');
6946 if (PL_regkind[(U8)op] == BRANCHJ) {
6949 register const regnode *nnode = (OP(next) == LONGJMP
6950 ? regnext((regnode *)next)
6952 if (last && nnode > last)
6954 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6957 else if (PL_regkind[(U8)op] == BRANCH) {
6959 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6961 else if ( PL_regkind[(U8)op] == TRIE ) {
6962 const I32 n = ARG(node);
6963 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6964 const I32 arry_len = av_len(trie->words)+1;
6966 PerlIO_printf(Perl_debug_log,
6967 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
6971 TRIE_WORDCOUNT(trie),
6972 (int)TRIE_CHARCOUNT(trie),
6973 trie->uniquecharcount,
6974 (IV)TRIE_LASTSTATE(trie)-1,
6981 sv_setpvn(sv, "", 0);
6982 for (i = 0; i <= 256; i++) {
6983 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6984 if (rangestart == -1)
6986 } else if (rangestart != -1) {
6987 if (i <= rangestart + 3)
6988 for (; rangestart < i; rangestart++)
6989 put_byte(sv, rangestart);
6991 put_byte(sv, rangestart);
6993 put_byte(sv, i - 1);
6998 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
7000 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
7002 for (word_idx=0; word_idx < arry_len; word_idx++) {
7003 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
7005 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
7008 SvPV_nolen_const(*elem_ptr),
7014 node = NEXTOPER(node);
7015 node += regarglen[(U8)op];
7018 else if ( op == CURLY) { /* "next" might be very big: optimizer */
7019 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7020 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7022 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7024 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
7027 else if ( op == PLUS || op == STAR) {
7028 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
7030 else if (op == ANYOF) {
7031 /* arglen 1 + class block */
7032 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7033 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7034 node = NEXTOPER(node);
7036 else if (PL_regkind[(U8)op] == EXACT) {
7037 /* Literal string, where present. */
7038 node += NODE_SZ_STR(node) - 1;
7039 node = NEXTOPER(node);
7042 node = NEXTOPER(node);
7043 node += regarglen[(U8)op];
7045 if (op == CURLYX || op == OPEN)
7047 else if (op == WHILEM)
7054 #endif /* DEBUGGING */
7058 * c-indentation-style: bsd
7060 * indent-tabs-mode: t
7063 * ex: set ts=8 sts=4 sw=4 noet: