5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static const scan_data_t zero_scan_data =
199 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
259 #define FAIL(msg) STMT_START { \
260 const char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
275 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
277 #define Simple_vFAIL(m) STMT_START { \
278 const IV offset = RExC_parse - RExC_precomp; \
279 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
280 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
284 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
286 #define vFAIL(m) STMT_START { \
288 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
293 * Like Simple_vFAIL(), but accepts two arguments.
295 #define Simple_vFAIL2(m,a1) STMT_START { \
296 const IV offset = RExC_parse - RExC_precomp; \
297 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
298 (int)offset, RExC_precomp, RExC_precomp + offset); \
302 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
304 #define vFAIL2(m,a1) STMT_START { \
306 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
307 Simple_vFAIL2(m, a1); \
312 * Like Simple_vFAIL(), but accepts three arguments.
314 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
315 const IV offset = RExC_parse - RExC_precomp; \
316 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
317 (int)offset, RExC_precomp, RExC_precomp + offset); \
321 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
323 #define vFAIL3(m,a1,a2) STMT_START { \
325 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
326 Simple_vFAIL3(m, a1, a2); \
330 * Like Simple_vFAIL(), but accepts four arguments.
332 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
333 const IV offset = RExC_parse - RExC_precomp; \
334 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
335 (int)offset, RExC_precomp, RExC_precomp + offset); \
338 #define vWARN(loc,m) STMT_START { \
339 const IV offset = loc - RExC_precomp; \
340 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
341 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
344 #define vWARNdep(loc,m) STMT_START { \
345 const IV offset = loc - RExC_precomp; \
346 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
347 "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
352 #define vWARN2(loc, m, a1) STMT_START { \
353 const IV offset = loc - RExC_precomp; \
354 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
355 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
358 #define vWARN3(loc, m, a1, a2) STMT_START { \
359 const IV offset = loc - RExC_precomp; \
360 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
361 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
364 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
365 const IV offset = loc - RExC_precomp; \
366 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
367 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
370 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
371 const IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
373 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
377 /* Allow for side effects in s */
378 #define REGC(c,s) STMT_START { \
379 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
382 /* Macros for recording node offsets. 20001227 mjd@plover.com
383 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
384 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
385 * Element 0 holds the number n.
388 #define MJD_OFFSET_DEBUG(x)
389 /* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
392 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
394 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
395 __LINE__, (node), (byte))); \
397 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
399 RExC_offsets[2*(node)-1] = (byte); \
404 #define Set_Node_Offset(node,byte) \
405 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
406 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
408 #define Set_Node_Length_To_R(node,len) STMT_START { \
410 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
411 __LINE__, (int)(node), (int)(len))); \
413 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
415 RExC_offsets[2*(node)] = (len); \
420 #define Set_Node_Length(node,len) \
421 Set_Node_Length_To_R((node)-RExC_emit_start, len)
422 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
423 #define Set_Node_Cur_Length(node) \
424 Set_Node_Length(node, RExC_parse - parse_start)
426 /* Get offsets and lengths */
427 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
428 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
430 static void clear_re(pTHX_ void *r);
432 /* Mark that we cannot extend a found fixed substring at this point.
433 Updata the longest found anchored substring and the longest found
434 floating substrings if needed. */
437 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
439 const STRLEN l = CHR_SVLEN(data->last_found);
440 const STRLEN old_l = CHR_SVLEN(*data->longest);
442 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
443 SvSetMagicSV(*data->longest, data->last_found);
444 if (*data->longest == data->longest_fixed) {
445 data->offset_fixed = l ? data->last_start_min : data->pos_min;
446 if (data->flags & SF_BEFORE_EOL)
448 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
450 data->flags &= ~SF_FIX_BEFORE_EOL;
453 data->offset_float_min = l ? data->last_start_min : data->pos_min;
454 data->offset_float_max = (l
455 ? data->last_start_max
456 : data->pos_min + data->pos_delta);
457 if ((U32)data->offset_float_max > (U32)I32_MAX)
458 data->offset_float_max = I32_MAX;
459 if (data->flags & SF_BEFORE_EOL)
461 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
463 data->flags &= ~SF_FL_BEFORE_EOL;
466 SvCUR_set(data->last_found, 0);
468 SV * const sv = data->last_found;
469 if (SvUTF8(sv) && SvMAGICAL(sv)) {
470 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
476 data->flags &= ~SF_BEFORE_EOL;
479 /* Can match anything (initialization) */
481 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
483 ANYOF_CLASS_ZERO(cl);
484 ANYOF_BITMAP_SETALL(cl);
485 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
487 cl->flags |= ANYOF_LOCALE;
490 /* Can match anything (initialization) */
492 S_cl_is_anything(const struct regnode_charclass_class *cl)
496 for (value = 0; value <= ANYOF_MAX; value += 2)
497 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
499 if (!(cl->flags & ANYOF_UNICODE_ALL))
501 if (!ANYOF_BITMAP_TESTALLSET(cl))
506 /* Can match anything (initialization) */
508 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
510 Zero(cl, 1, struct regnode_charclass_class);
512 cl_anything(pRExC_state, cl);
516 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
518 Zero(cl, 1, struct regnode_charclass_class);
520 cl_anything(pRExC_state, cl);
522 cl->flags |= ANYOF_LOCALE;
525 /* 'And' a given class with another one. Can create false positives */
526 /* We assume that cl is not inverted */
528 S_cl_and(struct regnode_charclass_class *cl,
529 const struct regnode_charclass_class *and_with)
531 if (!(and_with->flags & ANYOF_CLASS)
532 && !(cl->flags & ANYOF_CLASS)
533 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
534 && !(and_with->flags & ANYOF_FOLD)
535 && !(cl->flags & ANYOF_FOLD)) {
538 if (and_with->flags & ANYOF_INVERT)
539 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
540 cl->bitmap[i] &= ~and_with->bitmap[i];
542 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
543 cl->bitmap[i] &= and_with->bitmap[i];
544 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
545 if (!(and_with->flags & ANYOF_EOS))
546 cl->flags &= ~ANYOF_EOS;
548 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
549 !(and_with->flags & ANYOF_INVERT)) {
550 cl->flags &= ~ANYOF_UNICODE_ALL;
551 cl->flags |= ANYOF_UNICODE;
552 ARG_SET(cl, ARG(and_with));
554 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
555 !(and_with->flags & ANYOF_INVERT))
556 cl->flags &= ~ANYOF_UNICODE_ALL;
557 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
558 !(and_with->flags & ANYOF_INVERT))
559 cl->flags &= ~ANYOF_UNICODE;
562 /* 'OR' a given class with another one. Can create false positives */
563 /* We assume that cl is not inverted */
565 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
567 if (or_with->flags & ANYOF_INVERT) {
569 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
570 * <= (B1 | !B2) | (CL1 | !CL2)
571 * which is wasteful if CL2 is small, but we ignore CL2:
572 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
573 * XXXX Can we handle case-fold? Unclear:
574 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
575 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
577 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
578 && !(or_with->flags & ANYOF_FOLD)
579 && !(cl->flags & ANYOF_FOLD) ) {
582 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583 cl->bitmap[i] |= ~or_with->bitmap[i];
584 } /* XXXX: logic is complicated otherwise */
586 cl_anything(pRExC_state, cl);
589 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
590 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
591 && (!(or_with->flags & ANYOF_FOLD)
592 || (cl->flags & ANYOF_FOLD)) ) {
595 /* OR char bitmap and class bitmap separately */
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= or_with->bitmap[i];
598 if (or_with->flags & ANYOF_CLASS) {
599 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
600 cl->classflags[i] |= or_with->classflags[i];
601 cl->flags |= ANYOF_CLASS;
604 else { /* XXXX: logic is complicated, leave it along for a moment. */
605 cl_anything(pRExC_state, cl);
608 if (or_with->flags & ANYOF_EOS)
609 cl->flags |= ANYOF_EOS;
611 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
612 ARG(cl) != ARG(or_with)) {
613 cl->flags |= ANYOF_UNICODE_ALL;
614 cl->flags &= ~ANYOF_UNICODE;
616 if (or_with->flags & ANYOF_UNICODE_ALL) {
617 cl->flags |= ANYOF_UNICODE_ALL;
618 cl->flags &= ~ANYOF_UNICODE;
624 make_trie(startbranch,first,last,tail,flags,depth)
625 startbranch: the first branch in the whole branch sequence
626 first : start branch of sequence of branch-exact nodes.
627 May be the same as startbranch
628 last : Thing following the last branch.
629 May be the same as tail.
630 tail : item following the branch sequence
631 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
634 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
636 A trie is an N'ary tree where the branches are determined by digital
637 decomposition of the key. IE, at the root node you look up the 1st character and
638 follow that branch repeat until you find the end of the branches. Nodes can be
639 marked as "accepting" meaning they represent a complete word. Eg:
643 would convert into the following structure. Numbers represent states, letters
644 following numbers represent valid transitions on the letter from that state, if
645 the number is in square brackets it represents an accepting state, otherwise it
646 will be in parenthesis.
648 +-h->+-e->[3]-+-r->(8)-+-s->[9]
652 (1) +-i->(6)-+-s->[7]
654 +-s->(3)-+-h->(4)-+-e->[5]
656 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
658 This shows that when matching against the string 'hers' we will begin at state 1
659 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
660 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
661 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
662 single traverse. We store a mapping from accepting to state to which word was
663 matched, and then when we have multiple possibilities we try to complete the
664 rest of the regex in the order in which they occured in the alternation.
666 The only prior NFA like behaviour that would be changed by the TRIE support is
667 the silent ignoring of duplicate alternations which are of the form:
669 / (DUPE|DUPE) X? (?{ ... }) Y /x
671 Thus EVAL blocks follwing a trie may be called a different number of times with
672 and without the optimisation. With the optimisations dupes will be silently
673 ignored. This inconsistant behaviour of EVAL type nodes is well established as
674 the following demonstrates:
676 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
678 which prints out 'word' three times, but
680 'words'=~/(word|word|word)(?{ print $1 })S/
682 which doesnt print it out at all. This is due to other optimisations kicking in.
684 Example of what happens on a structural level:
686 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
688 1: CURLYM[1] {1,32767}(18)
699 This would be optimizable with startbranch=5, first=5, last=16, tail=16
700 and should turn into:
702 1: CURLYM[1] {1,32767}(18)
704 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
712 Cases where tail != last would be like /(?foo|bar)baz/:
722 which would be optimizable with startbranch=1, first=1, last=7, tail=8
723 and would end up looking like:
726 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
733 d = uvuni_to_utf8_flags(d, uv, 0);
735 is the recommended Unicode-aware way of saying
740 #define TRIE_STORE_REVCHAR \
742 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
743 av_push( TRIE_REVCHARMAP(trie), tmp ); \
746 #define TRIE_READ_CHAR STMT_START { \
749 if ( foldlen > 0 ) { \
750 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
755 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
756 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
757 foldlen -= UNISKIP( uvc ); \
758 scan = foldbuf + UNISKIP( uvc ); \
761 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
770 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
771 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
772 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
773 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
775 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
776 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
777 TRIE_LIST_LEN( state ) *= 2; \
778 Renew( trie->states[ state ].trans.list, \
779 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
781 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
782 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
783 TRIE_LIST_CUR( state )++; \
786 #define TRIE_LIST_NEW(state) STMT_START { \
787 Newxz( trie->states[ state ].trans.list, \
788 4, reg_trie_trans_le ); \
789 TRIE_LIST_CUR( state ) = 1; \
790 TRIE_LIST_LEN( state ) = 4; \
793 #define TRIE_HANDLE_WORD(state) STMT_START { \
794 if ( !trie->states[ state ].wordnum ) { \
795 /* we havent inserted this word into the structure yet. */\
796 trie->states[ state ].wordnum = ++curword; \
798 /* store the word for dumping */ \
800 if (OP(noper) != NOTHING ) \
801 tmp=newSVpvn( STRING( noper ), STR_LEN( noper ) );\
803 tmp=newSVpvn( "", 0 ); \
804 if ( UTF ) SvUTF8_on( tmp ); \
805 av_push( trie->words, tmp ); \
808 NOOP; /* It's a dupe. So ignore it. */ \
815 dump_trie_interim_list(trie,next_alloc)
816 dump_trie_interim_table(trie,next_alloc)
818 These routines dump out a trie in a somewhat readable format.
819 The _interim_ variants are used for debugging the interim
820 tables that are used to generate the final compressed
821 representation which is what dump_trie expects.
823 Part of the reason for their existance is to provide a form
824 of documentation as to how the different representations function.
830 Dumps the final compressed table form of the trie to Perl_debug_log.
831 Used for debugging make_trie().
835 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
838 GET_RE_DEBUG_FLAGS_DECL;
840 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
841 (int)depth * 2 + 2,"",
842 "Match","Base","Ofs" );
844 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
845 SV **tmp = av_fetch( trie->revcharmap, state, 0);
847 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
850 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
851 (int)depth * 2 + 2,"");
853 for( state = 0 ; state < trie->uniquecharcount ; state++ )
854 PerlIO_printf( Perl_debug_log, "-----");
855 PerlIO_printf( Perl_debug_log, "\n");
857 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
858 const U32 base = trie->states[ state ].trans.base;
860 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
862 if ( trie->states[ state ].wordnum ) {
863 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
865 PerlIO_printf( Perl_debug_log, "%6s", "" );
868 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
873 while( ( base + ofs < trie->uniquecharcount ) ||
874 ( base + ofs - trie->uniquecharcount < trie->lasttrans
875 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
878 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
880 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
881 if ( ( base + ofs >= trie->uniquecharcount ) &&
882 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
883 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
885 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
886 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
888 PerlIO_printf( Perl_debug_log, "%4s "," ." );
892 PerlIO_printf( Perl_debug_log, "]");
895 PerlIO_printf( Perl_debug_log, "\n" );
899 dump_trie_interim_list(trie,next_alloc)
900 Dumps a fully constructed but uncompressed trie in list form.
901 List tries normally only are used for construction when the number of
902 possible chars (trie->uniquecharcount) is very high.
903 Used for debugging make_trie().
906 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
909 GET_RE_DEBUG_FLAGS_DECL;
910 /* print out the table precompression. */
911 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
912 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
913 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
915 for( state=1 ; state < next_alloc ; state ++ ) {
918 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
919 (int)depth * 2 + 2,"", (UV)state );
920 if ( ! trie->states[ state ].wordnum ) {
921 PerlIO_printf( Perl_debug_log, "%5s| ","");
923 PerlIO_printf( Perl_debug_log, "W%4x| ",
924 trie->states[ state ].wordnum
927 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
928 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
929 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
930 SvPV_nolen_const( *tmp ),
931 TRIE_LIST_ITEM(state,charid).forid,
932 (UV)TRIE_LIST_ITEM(state,charid).newstate
940 dump_trie_interim_table(trie,next_alloc)
941 Dumps a fully constructed but uncompressed trie in table form.
942 This is the normal DFA style state transition table, with a few
943 twists to facilitate compression later.
944 Used for debugging make_trie().
947 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
951 GET_RE_DEBUG_FLAGS_DECL;
954 print out the table precompression so that we can do a visual check
955 that they are identical.
958 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
961 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
963 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
967 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
969 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
970 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
973 PerlIO_printf( Perl_debug_log, "\n" );
975 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
977 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
978 (int)depth * 2 + 2,"",
979 (UV)TRIE_NODENUM( state ) );
981 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
982 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
983 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
985 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
986 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
988 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
989 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1001 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
1004 /* first pass, loop through and scan words */
1005 reg_trie_data *trie;
1007 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1012 /* we just use folder as a flag in utf8 */
1013 const U8 * const folder = ( flags == EXACTF
1015 : ( flags == EXACTFL
1021 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1022 SV *re_trie_maxbuff;
1024 /* these are only used during construction but are useful during
1025 * debugging so we store them in the struct when debugging.
1026 * Wordcount is actually superfluous in debugging as we have
1027 * (AV*)trie->words to use for it, but that's not available when
1028 * not debugging... We could make the macro use the AV during
1029 * debugging though...
1031 U16 trie_wordcount=0;
1032 STRLEN trie_charcount=0;
1033 U32 trie_laststate=0;
1034 AV *trie_revcharmap;
1036 GET_RE_DEBUG_FLAGS_DECL;
1038 Newxz( trie, 1, reg_trie_data );
1040 trie->startstate = 1;
1041 RExC_rx->data->data[ data_slot ] = (void*)trie;
1042 Newxz( trie->charmap, 256, U16 );
1043 if (!(UTF && folder))
1044 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1046 trie->words = newAV();
1048 TRIE_REVCHARMAP(trie) = newAV();
1050 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1051 if (!SvIOK(re_trie_maxbuff)) {
1052 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1055 PerlIO_printf( Perl_debug_log,
1056 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1057 (int)depth * 2 + 2, "",
1058 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1059 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1061 /* -- First loop and Setup --
1063 We first traverse the branches and scan each word to determine if it
1064 contains widechars, and how many unique chars there are, this is
1065 important as we have to build a table with at least as many columns as we
1068 We use an array of integers to represent the character codes 0..255
1069 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1070 native representation of the character value as the key and IV's for the
1073 *TODO* If we keep track of how many times each character is used we can
1074 remap the columns so that the table compression later on is more
1075 efficient in terms of memory by ensuring most common value is in the
1076 middle and the least common are on the outside. IMO this would be better
1077 than a most to least common mapping as theres a decent chance the most
1078 common letter will share a node with the least common, meaning the node
1079 will not be compressable. With a middle is most common approach the worst
1080 case is when we have the least common nodes twice.
1084 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1085 regnode * const noper = NEXTOPER( cur );
1086 const U8 *uc = (U8*)STRING( noper );
1087 const U8 * const e = uc + STR_LEN( noper );
1089 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1090 const U8 *scan = (U8*)NULL;
1093 TRIE_WORDCOUNT(trie)++;
1094 if (OP(noper) == NOTHING) {
1099 TRIE_BITMAP_SET(trie,*uc);
1100 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1102 for ( ; uc < e ; uc += len ) {
1103 TRIE_CHARCOUNT(trie)++;
1107 if ( !trie->charmap[ uvc ] ) {
1108 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1110 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1115 if ( !trie->widecharmap )
1116 trie->widecharmap = newHV();
1118 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1121 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1123 if ( !SvTRUE( *svpp ) ) {
1124 sv_setiv( *svpp, ++trie->uniquecharcount );
1129 if( cur == first ) {
1132 } else if (chars < trie->minlen) {
1134 } else if (chars > trie->maxlen) {
1138 } /* end first pass */
1139 DEBUG_TRIE_COMPILE_r(
1140 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1141 (int)depth * 2 + 2,"",
1142 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
1143 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, trie->minlen, trie->maxlen )
1148 We now know what we are dealing with in terms of unique chars and
1149 string sizes so we can calculate how much memory a naive
1150 representation using a flat table will take. If it's over a reasonable
1151 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1152 conservative but potentially much slower representation using an array
1155 At the end we convert both representations into the same compressed
1156 form that will be used in regexec.c for matching with. The latter
1157 is a form that cannot be used to construct with but has memory
1158 properties similar to the list form and access properties similar
1159 to the table form making it both suitable for fast searches and
1160 small enough that its feasable to store for the duration of a program.
1162 See the comment in the code where the compressed table is produced
1163 inplace from the flat tabe representation for an explanation of how
1164 the compression works.
1169 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1171 Second Pass -- Array Of Lists Representation
1173 Each state will be represented by a list of charid:state records
1174 (reg_trie_trans_le) the first such element holds the CUR and LEN
1175 points of the allocated array. (See defines above).
1177 We build the initial structure using the lists, and then convert
1178 it into the compressed table form which allows faster lookups
1179 (but cant be modified once converted).
1182 STRLEN transcount = 1;
1184 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1188 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1190 regnode * const noper = NEXTOPER( cur );
1191 U8 *uc = (U8*)STRING( noper );
1192 const U8 * const e = uc + STR_LEN( noper );
1193 U32 state = 1; /* required init */
1194 U16 charid = 0; /* sanity init */
1195 U8 *scan = (U8*)NULL; /* sanity init */
1196 STRLEN foldlen = 0; /* required init */
1197 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1199 if (OP(noper) != NOTHING) {
1200 for ( ; uc < e ; uc += len ) {
1205 charid = trie->charmap[ uvc ];
1207 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1211 charid=(U16)SvIV( *svpp );
1220 if ( !trie->states[ state ].trans.list ) {
1221 TRIE_LIST_NEW( state );
1223 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1224 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1225 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1230 newstate = next_alloc++;
1231 TRIE_LIST_PUSH( state, charid, newstate );
1236 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1238 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1241 TRIE_HANDLE_WORD(state);
1243 } /* end second pass */
1245 TRIE_LASTSTATE(trie) = next_alloc;
1246 Renew( trie->states, next_alloc, reg_trie_state );
1248 /* and now dump it out before we compress it */
1249 DEBUG_TRIE_COMPILE_MORE_r(
1250 dump_trie_interim_list(trie,next_alloc,depth+1)
1253 Newxz( trie->trans, transcount ,reg_trie_trans );
1260 for( state=1 ; state < next_alloc ; state ++ ) {
1264 DEBUG_TRIE_COMPILE_MORE_r(
1265 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1269 if (trie->states[state].trans.list) {
1270 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1274 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1275 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1276 if ( forid < minid ) {
1278 } else if ( forid > maxid ) {
1282 if ( transcount < tp + maxid - minid + 1) {
1284 Renew( trie->trans, transcount, reg_trie_trans );
1285 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1287 base = trie->uniquecharcount + tp - minid;
1288 if ( maxid == minid ) {
1290 for ( ; zp < tp ; zp++ ) {
1291 if ( ! trie->trans[ zp ].next ) {
1292 base = trie->uniquecharcount + zp - minid;
1293 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1294 trie->trans[ zp ].check = state;
1300 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1301 trie->trans[ tp ].check = state;
1306 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1307 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1308 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1309 trie->trans[ tid ].check = state;
1311 tp += ( maxid - minid + 1 );
1313 Safefree(trie->states[ state ].trans.list);
1316 DEBUG_TRIE_COMPILE_MORE_r(
1317 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1320 trie->states[ state ].trans.base=base;
1322 trie->lasttrans = tp + 1;
1326 Second Pass -- Flat Table Representation.
1328 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1329 We know that we will need Charcount+1 trans at most to store the data
1330 (one row per char at worst case) So we preallocate both structures
1331 assuming worst case.
1333 We then construct the trie using only the .next slots of the entry
1336 We use the .check field of the first entry of the node temporarily to
1337 make compression both faster and easier by keeping track of how many non
1338 zero fields are in the node.
1340 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1343 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1344 number representing the first entry of the node, and state as a
1345 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1346 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1347 are 2 entrys per node. eg:
1355 The table is internally in the right hand, idx form. However as we also
1356 have to deal with the states array which is indexed by nodenum we have to
1357 use TRIE_NODENUM() to convert.
1362 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1364 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1365 next_alloc = trie->uniquecharcount + 1;
1368 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1370 regnode * const noper = NEXTOPER( cur );
1371 const U8 *uc = (U8*)STRING( noper );
1372 const U8 * const e = uc + STR_LEN( noper );
1374 U32 state = 1; /* required init */
1376 U16 charid = 0; /* sanity init */
1377 U32 accept_state = 0; /* sanity init */
1378 U8 *scan = (U8*)NULL; /* sanity init */
1380 STRLEN foldlen = 0; /* required init */
1381 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1383 if ( OP(noper) != NOTHING ) {
1384 for ( ; uc < e ; uc += len ) {
1389 charid = trie->charmap[ uvc ];
1391 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1392 charid = svpp ? (U16)SvIV(*svpp) : 0;
1396 if ( !trie->trans[ state + charid ].next ) {
1397 trie->trans[ state + charid ].next = next_alloc;
1398 trie->trans[ state ].check++;
1399 next_alloc += trie->uniquecharcount;
1401 state = trie->trans[ state + charid ].next;
1403 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1405 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1408 accept_state = TRIE_NODENUM( state );
1409 TRIE_HANDLE_WORD(accept_state);
1411 } /* end second pass */
1413 /* and now dump it out before we compress it */
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 dump_trie_interim_table(trie,next_alloc,depth+1)
1420 * Inplace compress the table.*
1422 For sparse data sets the table constructed by the trie algorithm will
1423 be mostly 0/FAIL transitions or to put it another way mostly empty.
1424 (Note that leaf nodes will not contain any transitions.)
1426 This algorithm compresses the tables by eliminating most such
1427 transitions, at the cost of a modest bit of extra work during lookup:
1429 - Each states[] entry contains a .base field which indicates the
1430 index in the state[] array wheres its transition data is stored.
1432 - If .base is 0 there are no valid transitions from that node.
1434 - If .base is nonzero then charid is added to it to find an entry in
1437 -If trans[states[state].base+charid].check!=state then the
1438 transition is taken to be a 0/Fail transition. Thus if there are fail
1439 transitions at the front of the node then the .base offset will point
1440 somewhere inside the previous nodes data (or maybe even into a node
1441 even earlier), but the .check field determines if the transition is
1444 The following process inplace converts the table to the compressed
1445 table: We first do not compress the root node 1,and mark its all its
1446 .check pointers as 1 and set its .base pointer as 1 as well. This
1447 allows to do a DFA construction from the compressed table later, and
1448 ensures that any .base pointers we calculate later are greater than
1451 - We set 'pos' to indicate the first entry of the second node.
1453 - We then iterate over the columns of the node, finding the first and
1454 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1455 and set the .check pointers accordingly, and advance pos
1456 appropriately and repreat for the next node. Note that when we copy
1457 the next pointers we have to convert them from the original
1458 NODEIDX form to NODENUM form as the former is not valid post
1461 - If a node has no transitions used we mark its base as 0 and do not
1462 advance the pos pointer.
1464 - If a node only has one transition we use a second pointer into the
1465 structure to fill in allocated fail transitions from other states.
1466 This pointer is independent of the main pointer and scans forward
1467 looking for null transitions that are allocated to a state. When it
1468 finds one it writes the single transition into the "hole". If the
1469 pointer doesnt find one the single transition is appeneded as normal.
1471 - Once compressed we can Renew/realloc the structures to release the
1474 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1475 specifically Fig 3.47 and the associated pseudocode.
1479 const U32 laststate = TRIE_NODENUM( next_alloc );
1482 TRIE_LASTSTATE(trie) = laststate;
1484 for ( state = 1 ; state < laststate ; state++ ) {
1486 const U32 stateidx = TRIE_NODEIDX( state );
1487 const U32 o_used = trie->trans[ stateidx ].check;
1488 U32 used = trie->trans[ stateidx ].check;
1489 trie->trans[ stateidx ].check = 0;
1491 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1492 if ( flag || trie->trans[ stateidx + charid ].next ) {
1493 if ( trie->trans[ stateidx + charid ].next ) {
1495 for ( ; zp < pos ; zp++ ) {
1496 if ( ! trie->trans[ zp ].next ) {
1500 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1501 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1502 trie->trans[ zp ].check = state;
1503 if ( ++zp > pos ) pos = zp;
1510 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1512 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1513 trie->trans[ pos ].check = state;
1518 trie->lasttrans = pos + 1;
1519 Renew( trie->states, laststate + 1, reg_trie_state);
1520 DEBUG_TRIE_COMPILE_MORE_r(
1521 PerlIO_printf( Perl_debug_log,
1522 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1523 (int)depth * 2 + 2,"",
1524 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1527 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1530 } /* end table compress */
1532 /* resize the trans array to remove unused space */
1533 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1535 /* and now dump out the compressed format */
1536 DEBUG_TRIE_COMPILE_r(
1537 dump_trie(trie,depth+1)
1540 { /* Modify the program and insert the new TRIE node*/
1542 U8 nodetype =(U8)(flags & 0xFF);
1545 This means we convert either the first branch or the first Exact,
1546 depending on whether the thing following (in 'last') is a branch
1547 or not and whther first is the startbranch (ie is it a sub part of
1548 the alternation or is it the whole thing.)
1549 Assuming its a sub part we conver the EXACT otherwise we convert
1550 the whole branch sequence, including the first.
1552 /* Find the node we are going to overwrite */
1553 if ( first == startbranch && OP( last ) != BRANCH ) {
1556 convert = NEXTOPER( first );
1557 NEXT_OFF( first ) = (U16)(last - first);
1560 /* But first we check to see if there is a common prefix we can
1561 split out as an EXACT and put in front of the TRIE node. */
1562 trie->startstate= 1;
1563 if ( trie->bitmap && !trie->widecharmap ) {
1566 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1567 (int)depth * 2 + 2, "",
1568 TRIE_LASTSTATE(trie))
1570 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
1574 const U32 base = trie->states[ state ].trans.base;
1576 if ( trie->states[state].wordnum )
1579 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1580 if ( ( base + ofs >= trie->uniquecharcount ) &&
1581 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1582 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1584 if ( ++count > 1 ) {
1585 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1586 const char *ch = SvPV_nolen_const( *tmp );
1587 if ( state == 1 ) break;
1589 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1591 PerlIO_printf(Perl_debug_log,
1592 "%*sNew Start State=%"UVuf" Class: [",
1593 (int)depth * 2 + 2, "",
1596 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1597 const char *ch = SvPV_nolen_const( *tmp );
1599 TRIE_BITMAP_SET(trie,*ch);
1601 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1603 PerlIO_printf(Perl_debug_log, ch)
1607 TRIE_BITMAP_SET(trie,*ch);
1609 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1610 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1616 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1617 const char *ch = SvPV_nolen_const( *tmp );
1619 PerlIO_printf( Perl_debug_log,
1620 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1621 (int)depth * 2 + 2, "",
1625 OP( convert ) = nodetype;
1626 str=STRING(convert);
1634 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1639 regnode *n = convert+NODE_SZ_STR(convert);
1640 NEXT_OFF(convert)= NODE_SZ_STR(convert);
1641 trie->startstate = state;
1642 trie->minlen-= (state-1);
1643 trie->maxlen-= (state-1);
1647 NEXT_OFF(convert) = (U16)(tail - convert);
1651 if ( trie->maxlen ) {
1652 OP( convert ) = TRIE;
1653 NEXT_OFF( convert ) = (U16)(tail - convert);
1654 ARG_SET( convert, data_slot );
1656 /* store the type in the flags */
1657 convert->flags = nodetype;
1658 /* XXX We really should free up the resource in trie now, as we wont use them */
1660 /* needed for dumping*/
1662 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1663 /* We now need to mark all of the space originally used by the
1664 branches as optimized away. This keeps the dumpuntil from
1665 throwing a wobbly as it doesnt use regnext() to traverse the
1668 while( optimize < last ) {
1669 OP( optimize ) = OPTIMIZED;
1673 } /* end node insert */
1675 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1681 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1682 * These need to be revisited when a newer toolchain becomes available.
1684 #if defined(__sparc64__) && defined(__GNUC__)
1685 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1686 # undef SPARC64_GCC_WORKAROUND
1687 # define SPARC64_GCC_WORKAROUND 1
1691 /* REx optimizer. Converts nodes into quickier variants "in place".
1692 Finds fixed substrings. */
1694 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1695 to the position after last scanned or to NULL. */
1698 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1699 regnode *last, scan_data_t *data, U32 flags, U32 depth)
1700 /* scanp: Start here (read-write). */
1701 /* deltap: Write maxlen-minlen here. */
1702 /* last: Stop before this one. */
1705 I32 min = 0, pars = 0, code;
1706 regnode *scan = *scanp, *next;
1708 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1709 int is_inf_internal = 0; /* The studied chunk is infinite */
1710 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1711 scan_data_t data_fake;
1712 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
1713 SV *re_trie_maxbuff = NULL;
1715 GET_RE_DEBUG_FLAGS_DECL;
1717 while (scan && OP(scan) != END && scan < last) {
1721 /* Peephole optimizer: */
1723 SV * const mysv=sv_newmortal();
1724 regprop(RExC_rx, mysv, scan);
1725 PerlIO_printf(Perl_debug_log, "%*s%4s~ %s (%d)\n",
1727 scan==*scanp ? "Peep" : "",
1728 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
1730 if (PL_regkind[OP(scan)] == EXACT) {
1731 /* Merge several consecutive EXACTish nodes into one. */
1732 regnode *n = regnext(scan);
1735 regnode *stop = scan;
1737 next = scan + NODE_SZ_STR(scan);
1738 /* Skip NOTHING, merge EXACT*. */
1740 ( PL_regkind[OP(n)] == NOTHING ||
1741 (stringok && (OP(n) == OP(scan))))
1743 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1744 if (OP(n) == TAIL || n > next)
1746 if (PL_regkind[OP(n)] == NOTHING) {
1748 SV * const mysv=sv_newmortal();
1749 regprop(RExC_rx, mysv, n);
1750 PerlIO_printf(Perl_debug_log, "%*sskip: %s (%d)\n",
1751 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1753 NEXT_OFF(scan) += NEXT_OFF(n);
1754 next = n + NODE_STEP_REGNODE;
1761 else if (stringok) {
1762 const int oldl = STR_LEN(scan);
1763 regnode * const nnext = regnext(n);
1765 SV * const mysv=sv_newmortal();
1766 regprop(RExC_rx, mysv, n);
1767 PerlIO_printf(Perl_debug_log, "%*s mrg: %s (%d)\n",
1768 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(n));
1771 if (oldl + STR_LEN(n) > U8_MAX)
1773 NEXT_OFF(scan) += NEXT_OFF(n);
1774 STR_LEN(scan) += STR_LEN(n);
1775 next = n + NODE_SZ_STR(n);
1776 /* Now we can overwrite *n : */
1777 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1785 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1787 Two problematic code points in Unicode casefolding of EXACT nodes:
1789 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1790 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1796 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1797 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1799 This means that in case-insensitive matching (or "loose matching",
1800 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1801 length of the above casefolded versions) can match a target string
1802 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1803 This would rather mess up the minimum length computation.
1805 What we'll do is to look for the tail four bytes, and then peek
1806 at the preceding two bytes to see whether we need to decrease
1807 the minimum length by four (six minus two).
1809 Thanks to the design of UTF-8, there cannot be false matches:
1810 A sequence of valid UTF-8 bytes cannot be a subsequence of
1811 another valid sequence of UTF-8 bytes.
1814 char * const s0 = STRING(scan), *s, *t;
1815 char * const s1 = s0 + STR_LEN(scan) - 1;
1816 char * const s2 = s1 - 4;
1817 const char t0[] = "\xcc\x88\xcc\x81";
1818 const char * const t1 = t0 + 3;
1821 s < s2 && (t = ninstr(s, s1, t0, t1));
1823 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1824 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1831 n = scan + NODE_SZ_STR(scan);
1833 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1844 /* Follow the next-chain of the current node and optimize
1845 away all the NOTHINGs from it. */
1846 if (OP(scan) != CURLYX) {
1847 const int max = (reg_off_by_arg[OP(scan)]
1849 /* I32 may be smaller than U16 on CRAYs! */
1850 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
1851 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1855 /* Skip NOTHING and LONGJMP. */
1856 while ((n = regnext(n))
1857 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
1858 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1859 && off + noff < max)
1861 if (reg_off_by_arg[OP(scan)])
1864 NEXT_OFF(scan) = off;
1867 DEBUG_OPTIMISE_r({if (merged){
1868 SV * const mysv=sv_newmortal();
1869 regprop(RExC_rx, mysv, scan);
1870 PerlIO_printf(Perl_debug_log, "%*s res: %s (%d)\n",
1871 (int)depth*2, "", SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
1874 /* The principal pseudo-switch. Cannot be a switch, since we
1875 look into several different things. */
1876 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
1877 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1878 next = regnext(scan);
1880 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
1882 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
1883 I32 max1 = 0, min1 = I32_MAX, num = 0;
1884 struct regnode_charclass_class accum;
1885 regnode * const startbranch=scan;
1887 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1888 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
1889 if (flags & SCF_DO_STCLASS)
1890 cl_init_zero(pRExC_state, &accum);
1892 while (OP(scan) == code) {
1893 I32 deltanext, minnext, f = 0, fake;
1894 struct regnode_charclass_class this_class;
1897 data_fake.flags = 0;
1899 data_fake.whilem_c = data->whilem_c;
1900 data_fake.last_closep = data->last_closep;
1903 data_fake.last_closep = &fake;
1904 next = regnext(scan);
1905 scan = NEXTOPER(scan);
1907 scan = NEXTOPER(scan);
1908 if (flags & SCF_DO_STCLASS) {
1909 cl_init(pRExC_state, &this_class);
1910 data_fake.start_class = &this_class;
1911 f = SCF_DO_STCLASS_AND;
1913 if (flags & SCF_WHILEM_VISITED_POS)
1914 f |= SCF_WHILEM_VISITED_POS;
1916 /* we suppose the run is continuous, last=next...*/
1917 minnext = study_chunk(pRExC_state, &scan, &deltanext,
1918 next, &data_fake, f,depth+1);
1921 if (max1 < minnext + deltanext)
1922 max1 = minnext + deltanext;
1923 if (deltanext == I32_MAX)
1924 is_inf = is_inf_internal = 1;
1926 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1929 if (data_fake.flags & SF_HAS_EVAL)
1930 data->flags |= SF_HAS_EVAL;
1931 data->whilem_c = data_fake.whilem_c;
1933 if (flags & SCF_DO_STCLASS)
1934 cl_or(pRExC_state, &accum, &this_class);
1935 if (code == SUSPEND)
1938 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1940 if (flags & SCF_DO_SUBSTR) {
1941 data->pos_min += min1;
1942 data->pos_delta += max1 - min1;
1943 if (max1 != min1 || is_inf)
1944 data->longest = &(data->longest_float);
1947 delta += max1 - min1;
1948 if (flags & SCF_DO_STCLASS_OR) {
1949 cl_or(pRExC_state, data->start_class, &accum);
1951 cl_and(data->start_class, &and_with);
1952 flags &= ~SCF_DO_STCLASS;
1955 else if (flags & SCF_DO_STCLASS_AND) {
1957 cl_and(data->start_class, &accum);
1958 flags &= ~SCF_DO_STCLASS;
1961 /* Switch to OR mode: cache the old value of
1962 * data->start_class */
1963 StructCopy(data->start_class, &and_with,
1964 struct regnode_charclass_class);
1965 flags &= ~SCF_DO_STCLASS_AND;
1966 StructCopy(&accum, data->start_class,
1967 struct regnode_charclass_class);
1968 flags |= SCF_DO_STCLASS_OR;
1969 data->start_class->flags |= ANYOF_EOS;
1975 Assuming this was/is a branch we are dealing with: 'scan' now
1976 points at the item that follows the branch sequence, whatever
1977 it is. We now start at the beginning of the sequence and look
1983 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1985 If we can find such a subseqence we need to turn the first
1986 element into a trie and then add the subsequent branch exact
1987 strings to the trie.
1991 1. patterns where the whole set of branch can be converted to a trie,
1993 2. patterns where only a subset of the alternations can be
1994 converted to a trie.
1996 In case 1 we can replace the whole set with a single regop
1997 for the trie. In case 2 we need to keep the start and end
2000 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2001 becomes BRANCH TRIE; BRANCH X;
2003 Hypthetically when we know the regex isnt anchored we can
2004 turn a case 1 into a DFA and let it rip... Every time it finds a match
2005 it would just call its tail, no WHILEM/CURLY needed.
2010 if (!re_trie_maxbuff) {
2011 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2012 if (!SvIOK(re_trie_maxbuff))
2013 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2015 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2017 regnode *first = (regnode *)NULL;
2018 regnode *last = (regnode *)NULL;
2019 regnode *tail = scan;
2024 SV * const mysv = sv_newmortal(); /* for dumping */
2026 /* var tail is used because there may be a TAIL
2027 regop in the way. Ie, the exacts will point to the
2028 thing following the TAIL, but the last branch will
2029 point at the TAIL. So we advance tail. If we
2030 have nested (?:) we may have to move through several
2034 while ( OP( tail ) == TAIL ) {
2035 /* this is the TAIL generated by (?:) */
2036 tail = regnext( tail );
2041 regprop(RExC_rx, mysv, tail );
2042 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2043 (int)depth * 2 + 2, "",
2044 "Looking for TRIE'able sequences. Tail node is: ",
2045 SvPV_nolen_const( mysv )
2051 step through the branches, cur represents each
2052 branch, noper is the first thing to be matched
2053 as part of that branch and noper_next is the
2054 regnext() of that node. if noper is an EXACT
2055 and noper_next is the same as scan (our current
2056 position in the regex) then the EXACT branch is
2057 a possible optimization target. Once we have
2058 two or more consequetive such branches we can
2059 create a trie of the EXACT's contents and stich
2060 it in place. If the sequence represents all of
2061 the branches we eliminate the whole thing and
2062 replace it with a single TRIE. If it is a
2063 subsequence then we need to stitch it in. This
2064 means the first branch has to remain, and needs
2065 to be repointed at the item on the branch chain
2066 following the last branch optimized. This could
2067 be either a BRANCH, in which case the
2068 subsequence is internal, or it could be the
2069 item following the branch sequence in which
2070 case the subsequence is at the end.
2074 /* dont use tail as the end marker for this traverse */
2075 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2076 regnode * const noper = NEXTOPER( cur );
2077 regnode * const noper_next = regnext( noper );
2080 regprop(RExC_rx, mysv, cur);
2081 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2082 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2084 regprop(RExC_rx, mysv, noper);
2085 PerlIO_printf( Perl_debug_log, " -> %s",
2086 SvPV_nolen_const(mysv));
2089 regprop(RExC_rx, mysv, noper_next );
2090 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2091 SvPV_nolen_const(mysv));
2093 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2094 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2096 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2097 : PL_regkind[ OP( noper ) ] == EXACT )
2098 || OP(noper) == NOTHING )
2099 && noper_next == tail && count<U16_MAX)
2102 if ( !first || optype == NOTHING ) {
2103 if (!first) first = cur;
2104 optype = OP( noper );
2110 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
2112 if ( PL_regkind[ OP( noper ) ] == EXACT
2113 && noper_next == tail )
2117 optype = OP( noper );
2127 regprop(RExC_rx, mysv, cur);
2128 PerlIO_printf( Perl_debug_log,
2129 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2130 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2134 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2135 #ifdef TRIE_STUDY_OPT
2136 if ( OP(first)!=TRIE && startbranch == first ) {
2145 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2146 scan = NEXTOPER(NEXTOPER(scan));
2147 } else /* single branch is optimized. */
2148 scan = NEXTOPER(scan);
2151 else if (OP(scan) == EXACT) {
2152 I32 l = STR_LEN(scan);
2155 const U8 * const s = (U8*)STRING(scan);
2156 l = utf8_length(s, s + l);
2157 uc = utf8_to_uvchr(s, NULL);
2159 uc = *((U8*)STRING(scan));
2162 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2163 /* The code below prefers earlier match for fixed
2164 offset, later match for variable offset. */
2165 if (data->last_end == -1) { /* Update the start info. */
2166 data->last_start_min = data->pos_min;
2167 data->last_start_max = is_inf
2168 ? I32_MAX : data->pos_min + data->pos_delta;
2170 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2172 SvUTF8_on(data->last_found);
2174 SV * const sv = data->last_found;
2175 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2176 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2177 if (mg && mg->mg_len >= 0)
2178 mg->mg_len += utf8_length((U8*)STRING(scan),
2179 (U8*)STRING(scan)+STR_LEN(scan));
2181 data->last_end = data->pos_min + l;
2182 data->pos_min += l; /* As in the first entry. */
2183 data->flags &= ~SF_BEFORE_EOL;
2185 if (flags & SCF_DO_STCLASS_AND) {
2186 /* Check whether it is compatible with what we know already! */
2190 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2191 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2192 && (!(data->start_class->flags & ANYOF_FOLD)
2193 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2196 ANYOF_CLASS_ZERO(data->start_class);
2197 ANYOF_BITMAP_ZERO(data->start_class);
2199 ANYOF_BITMAP_SET(data->start_class, uc);
2200 data->start_class->flags &= ~ANYOF_EOS;
2202 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2204 else if (flags & SCF_DO_STCLASS_OR) {
2205 /* false positive possible if the class is case-folded */
2207 ANYOF_BITMAP_SET(data->start_class, uc);
2209 data->start_class->flags |= ANYOF_UNICODE_ALL;
2210 data->start_class->flags &= ~ANYOF_EOS;
2211 cl_and(data->start_class, &and_with);
2213 flags &= ~SCF_DO_STCLASS;
2215 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2216 I32 l = STR_LEN(scan);
2217 UV uc = *((U8*)STRING(scan));
2219 /* Search for fixed substrings supports EXACT only. */
2220 if (flags & SCF_DO_SUBSTR) {
2222 scan_commit(pRExC_state, data);
2225 const U8 * const s = (U8 *)STRING(scan);
2226 l = utf8_length(s, s + l);
2227 uc = utf8_to_uvchr(s, NULL);
2230 if (flags & SCF_DO_SUBSTR)
2232 if (flags & SCF_DO_STCLASS_AND) {
2233 /* Check whether it is compatible with what we know already! */
2237 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2238 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2239 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2241 ANYOF_CLASS_ZERO(data->start_class);
2242 ANYOF_BITMAP_ZERO(data->start_class);
2244 ANYOF_BITMAP_SET(data->start_class, uc);
2245 data->start_class->flags &= ~ANYOF_EOS;
2246 data->start_class->flags |= ANYOF_FOLD;
2247 if (OP(scan) == EXACTFL)
2248 data->start_class->flags |= ANYOF_LOCALE;
2251 else if (flags & SCF_DO_STCLASS_OR) {
2252 if (data->start_class->flags & ANYOF_FOLD) {
2253 /* false positive possible if the class is case-folded.
2254 Assume that the locale settings are the same... */
2256 ANYOF_BITMAP_SET(data->start_class, uc);
2257 data->start_class->flags &= ~ANYOF_EOS;
2259 cl_and(data->start_class, &and_with);
2261 flags &= ~SCF_DO_STCLASS;
2263 #ifdef TRIE_STUDY_OPT
2264 else if (OP(scan) == TRIE) {
2265 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2266 min += trie->minlen;
2267 flags &= ~SCF_DO_STCLASS; /* xxx */
2268 if (flags & SCF_DO_SUBSTR) {
2269 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2270 data->pos_min += trie->minlen;
2271 data->pos_delta+= (trie->maxlen-trie->minlen);
2275 else if (strchr((const char*)PL_varies,OP(scan))) {
2276 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2277 I32 f = flags, pos_before = 0;
2278 regnode * const oscan = scan;
2279 struct regnode_charclass_class this_class;
2280 struct regnode_charclass_class *oclass = NULL;
2281 I32 next_is_eval = 0;
2283 switch (PL_regkind[OP(scan)]) {
2284 case WHILEM: /* End of (?:...)* . */
2285 scan = NEXTOPER(scan);
2288 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2289 next = NEXTOPER(scan);
2290 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2292 maxcount = REG_INFTY;
2293 next = regnext(scan);
2294 scan = NEXTOPER(scan);
2298 if (flags & SCF_DO_SUBSTR)
2303 if (flags & SCF_DO_STCLASS) {
2305 maxcount = REG_INFTY;
2306 next = regnext(scan);
2307 scan = NEXTOPER(scan);
2310 is_inf = is_inf_internal = 1;
2311 scan = regnext(scan);
2312 if (flags & SCF_DO_SUBSTR) {
2313 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2314 data->longest = &(data->longest_float);
2316 goto optimize_curly_tail;
2318 mincount = ARG1(scan);
2319 maxcount = ARG2(scan);
2320 next = regnext(scan);
2321 if (OP(scan) == CURLYX) {
2322 I32 lp = (data ? *(data->last_closep) : 0);
2323 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
2325 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2326 next_is_eval = (OP(scan) == EVAL);
2328 if (flags & SCF_DO_SUBSTR) {
2329 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2330 pos_before = data->pos_min;
2334 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2336 data->flags |= SF_IS_INF;
2338 if (flags & SCF_DO_STCLASS) {
2339 cl_init(pRExC_state, &this_class);
2340 oclass = data->start_class;
2341 data->start_class = &this_class;
2342 f |= SCF_DO_STCLASS_AND;
2343 f &= ~SCF_DO_STCLASS_OR;
2345 /* These are the cases when once a subexpression
2346 fails at a particular position, it cannot succeed
2347 even after backtracking at the enclosing scope.
2349 XXXX what if minimal match and we are at the
2350 initial run of {n,m}? */
2351 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2352 f &= ~SCF_WHILEM_VISITED_POS;
2354 /* This will finish on WHILEM, setting scan, or on NULL: */
2355 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2357 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2359 if (flags & SCF_DO_STCLASS)
2360 data->start_class = oclass;
2361 if (mincount == 0 || minnext == 0) {
2362 if (flags & SCF_DO_STCLASS_OR) {
2363 cl_or(pRExC_state, data->start_class, &this_class);
2365 else if (flags & SCF_DO_STCLASS_AND) {
2366 /* Switch to OR mode: cache the old value of
2367 * data->start_class */
2368 StructCopy(data->start_class, &and_with,
2369 struct regnode_charclass_class);
2370 flags &= ~SCF_DO_STCLASS_AND;
2371 StructCopy(&this_class, data->start_class,
2372 struct regnode_charclass_class);
2373 flags |= SCF_DO_STCLASS_OR;
2374 data->start_class->flags |= ANYOF_EOS;
2376 } else { /* Non-zero len */
2377 if (flags & SCF_DO_STCLASS_OR) {
2378 cl_or(pRExC_state, data->start_class, &this_class);
2379 cl_and(data->start_class, &and_with);
2381 else if (flags & SCF_DO_STCLASS_AND)
2382 cl_and(data->start_class, &this_class);
2383 flags &= ~SCF_DO_STCLASS;
2385 if (!scan) /* It was not CURLYX, but CURLY. */
2387 if ( /* ? quantifier ok, except for (?{ ... }) */
2388 (next_is_eval || !(mincount == 0 && maxcount == 1))
2389 && (minnext == 0) && (deltanext == 0)
2390 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2391 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2392 && ckWARN(WARN_REGEXP))
2395 "Quantifier unexpected on zero-length expression");
2398 min += minnext * mincount;
2399 is_inf_internal |= ((maxcount == REG_INFTY
2400 && (minnext + deltanext) > 0)
2401 || deltanext == I32_MAX);
2402 is_inf |= is_inf_internal;
2403 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2405 /* Try powerful optimization CURLYX => CURLYN. */
2406 if ( OP(oscan) == CURLYX && data
2407 && data->flags & SF_IN_PAR
2408 && !(data->flags & SF_HAS_EVAL)
2409 && !deltanext && minnext == 1 ) {
2410 /* Try to optimize to CURLYN. */
2411 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2412 regnode * const nxt1 = nxt;
2419 if (!strchr((const char*)PL_simple,OP(nxt))
2420 && !(PL_regkind[OP(nxt)] == EXACT
2421 && STR_LEN(nxt) == 1))
2427 if (OP(nxt) != CLOSE)
2429 /* Now we know that nxt2 is the only contents: */
2430 oscan->flags = (U8)ARG(nxt);
2432 OP(nxt1) = NOTHING; /* was OPEN. */
2434 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2435 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2436 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2437 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2438 OP(nxt + 1) = OPTIMIZED; /* was count. */
2439 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2444 /* Try optimization CURLYX => CURLYM. */
2445 if ( OP(oscan) == CURLYX && data
2446 && !(data->flags & SF_HAS_PAR)
2447 && !(data->flags & SF_HAS_EVAL)
2448 && !deltanext /* atom is fixed width */
2449 && minnext != 0 /* CURLYM can't handle zero width */
2451 /* XXXX How to optimize if data == 0? */
2452 /* Optimize to a simpler form. */
2453 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2457 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2458 && (OP(nxt2) != WHILEM))
2460 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2461 /* Need to optimize away parenths. */
2462 if (data->flags & SF_IN_PAR) {
2463 /* Set the parenth number. */
2464 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2466 if (OP(nxt) != CLOSE)
2467 FAIL("Panic opt close");
2468 oscan->flags = (U8)ARG(nxt);
2469 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2470 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2472 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2473 OP(nxt + 1) = OPTIMIZED; /* was count. */
2474 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2475 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2478 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2479 regnode *nnxt = regnext(nxt1);
2482 if (reg_off_by_arg[OP(nxt1)])
2483 ARG_SET(nxt1, nxt2 - nxt1);
2484 else if (nxt2 - nxt1 < U16_MAX)
2485 NEXT_OFF(nxt1) = nxt2 - nxt1;
2487 OP(nxt) = NOTHING; /* Cannot beautify */
2492 /* Optimize again: */
2493 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2499 else if ((OP(oscan) == CURLYX)
2500 && (flags & SCF_WHILEM_VISITED_POS)
2501 /* See the comment on a similar expression above.
2502 However, this time it not a subexpression
2503 we care about, but the expression itself. */
2504 && (maxcount == REG_INFTY)
2505 && data && ++data->whilem_c < 16) {
2506 /* This stays as CURLYX, we can put the count/of pair. */
2507 /* Find WHILEM (as in regexec.c) */
2508 regnode *nxt = oscan + NEXT_OFF(oscan);
2510 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2512 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2513 | (RExC_whilem_seen << 4)); /* On WHILEM */
2515 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2517 if (flags & SCF_DO_SUBSTR) {
2518 SV *last_str = NULL;
2519 int counted = mincount != 0;
2521 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2522 #if defined(SPARC64_GCC_WORKAROUND)
2525 const char *s = NULL;
2528 if (pos_before >= data->last_start_min)
2531 b = data->last_start_min;
2534 s = SvPV_const(data->last_found, l);
2535 old = b - data->last_start_min;
2538 I32 b = pos_before >= data->last_start_min
2539 ? pos_before : data->last_start_min;
2541 const char * const s = SvPV_const(data->last_found, l);
2542 I32 old = b - data->last_start_min;
2546 old = utf8_hop((U8*)s, old) - (U8*)s;
2549 /* Get the added string: */
2550 last_str = newSVpvn(s + old, l);
2552 SvUTF8_on(last_str);
2553 if (deltanext == 0 && pos_before == b) {
2554 /* What was added is a constant string */
2556 SvGROW(last_str, (mincount * l) + 1);
2557 repeatcpy(SvPVX(last_str) + l,
2558 SvPVX_const(last_str), l, mincount - 1);
2559 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2560 /* Add additional parts. */
2561 SvCUR_set(data->last_found,
2562 SvCUR(data->last_found) - l);
2563 sv_catsv(data->last_found, last_str);
2565 SV * sv = data->last_found;
2567 SvUTF8(sv) && SvMAGICAL(sv) ?
2568 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2569 if (mg && mg->mg_len >= 0)
2570 mg->mg_len += CHR_SVLEN(last_str);
2572 data->last_end += l * (mincount - 1);
2575 /* start offset must point into the last copy */
2576 data->last_start_min += minnext * (mincount - 1);
2577 data->last_start_max += is_inf ? I32_MAX
2578 : (maxcount - 1) * (minnext + data->pos_delta);
2581 /* It is counted once already... */
2582 data->pos_min += minnext * (mincount - counted);
2583 data->pos_delta += - counted * deltanext +
2584 (minnext + deltanext) * maxcount - minnext * mincount;
2585 if (mincount != maxcount) {
2586 /* Cannot extend fixed substrings found inside
2588 scan_commit(pRExC_state,data);
2589 if (mincount && last_str) {
2590 SV * const sv = data->last_found;
2591 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2592 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2596 sv_setsv(sv, last_str);
2597 data->last_end = data->pos_min;
2598 data->last_start_min =
2599 data->pos_min - CHR_SVLEN(last_str);
2600 data->last_start_max = is_inf
2602 : data->pos_min + data->pos_delta
2603 - CHR_SVLEN(last_str);
2605 data->longest = &(data->longest_float);
2607 SvREFCNT_dec(last_str);
2609 if (data && (fl & SF_HAS_EVAL))
2610 data->flags |= SF_HAS_EVAL;
2611 optimize_curly_tail:
2612 if (OP(oscan) != CURLYX) {
2613 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2615 NEXT_OFF(oscan) += NEXT_OFF(next);
2618 default: /* REF and CLUMP only? */
2619 if (flags & SCF_DO_SUBSTR) {
2620 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2621 data->longest = &(data->longest_float);
2623 is_inf = is_inf_internal = 1;
2624 if (flags & SCF_DO_STCLASS_OR)
2625 cl_anything(pRExC_state, data->start_class);
2626 flags &= ~SCF_DO_STCLASS;
2630 else if (strchr((const char*)PL_simple,OP(scan))) {
2633 if (flags & SCF_DO_SUBSTR) {
2634 scan_commit(pRExC_state,data);
2638 if (flags & SCF_DO_STCLASS) {
2639 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2641 /* Some of the logic below assumes that switching
2642 locale on will only add false positives. */
2643 switch (PL_regkind[OP(scan)]) {
2647 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2648 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2649 cl_anything(pRExC_state, data->start_class);
2652 if (OP(scan) == SANY)
2654 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2655 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2656 || (data->start_class->flags & ANYOF_CLASS));
2657 cl_anything(pRExC_state, data->start_class);
2659 if (flags & SCF_DO_STCLASS_AND || !value)
2660 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2663 if (flags & SCF_DO_STCLASS_AND)
2664 cl_and(data->start_class,
2665 (struct regnode_charclass_class*)scan);
2667 cl_or(pRExC_state, data->start_class,
2668 (struct regnode_charclass_class*)scan);
2671 if (flags & SCF_DO_STCLASS_AND) {
2672 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2673 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2674 for (value = 0; value < 256; value++)
2675 if (!isALNUM(value))
2676 ANYOF_BITMAP_CLEAR(data->start_class, value);
2680 if (data->start_class->flags & ANYOF_LOCALE)
2681 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2683 for (value = 0; value < 256; value++)
2685 ANYOF_BITMAP_SET(data->start_class, value);
2690 if (flags & SCF_DO_STCLASS_AND) {
2691 if (data->start_class->flags & ANYOF_LOCALE)
2692 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2695 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2696 data->start_class->flags |= ANYOF_LOCALE;
2700 if (flags & SCF_DO_STCLASS_AND) {
2701 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2702 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2703 for (value = 0; value < 256; value++)
2705 ANYOF_BITMAP_CLEAR(data->start_class, value);
2709 if (data->start_class->flags & ANYOF_LOCALE)
2710 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2712 for (value = 0; value < 256; value++)
2713 if (!isALNUM(value))
2714 ANYOF_BITMAP_SET(data->start_class, value);
2719 if (flags & SCF_DO_STCLASS_AND) {
2720 if (data->start_class->flags & ANYOF_LOCALE)
2721 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2724 data->start_class->flags |= ANYOF_LOCALE;
2725 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2729 if (flags & SCF_DO_STCLASS_AND) {
2730 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2731 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2732 for (value = 0; value < 256; value++)
2733 if (!isSPACE(value))
2734 ANYOF_BITMAP_CLEAR(data->start_class, value);
2738 if (data->start_class->flags & ANYOF_LOCALE)
2739 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2741 for (value = 0; value < 256; value++)
2743 ANYOF_BITMAP_SET(data->start_class, value);
2748 if (flags & SCF_DO_STCLASS_AND) {
2749 if (data->start_class->flags & ANYOF_LOCALE)
2750 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2753 data->start_class->flags |= ANYOF_LOCALE;
2754 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2758 if (flags & SCF_DO_STCLASS_AND) {
2759 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2760 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2761 for (value = 0; value < 256; value++)
2763 ANYOF_BITMAP_CLEAR(data->start_class, value);
2767 if (data->start_class->flags & ANYOF_LOCALE)
2768 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2770 for (value = 0; value < 256; value++)
2771 if (!isSPACE(value))
2772 ANYOF_BITMAP_SET(data->start_class, value);
2777 if (flags & SCF_DO_STCLASS_AND) {
2778 if (data->start_class->flags & ANYOF_LOCALE) {
2779 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2780 for (value = 0; value < 256; value++)
2781 if (!isSPACE(value))
2782 ANYOF_BITMAP_CLEAR(data->start_class, value);
2786 data->start_class->flags |= ANYOF_LOCALE;
2787 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2791 if (flags & SCF_DO_STCLASS_AND) {
2792 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2793 for (value = 0; value < 256; value++)
2794 if (!isDIGIT(value))
2795 ANYOF_BITMAP_CLEAR(data->start_class, value);
2798 if (data->start_class->flags & ANYOF_LOCALE)
2799 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2801 for (value = 0; value < 256; value++)
2803 ANYOF_BITMAP_SET(data->start_class, value);
2808 if (flags & SCF_DO_STCLASS_AND) {
2809 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2810 for (value = 0; value < 256; value++)
2812 ANYOF_BITMAP_CLEAR(data->start_class, value);
2815 if (data->start_class->flags & ANYOF_LOCALE)
2816 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2818 for (value = 0; value < 256; value++)
2819 if (!isDIGIT(value))
2820 ANYOF_BITMAP_SET(data->start_class, value);
2825 if (flags & SCF_DO_STCLASS_OR)
2826 cl_and(data->start_class, &and_with);
2827 flags &= ~SCF_DO_STCLASS;
2830 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
2831 data->flags |= (OP(scan) == MEOL
2835 else if ( PL_regkind[OP(scan)] == BRANCHJ
2836 /* Lookbehind, or need to calculate parens/evals/stclass: */
2837 && (scan->flags || data || (flags & SCF_DO_STCLASS))
2838 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
2839 /* Lookahead/lookbehind */
2840 I32 deltanext, minnext, fake = 0;
2842 struct regnode_charclass_class intrnl;
2845 data_fake.flags = 0;
2847 data_fake.whilem_c = data->whilem_c;
2848 data_fake.last_closep = data->last_closep;
2851 data_fake.last_closep = &fake;
2852 if ( flags & SCF_DO_STCLASS && !scan->flags
2853 && OP(scan) == IFMATCH ) { /* Lookahead */
2854 cl_init(pRExC_state, &intrnl);
2855 data_fake.start_class = &intrnl;
2856 f |= SCF_DO_STCLASS_AND;
2858 if (flags & SCF_WHILEM_VISITED_POS)
2859 f |= SCF_WHILEM_VISITED_POS;
2860 next = regnext(scan);
2861 nscan = NEXTOPER(NEXTOPER(scan));
2862 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
2865 vFAIL("Variable length lookbehind not implemented");
2867 else if (minnext > U8_MAX) {
2868 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
2870 scan->flags = (U8)minnext;
2872 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2874 if (data && (data_fake.flags & SF_HAS_EVAL))
2875 data->flags |= SF_HAS_EVAL;
2877 data->whilem_c = data_fake.whilem_c;
2878 if (f & SCF_DO_STCLASS_AND) {
2879 const int was = (data->start_class->flags & ANYOF_EOS);
2881 cl_and(data->start_class, &intrnl);
2883 data->start_class->flags |= ANYOF_EOS;
2886 else if (OP(scan) == OPEN) {
2889 else if (OP(scan) == CLOSE) {
2890 if ((I32)ARG(scan) == is_par) {
2891 next = regnext(scan);
2893 if ( next && (OP(next) != WHILEM) && next < last)
2894 is_par = 0; /* Disable optimization */
2897 *(data->last_closep) = ARG(scan);
2899 else if (OP(scan) == EVAL) {
2901 data->flags |= SF_HAS_EVAL;
2903 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
2904 if (flags & SCF_DO_SUBSTR) {
2905 scan_commit(pRExC_state,data);
2906 data->longest = &(data->longest_float);
2908 is_inf = is_inf_internal = 1;
2909 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2910 cl_anything(pRExC_state, data->start_class);
2911 flags &= ~SCF_DO_STCLASS;
2913 /* Else: zero-length, ignore. */
2914 scan = regnext(scan);
2919 *deltap = is_inf_internal ? I32_MAX : delta;
2920 if (flags & SCF_DO_SUBSTR && is_inf)
2921 data->pos_delta = I32_MAX - data->pos_min;
2922 if (is_par > U8_MAX)
2924 if (is_par && pars==1 && data) {
2925 data->flags |= SF_IN_PAR;
2926 data->flags &= ~SF_HAS_PAR;
2928 else if (pars && data) {
2929 data->flags |= SF_HAS_PAR;
2930 data->flags &= ~SF_IN_PAR;
2932 if (flags & SCF_DO_STCLASS_OR)
2933 cl_and(data->start_class, &and_with);
2938 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
2940 if (RExC_rx->data) {
2941 Renewc(RExC_rx->data,
2942 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
2943 char, struct reg_data);
2944 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2945 RExC_rx->data->count += n;
2948 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
2949 char, struct reg_data);
2950 Newx(RExC_rx->data->what, n, U8);
2951 RExC_rx->data->count = n;
2953 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2954 return RExC_rx->data->count - n;
2957 #ifndef PERL_IN_XSUB_RE
2959 Perl_reginitcolors(pTHX)
2962 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
2964 char *t = savepv(s);
2968 t = strchr(t, '\t');
2974 PL_colors[i] = t = (char *)"";
2979 PL_colors[i++] = (char *)"";
2986 - pregcomp - compile a regular expression into internal code
2988 * We can't allocate space until we know how big the compiled form will be,
2989 * but we can't compile it (and thus know how big it is) until we've got a
2990 * place to put the code. So we cheat: we compile it twice, once with code
2991 * generation turned off and size counting turned on, and once "for real".
2992 * This also means that we don't allocate space until we are sure that the
2993 * thing really will compile successfully, and we never have to move the
2994 * code and thus invalidate pointers into it. (Note that it has to be in
2995 * one piece because free() must be able to free it all.) [NB: not true in perl]
2997 * Beware that the optimization-preparation code in here knows about some
2998 * of the structure of the compiled regexp. [I'll say.]
3001 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3012 RExC_state_t RExC_state;
3013 RExC_state_t *pRExC_state = &RExC_state;
3015 GET_RE_DEBUG_FLAGS_DECL;
3018 FAIL("NULL regexp argument");
3020 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3023 DEBUG_r(if (!PL_colorset) reginitcolors());
3025 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
3026 PL_colors[4],PL_colors[5],PL_colors[0],
3027 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3029 RExC_flags = pm->op_pmflags;
3033 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3034 RExC_seen_evals = 0;
3037 /* First pass: determine size, legality. */
3044 RExC_emit = &PL_regdummy;
3045 RExC_whilem_seen = 0;
3046 #if 0 /* REGC() is (currently) a NOP at the first pass.
3047 * Clever compilers notice this and complain. --jhi */
3048 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3050 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3051 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3052 RExC_precomp = NULL;
3055 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3056 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3057 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3059 /* Small enough for pointer-storage convention?
3060 If extralen==0, this means that we will not need long jumps. */
3061 if (RExC_size >= 0x10000L && RExC_extralen)
3062 RExC_size += RExC_extralen;
3065 if (RExC_whilem_seen > 15)
3066 RExC_whilem_seen = 15;
3068 /* Allocate space and initialize. */
3069 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3072 FAIL("Regexp out of space");
3075 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3076 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3079 r->prelen = xend - exp;
3080 r->precomp = savepvn(RExC_precomp, r->prelen);
3082 #ifdef PERL_OLD_COPY_ON_WRITE
3083 r->saved_copy = NULL;
3085 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3086 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3087 r->lastparen = 0; /* mg.c reads this. */
3089 r->substrs = 0; /* Useful during FAIL. */
3090 r->startp = 0; /* Useful during FAIL. */
3091 r->endp = 0; /* Useful during FAIL. */
3093 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3095 r->offsets[0] = RExC_size;
3097 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3098 "%s %"UVuf" bytes for offset annotations.\n",
3099 r->offsets ? "Got" : "Couldn't get",
3100 (UV)((2*RExC_size+1) * sizeof(U32))));
3104 /* Second pass: emit code. */
3105 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3110 RExC_emit_start = r->program;
3111 RExC_emit = r->program;
3112 /* Store the count of eval-groups for security checks: */
3113 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
3114 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3116 if (reg(pRExC_state, 0, &flags,1) == NULL)
3120 /* Dig out information for optimizations. */
3121 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3122 pm->op_pmflags = RExC_flags;
3124 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3125 r->regstclass = NULL;
3126 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3127 r->reganch |= ROPT_NAUGHTY;
3128 scan = r->program + 1; /* First BRANCH. */
3130 /* XXXX To minimize changes to RE engine we always allocate
3131 3-units-long substrs field. */
3132 Newxz(r->substrs, 1, struct reg_substr_data);
3134 StructCopy(&zero_scan_data, &data, scan_data_t);
3135 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3136 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3138 STRLEN longest_float_length, longest_fixed_length;
3139 struct regnode_charclass_class ch_class;
3144 /* Skip introductions and multiplicators >= 1. */
3145 while ((OP(first) == OPEN && (sawopen = 1)) ||
3146 /* An OR of *one* alternative - should not happen now. */
3147 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3148 (OP(first) == PLUS) ||
3149 (OP(first) == MINMOD) ||
3150 /* An {n,m} with n>0 */
3151 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) {
3152 if (OP(first) == PLUS)
3155 first += regarglen[OP(first)];
3156 first = NEXTOPER(first);
3159 /* Starting-point info. */
3161 if (PL_regkind[OP(first)] == EXACT) {
3162 if (OP(first) == EXACT)
3163 NOOP; /* Empty, get anchored substr later. */
3164 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3165 r->regstclass = first;
3167 else if (strchr((const char*)PL_simple,OP(first)))
3168 r->regstclass = first;
3169 else if (PL_regkind[OP(first)] == BOUND ||
3170 PL_regkind[OP(first)] == NBOUND)
3171 r->regstclass = first;
3172 else if (PL_regkind[OP(first)] == BOL) {
3173 r->reganch |= (OP(first) == MBOL
3175 : (OP(first) == SBOL
3178 first = NEXTOPER(first);
3181 else if (OP(first) == GPOS) {
3182 r->reganch |= ROPT_ANCH_GPOS;
3183 first = NEXTOPER(first);
3186 else if (!sawopen && (OP(first) == STAR &&
3187 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3188 !(r->reganch & ROPT_ANCH) )
3190 /* turn .* into ^.* with an implied $*=1 */
3192 (OP(NEXTOPER(first)) == REG_ANY)
3195 r->reganch |= type | ROPT_IMPLICIT;
3196 first = NEXTOPER(first);
3199 if (sawplus && (!sawopen || !RExC_sawback)
3200 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3201 /* x+ must match at the 1st pos of run of x's */
3202 r->reganch |= ROPT_SKIP;
3204 /* Scan is after the zeroth branch, first is atomic matcher. */
3205 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3206 (IV)(first - scan + 1)));
3208 * If there's something expensive in the r.e., find the
3209 * longest literal string that must appear and make it the
3210 * regmust. Resolve ties in favor of later strings, since
3211 * the regstart check works with the beginning of the r.e.
3212 * and avoiding duplication strengthens checking. Not a
3213 * strong reason, but sufficient in the absence of others.
3214 * [Now we resolve ties in favor of the earlier string if
3215 * it happens that c_offset_min has been invalidated, since the
3216 * earlier string may buy us something the later one won't.]
3220 data.longest_fixed = newSVpvs("");
3221 data.longest_float = newSVpvs("");
3222 data.last_found = newSVpvs("");
3223 data.longest = &(data.longest_fixed);
3225 if (!r->regstclass) {
3226 cl_init(pRExC_state, &ch_class);
3227 data.start_class = &ch_class;
3228 stclass_flag = SCF_DO_STCLASS_AND;
3229 } else /* XXXX Check for BOUND? */
3231 data.last_closep = &last_close;
3233 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3234 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3235 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3236 && data.last_start_min == 0 && data.last_end > 0
3237 && !RExC_seen_zerolen
3238 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3239 r->reganch |= ROPT_CHECK_ALL;
3240 scan_commit(pRExC_state, &data);
3241 SvREFCNT_dec(data.last_found);
3243 longest_float_length = CHR_SVLEN(data.longest_float);
3244 if (longest_float_length
3245 || (data.flags & SF_FL_BEFORE_EOL
3246 && (!(data.flags & SF_FL_BEFORE_MEOL)
3247 || (RExC_flags & PMf_MULTILINE)))) {
3250 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3251 && data.offset_fixed == data.offset_float_min
3252 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3253 goto remove_float; /* As in (a)+. */
3255 if (SvUTF8(data.longest_float)) {
3256 r->float_utf8 = data.longest_float;
3257 r->float_substr = NULL;
3259 r->float_substr = data.longest_float;
3260 r->float_utf8 = NULL;
3262 r->float_min_offset = data.offset_float_min;
3263 r->float_max_offset = data.offset_float_max;
3264 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3265 && (!(data.flags & SF_FL_BEFORE_MEOL)
3266 || (RExC_flags & PMf_MULTILINE)));
3267 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3271 r->float_substr = r->float_utf8 = NULL;
3272 SvREFCNT_dec(data.longest_float);
3273 longest_float_length = 0;
3276 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3277 if (longest_fixed_length
3278 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3279 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3280 || (RExC_flags & PMf_MULTILINE)))) {
3283 if (SvUTF8(data.longest_fixed)) {
3284 r->anchored_utf8 = data.longest_fixed;
3285 r->anchored_substr = NULL;
3287 r->anchored_substr = data.longest_fixed;
3288 r->anchored_utf8 = NULL;
3290 r->anchored_offset = data.offset_fixed;
3291 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3292 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3293 || (RExC_flags & PMf_MULTILINE)));
3294 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3297 r->anchored_substr = r->anchored_utf8 = NULL;
3298 SvREFCNT_dec(data.longest_fixed);
3299 longest_fixed_length = 0;
3302 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3303 r->regstclass = NULL;
3304 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3306 && !(data.start_class->flags & ANYOF_EOS)
3307 && !cl_is_anything(data.start_class))
3309 const I32 n = add_data(pRExC_state, 1, "f");
3311 Newx(RExC_rx->data->data[n], 1,
3312 struct regnode_charclass_class);
3313 StructCopy(data.start_class,
3314 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3315 struct regnode_charclass_class);
3316 r->regstclass = (regnode*)RExC_rx->data->data[n];
3317 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3318 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3319 regprop(r, sv, (regnode*)data.start_class);
3320 PerlIO_printf(Perl_debug_log,
3321 "synthetic stclass \"%s\".\n",
3322 SvPVX_const(sv));});
3325 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3326 if (longest_fixed_length > longest_float_length) {
3327 r->check_substr = r->anchored_substr;
3328 r->check_utf8 = r->anchored_utf8;
3329 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3330 if (r->reganch & ROPT_ANCH_SINGLE)
3331 r->reganch |= ROPT_NOSCAN;
3334 r->check_substr = r->float_substr;
3335 r->check_utf8 = r->float_utf8;
3336 r->check_offset_min = data.offset_float_min;
3337 r->check_offset_max = data.offset_float_max;
3339 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3340 This should be changed ASAP! */
3341 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3342 r->reganch |= RE_USE_INTUIT;
3343 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3344 r->reganch |= RE_INTUIT_TAIL;
3348 /* Several toplevels. Best we can is to set minlen. */
3350 struct regnode_charclass_class ch_class;
3353 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3354 scan = r->program + 1;
3355 cl_init(pRExC_state, &ch_class);
3356 data.start_class = &ch_class;
3357 data.last_closep = &last_close;
3358 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3359 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3360 = r->float_substr = r->float_utf8 = NULL;
3361 if (!(data.start_class->flags & ANYOF_EOS)
3362 && !cl_is_anything(data.start_class))
3364 const I32 n = add_data(pRExC_state, 1, "f");
3366 Newx(RExC_rx->data->data[n], 1,
3367 struct regnode_charclass_class);
3368 StructCopy(data.start_class,
3369 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3370 struct regnode_charclass_class);
3371 r->regstclass = (regnode*)RExC_rx->data->data[n];
3372 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3373 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3374 regprop(r, sv, (regnode*)data.start_class);
3375 PerlIO_printf(Perl_debug_log,
3376 "synthetic stclass \"%s\".\n",
3377 SvPVX_const(sv));});
3382 if (RExC_seen & REG_SEEN_GPOS)
3383 r->reganch |= ROPT_GPOS_SEEN;
3384 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3385 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3386 if (RExC_seen & REG_SEEN_EVAL)
3387 r->reganch |= ROPT_EVAL_SEEN;
3388 if (RExC_seen & REG_SEEN_CANY)
3389 r->reganch |= ROPT_CANY_SEEN;
3390 Newxz(r->startp, RExC_npar, I32);
3391 Newxz(r->endp, RExC_npar, I32);
3393 if (SvIV(re_debug_flags)> (RE_DEBUG_COMPILE | RE_DEBUG_EXECUTE))
3394 PerlIO_printf(Perl_debug_log,"Final program:\n");
3401 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3402 int rem=(int)(RExC_end - RExC_parse); \
3411 if (RExC_lastparse!=RExC_parse) \
3412 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3415 iscut ? "..." : "<" \
3418 PerlIO_printf(Perl_debug_log,"%16s",""); \
3423 num=REG_NODE_NUM(RExC_emit); \
3424 if (RExC_lastnum!=num) \
3425 PerlIO_printf(Perl_debug_log,"%4d",num); \
3427 PerlIO_printf(Perl_debug_log,"%4s",""); \
3428 PerlIO_printf(Perl_debug_log,"%*s%-4s", \
3429 (int)(10+(depth*2)), "", \
3433 RExC_lastparse=RExC_parse; \
3436 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3437 DEBUG_PARSE_MSG((funcname)); \
3438 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3441 - reg - regular expression, i.e. main body or parenthesized thing
3443 * Caller must absorb opening parenthesis.
3445 * Combining parenthesis handling with the base level of regular expression
3446 * is a trifle forced, but the need to tie the tails of the branches to what
3447 * follows makes it hard to avoid.
3449 #define REGTAIL(x,y,z) regtail(x,y,z,depth+1)
3452 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
3453 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
3456 register regnode *ret; /* Will be the head of the group. */
3457 register regnode *br;
3458 register regnode *lastbr;
3459 register regnode *ender = NULL;
3460 register I32 parno = 0;
3462 const I32 oregflags = RExC_flags;
3463 bool have_branch = 0;
3466 /* for (?g), (?gc), and (?o) warnings; warning
3467 about (?c) will warn about (?g) -- japhy */
3469 #define WASTED_O 0x01
3470 #define WASTED_G 0x02
3471 #define WASTED_C 0x04
3472 #define WASTED_GC (0x02|0x04)
3473 I32 wastedflags = 0x00;
3475 char * parse_start = RExC_parse; /* MJD */
3476 char * const oregcomp_parse = RExC_parse;
3478 GET_RE_DEBUG_FLAGS_DECL;
3479 DEBUG_PARSE("reg ");
3482 *flagp = 0; /* Tentatively. */
3485 /* Make an OPEN node, if parenthesized. */
3487 if (*RExC_parse == '?') { /* (?...) */
3488 U32 posflags = 0, negflags = 0;
3489 U32 *flagsp = &posflags;
3490 bool is_logical = 0;
3491 const char * const seqstart = RExC_parse;
3494 paren = *RExC_parse++;
3495 ret = NULL; /* For look-ahead/behind. */
3497 case '<': /* (?<...) */
3498 RExC_seen |= REG_SEEN_LOOKBEHIND;
3499 if (*RExC_parse == '!')
3501 if (*RExC_parse != '=' && *RExC_parse != '!')
3504 case '=': /* (?=...) */
3505 case '!': /* (?!...) */
3506 RExC_seen_zerolen++;
3507 case ':': /* (?:...) */
3508 case '>': /* (?>...) */
3510 case '$': /* (?$...) */
3511 case '@': /* (?@...) */
3512 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3514 case '#': /* (?#...) */
3515 while (*RExC_parse && *RExC_parse != ')')
3517 if (*RExC_parse != ')')
3518 FAIL("Sequence (?#... not terminated");
3519 nextchar(pRExC_state);
3522 case 'p': /* (?p...) */
3523 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
3524 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
3526 case '?': /* (??...) */
3528 if (*RExC_parse != '{')
3530 paren = *RExC_parse++;
3532 case '{': /* (?{...}) */
3534 I32 count = 1, n = 0;
3536 char *s = RExC_parse;
3538 RExC_seen_zerolen++;
3539 RExC_seen |= REG_SEEN_EVAL;
3540 while (count && (c = *RExC_parse)) {
3551 if (*RExC_parse != ')') {
3553 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3557 OP_4tree *sop, *rop;
3558 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
3561 Perl_save_re_context(aTHX);
3562 rop = sv_compile_2op(sv, &sop, "re", &pad);
3563 sop->op_private |= OPpREFCOUNTED;
3564 /* re_dup will OpREFCNT_inc */
3565 OpREFCNT_set(sop, 1);
3568 n = add_data(pRExC_state, 3, "nop");
3569 RExC_rx->data->data[n] = (void*)rop;
3570 RExC_rx->data->data[n+1] = (void*)sop;
3571 RExC_rx->data->data[n+2] = (void*)pad;
3574 else { /* First pass */
3575 if (PL_reginterp_cnt < ++RExC_seen_evals
3577 /* No compiled RE interpolated, has runtime
3578 components ===> unsafe. */
3579 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3580 if (PL_tainting && PL_tainted)
3581 FAIL("Eval-group in insecure regular expression");
3582 #if PERL_VERSION > 8
3583 if (IN_PERL_COMPILETIME)
3588 nextchar(pRExC_state);
3590 ret = reg_node(pRExC_state, LOGICAL);
3593 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
3594 /* deal with the length of this later - MJD */
3597 ret = reganode(pRExC_state, EVAL, n);
3598 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3599 Set_Node_Offset(ret, parse_start);
3602 case '(': /* (?(?{...})...) and (?(?=...)...) */
3604 if (RExC_parse[0] == '?') { /* (?(?...)) */
3605 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3606 || RExC_parse[1] == '<'
3607 || RExC_parse[1] == '{') { /* Lookahead or eval. */
3610 ret = reg_node(pRExC_state, LOGICAL);
3613 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
3617 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
3620 parno = atoi(RExC_parse++);
3622 while (isDIGIT(*RExC_parse))
3624 ret = reganode(pRExC_state, GROUPP, parno);
3626 if ((c = *nextchar(pRExC_state)) != ')')
3627 vFAIL("Switch condition not recognized");
3629 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3630 br = regbranch(pRExC_state, &flags, 1,depth+1);
3632 br = reganode(pRExC_state, LONGJMP, 0);
3634 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3635 c = *nextchar(pRExC_state);
3639 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3640 regbranch(pRExC_state, &flags, 1,depth+1);
3641 REGTAIL(pRExC_state, ret, lastbr);
3644 c = *nextchar(pRExC_state);
3649 vFAIL("Switch (?(condition)... contains too many branches");
3650 ender = reg_node(pRExC_state, TAIL);
3651 REGTAIL(pRExC_state, br, ender);
3653 REGTAIL(pRExC_state, lastbr, ender);
3654 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
3657 REGTAIL(pRExC_state, ret, ender);
3661 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
3665 RExC_parse--; /* for vFAIL to print correctly */
3666 vFAIL("Sequence (? incomplete");
3670 parse_flags: /* (?i) */
3671 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
3672 /* (?g), (?gc) and (?o) are useless here
3673 and must be globally applied -- japhy */
3675 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3676 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3677 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
3678 if (! (wastedflags & wflagbit) ) {
3679 wastedflags |= wflagbit;
3682 "Useless (%s%c) - %suse /%c modifier",
3683 flagsp == &negflags ? "?-" : "?",
3685 flagsp == &negflags ? "don't " : "",
3691 else if (*RExC_parse == 'c') {
3692 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3693 if (! (wastedflags & WASTED_C) ) {
3694 wastedflags |= WASTED_GC;
3697 "Useless (%sc) - %suse /gc modifier",
3698 flagsp == &negflags ? "?-" : "?",
3699 flagsp == &negflags ? "don't " : ""
3704 else { pmflag(flagsp, *RExC_parse); }
3708 if (*RExC_parse == '-') {
3710 wastedflags = 0; /* reset so (?g-c) warns twice */
3714 RExC_flags |= posflags;
3715 RExC_flags &= ~negflags;
3716 if (*RExC_parse == ':') {
3722 if (*RExC_parse != ')') {
3724 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
3726 nextchar(pRExC_state);
3734 ret = reganode(pRExC_state, OPEN, parno);
3735 Set_Node_Length(ret, 1); /* MJD */
3736 Set_Node_Offset(ret, RExC_parse); /* MJD */
3743 /* Pick up the branches, linking them together. */
3744 parse_start = RExC_parse; /* MJD */
3745 br = regbranch(pRExC_state, &flags, 1,depth+1);
3746 /* branch_len = (paren != 0); */
3750 if (*RExC_parse == '|') {
3751 if (!SIZE_ONLY && RExC_extralen) {
3752 reginsert(pRExC_state, BRANCHJ, br);
3755 reginsert(pRExC_state, BRANCH, br);
3756 Set_Node_Length(br, paren != 0);
3757 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3761 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
3763 else if (paren == ':') {
3764 *flagp |= flags&SIMPLE;
3766 if (is_open) { /* Starts with OPEN. */
3767 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
3769 else if (paren != '?') /* Not Conditional */
3771 *flagp |= flags & (SPSTART | HASWIDTH);
3773 while (*RExC_parse == '|') {
3774 if (!SIZE_ONLY && RExC_extralen) {
3775 ender = reganode(pRExC_state, LONGJMP,0);
3776 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
3779 RExC_extralen += 2; /* Account for LONGJMP. */
3780 nextchar(pRExC_state);
3781 br = regbranch(pRExC_state, &flags, 0, depth+1);
3785 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
3789 *flagp |= flags&SPSTART;
3792 if (have_branch || paren != ':') {
3793 /* Make a closing node, and hook it on the end. */
3796 ender = reg_node(pRExC_state, TAIL);
3799 ender = reganode(pRExC_state, CLOSE, parno);
3800 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3801 Set_Node_Length(ender,1); /* MJD */
3807 *flagp &= ~HASWIDTH;
3810 ender = reg_node(pRExC_state, SUCCEED);
3813 ender = reg_node(pRExC_state, END);
3816 REGTAIL(pRExC_state, lastbr, ender);
3818 if (have_branch && !SIZE_ONLY) {
3819 /* Hook the tails of the branches to the closing node. */
3821 for (br = ret; br; br = regnext(br)) {
3822 const U8 op = PL_regkind[OP(br)];
3825 exact_ret=regtail_study(pRExC_state, NEXTOPER(br), ender,depth+1);
3827 else if (op == BRANCHJ) {
3828 exact_ret=regtail_study(pRExC_state, NEXTOPER(NEXTOPER(br)), ender,depth+1);
3830 if ( exact == PSEUDO )
3832 else if ( exact != exact_ret )
3840 static const char parens[] = "=!<,>";
3842 if (paren && (p = strchr(parens, paren))) {
3843 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
3844 int flag = (p - parens) > 1;
3847 node = SUSPEND, flag = 0;
3848 reginsert(pRExC_state, node,ret);
3849 Set_Node_Cur_Length(ret);
3850 Set_Node_Offset(ret, parse_start + 1);
3852 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, TAIL));
3856 /* Check for proper termination. */
3858 RExC_flags = oregflags;
3859 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3860 RExC_parse = oregcomp_parse;
3861 vFAIL("Unmatched (");
3864 else if (!paren && RExC_parse < RExC_end) {
3865 if (*RExC_parse == ')') {
3867 vFAIL("Unmatched )");
3870 FAIL("Junk on end of regexp"); /* "Can't happen". */
3878 - regbranch - one alternative of an | operator
3880 * Implements the concatenation operator.
3883 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
3886 register regnode *ret;
3887 register regnode *chain = NULL;
3888 register regnode *latest;
3889 I32 flags = 0, c = 0;
3890 GET_RE_DEBUG_FLAGS_DECL;
3891 DEBUG_PARSE("brnc");
3895 if (!SIZE_ONLY && RExC_extralen)
3896 ret = reganode(pRExC_state, BRANCHJ,0);
3898 ret = reg_node(pRExC_state, BRANCH);
3899 Set_Node_Length(ret, 1);
3903 if (!first && SIZE_ONLY)
3904 RExC_extralen += 1; /* BRANCHJ */
3906 *flagp = WORST; /* Tentatively. */
3909 nextchar(pRExC_state);
3910 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
3912 latest = regpiece(pRExC_state, &flags,depth+1);
3913 if (latest == NULL) {
3914 if (flags & TRYAGAIN)
3918 else if (ret == NULL)
3920 *flagp |= flags&HASWIDTH;
3921 if (chain == NULL) /* First piece. */
3922 *flagp |= flags&SPSTART;
3925 REGTAIL(pRExC_state, chain, latest);
3930 if (chain == NULL) { /* Loop ran zero times. */
3931 chain = reg_node(pRExC_state, NOTHING);
3936 *flagp |= flags&SIMPLE;
3943 - regpiece - something followed by possible [*+?]
3945 * Note that the branching code sequences used for ? and the general cases
3946 * of * and + are somewhat optimized: they use the same NOTHING node as
3947 * both the endmarker for their branch list and the body of the last branch.
3948 * It might seem that this node could be dispensed with entirely, but the
3949 * endmarker role is not redundant.
3952 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
3955 register regnode *ret;
3957 register char *next;
3959 const char * const origparse = RExC_parse;
3961 I32 max = REG_INFTY;
3963 GET_RE_DEBUG_FLAGS_DECL;
3964 DEBUG_PARSE("piec");
3966 ret = regatom(pRExC_state, &flags,depth+1);
3968 if (flags & TRYAGAIN)
3975 if (op == '{' && regcurly(RExC_parse)) {
3976 const char *maxpos = NULL;
3977 parse_start = RExC_parse; /* MJD */
3978 next = RExC_parse + 1;
3979 while (isDIGIT(*next) || *next == ',') {
3988 if (*next == '}') { /* got one */
3992 min = atoi(RExC_parse);
3996 maxpos = RExC_parse;
3998 if (!max && *maxpos != '0')
3999 max = REG_INFTY; /* meaning "infinity" */
4000 else if (max >= REG_INFTY)
4001 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4003 nextchar(pRExC_state);
4006 if ((flags&SIMPLE)) {
4007 RExC_naughty += 2 + RExC_naughty / 2;
4008 reginsert(pRExC_state, CURLY, ret);
4009 Set_Node_Offset(ret, parse_start+1); /* MJD */
4010 Set_Node_Cur_Length(ret);
4013 regnode * const w = reg_node(pRExC_state, WHILEM);
4016 REGTAIL(pRExC_state, ret, w);
4017 if (!SIZE_ONLY && RExC_extralen) {
4018 reginsert(pRExC_state, LONGJMP,ret);
4019 reginsert(pRExC_state, NOTHING,ret);
4020 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4022 reginsert(pRExC_state, CURLYX,ret);
4024 Set_Node_Offset(ret, parse_start+1);
4025 Set_Node_Length(ret,
4026 op == '{' ? (RExC_parse - parse_start) : 1);
4028 if (!SIZE_ONLY && RExC_extralen)
4029 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4030 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4032 RExC_whilem_seen++, RExC_extralen += 3;
4033 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4041 if (max && max < min)
4042 vFAIL("Can't do {n,m} with n > m");
4044 ARG1_SET(ret, (U16)min);
4045 ARG2_SET(ret, (U16)max);
4057 #if 0 /* Now runtime fix should be reliable. */
4059 /* if this is reinstated, don't forget to put this back into perldiag:
4061 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4063 (F) The part of the regexp subject to either the * or + quantifier
4064 could match an empty string. The {#} shows in the regular
4065 expression about where the problem was discovered.
4069 if (!(flags&HASWIDTH) && op != '?')
4070 vFAIL("Regexp *+ operand could be empty");
4073 parse_start = RExC_parse;
4074 nextchar(pRExC_state);
4076 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4078 if (op == '*' && (flags&SIMPLE)) {
4079 reginsert(pRExC_state, STAR, ret);
4083 else if (op == '*') {
4087 else if (op == '+' && (flags&SIMPLE)) {
4088 reginsert(pRExC_state, PLUS, ret);
4092 else if (op == '+') {
4096 else if (op == '?') {
4101 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4103 "%.*s matches null string many times",
4104 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4108 if (*RExC_parse == '?') {
4109 nextchar(pRExC_state);
4110 reginsert(pRExC_state, MINMOD, ret);
4111 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4113 if (ISMULT2(RExC_parse)) {
4115 vFAIL("Nested quantifiers");
4122 - regatom - the lowest level
4124 * Optimization: gobbles an entire sequence of ordinary characters so that
4125 * it can turn them into a single node, which is smaller to store and
4126 * faster to run. Backslashed characters are exceptions, each becoming a
4127 * separate node; the code is simpler that way and it's not worth fixing.
4129 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4130 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4133 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4136 register regnode *ret = NULL;
4138 char *parse_start = RExC_parse;
4139 GET_RE_DEBUG_FLAGS_DECL;
4140 DEBUG_PARSE("atom");
4141 *flagp = WORST; /* Tentatively. */
4144 switch (*RExC_parse) {
4146 RExC_seen_zerolen++;
4147 nextchar(pRExC_state);
4148 if (RExC_flags & PMf_MULTILINE)
4149 ret = reg_node(pRExC_state, MBOL);
4150 else if (RExC_flags & PMf_SINGLELINE)
4151 ret = reg_node(pRExC_state, SBOL);
4153 ret = reg_node(pRExC_state, BOL);
4154 Set_Node_Length(ret, 1); /* MJD */
4157 nextchar(pRExC_state);
4159 RExC_seen_zerolen++;
4160 if (RExC_flags & PMf_MULTILINE)
4161 ret = reg_node(pRExC_state, MEOL);
4162 else if (RExC_flags & PMf_SINGLELINE)
4163 ret = reg_node(pRExC_state, SEOL);
4165 ret = reg_node(pRExC_state, EOL);
4166 Set_Node_Length(ret, 1); /* MJD */
4169 nextchar(pRExC_state);
4170 if (RExC_flags & PMf_SINGLELINE)
4171 ret = reg_node(pRExC_state, SANY);
4173 ret = reg_node(pRExC_state, REG_ANY);
4174 *flagp |= HASWIDTH|SIMPLE;
4176 Set_Node_Length(ret, 1); /* MJD */
4180 char * const oregcomp_parse = ++RExC_parse;
4181 ret = regclass(pRExC_state,depth+1);
4182 if (*RExC_parse != ']') {
4183 RExC_parse = oregcomp_parse;
4184 vFAIL("Unmatched [");
4186 nextchar(pRExC_state);
4187 *flagp |= HASWIDTH|SIMPLE;
4188 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4192 nextchar(pRExC_state);
4193 ret = reg(pRExC_state, 1, &flags,depth+1);
4195 if (flags & TRYAGAIN) {
4196 if (RExC_parse == RExC_end) {
4197 /* Make parent create an empty node if needed. */
4205 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4209 if (flags & TRYAGAIN) {
4213 vFAIL("Internal urp");
4214 /* Supposed to be caught earlier. */
4217 if (!regcurly(RExC_parse)) {
4226 vFAIL("Quantifier follows nothing");
4229 switch (*++RExC_parse) {
4231 RExC_seen_zerolen++;
4232 ret = reg_node(pRExC_state, SBOL);
4234 nextchar(pRExC_state);
4235 Set_Node_Length(ret, 2); /* MJD */
4238 ret = reg_node(pRExC_state, GPOS);
4239 RExC_seen |= REG_SEEN_GPOS;
4241 nextchar(pRExC_state);
4242 Set_Node_Length(ret, 2); /* MJD */
4245 ret = reg_node(pRExC_state, SEOL);
4247 RExC_seen_zerolen++; /* Do not optimize RE away */
4248 nextchar(pRExC_state);
4251 ret = reg_node(pRExC_state, EOS);
4253 RExC_seen_zerolen++; /* Do not optimize RE away */
4254 nextchar(pRExC_state);
4255 Set_Node_Length(ret, 2); /* MJD */
4258 ret = reg_node(pRExC_state, CANY);
4259 RExC_seen |= REG_SEEN_CANY;
4260 *flagp |= HASWIDTH|SIMPLE;
4261 nextchar(pRExC_state);
4262 Set_Node_Length(ret, 2); /* MJD */
4265 ret = reg_node(pRExC_state, CLUMP);
4267 nextchar(pRExC_state);
4268 Set_Node_Length(ret, 2); /* MJD */
4271 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4272 *flagp |= HASWIDTH|SIMPLE;
4273 nextchar(pRExC_state);
4274 Set_Node_Length(ret, 2); /* MJD */
4277 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4278 *flagp |= HASWIDTH|SIMPLE;
4279 nextchar(pRExC_state);
4280 Set_Node_Length(ret, 2); /* MJD */
4283 RExC_seen_zerolen++;
4284 RExC_seen |= REG_SEEN_LOOKBEHIND;
4285 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4287 nextchar(pRExC_state);
4288 Set_Node_Length(ret, 2); /* MJD */
4291 RExC_seen_zerolen++;
4292 RExC_seen |= REG_SEEN_LOOKBEHIND;
4293 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4295 nextchar(pRExC_state);
4296 Set_Node_Length(ret, 2); /* MJD */
4299 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4300 *flagp |= HASWIDTH|SIMPLE;
4301 nextchar(pRExC_state);
4302 Set_Node_Length(ret, 2); /* MJD */
4305 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4306 *flagp |= HASWIDTH|SIMPLE;
4307 nextchar(pRExC_state);
4308 Set_Node_Length(ret, 2); /* MJD */
4311 ret = reg_node(pRExC_state, DIGIT);
4312 *flagp |= HASWIDTH|SIMPLE;
4313 nextchar(pRExC_state);
4314 Set_Node_Length(ret, 2); /* MJD */
4317 ret = reg_node(pRExC_state, NDIGIT);
4318 *flagp |= HASWIDTH|SIMPLE;
4319 nextchar(pRExC_state);
4320 Set_Node_Length(ret, 2); /* MJD */
4325 char* const oldregxend = RExC_end;
4326 char* parse_start = RExC_parse - 2;
4328 if (RExC_parse[1] == '{') {
4329 /* a lovely hack--pretend we saw [\pX] instead */
4330 RExC_end = strchr(RExC_parse, '}');
4332 const U8 c = (U8)*RExC_parse;
4334 RExC_end = oldregxend;
4335 vFAIL2("Missing right brace on \\%c{}", c);
4340 RExC_end = RExC_parse + 2;
4341 if (RExC_end > oldregxend)
4342 RExC_end = oldregxend;
4346 ret = regclass(pRExC_state,depth+1);
4348 RExC_end = oldregxend;
4351 Set_Node_Offset(ret, parse_start + 2);
4352 Set_Node_Cur_Length(ret);
4353 nextchar(pRExC_state);
4354 *flagp |= HASWIDTH|SIMPLE;
4367 case '1': case '2': case '3': case '4':
4368 case '5': case '6': case '7': case '8': case '9':
4370 const I32 num = atoi(RExC_parse);
4372 if (num > 9 && num >= RExC_npar)
4375 char * const parse_start = RExC_parse - 1; /* MJD */
4376 while (isDIGIT(*RExC_parse))
4379 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4380 vFAIL("Reference to nonexistent group");
4382 ret = reganode(pRExC_state,
4383 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4387 /* override incorrect value set in reganode MJD */
4388 Set_Node_Offset(ret, parse_start+1);
4389 Set_Node_Cur_Length(ret); /* MJD */
4391 nextchar(pRExC_state);
4396 if (RExC_parse >= RExC_end)
4397 FAIL("Trailing \\");
4400 /* Do not generate "unrecognized" warnings here, we fall
4401 back into the quick-grab loop below */
4408 if (RExC_flags & PMf_EXTENDED) {
4409 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4411 if (RExC_parse < RExC_end)
4417 register STRLEN len;
4422 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4424 parse_start = RExC_parse - 1;
4430 ret = reg_node(pRExC_state,
4431 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4433 for (len = 0, p = RExC_parse - 1;
4434 len < 127 && p < RExC_end;
4437 char * const oldp = p;
4439 if (RExC_flags & PMf_EXTENDED)
4440 p = regwhite(p, RExC_end);
4487 ender = ASCII_TO_NATIVE('\033');
4491 ender = ASCII_TO_NATIVE('\007');
4496 char* const e = strchr(p, '}');
4500 vFAIL("Missing right brace on \\x{}");
4503 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4504 | PERL_SCAN_DISALLOW_PREFIX;
4505 STRLEN numlen = e - p - 1;
4506 ender = grok_hex(p + 1, &numlen, &flags, NULL);
4513 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
4515 ender = grok_hex(p, &numlen, &flags, NULL);
4521 ender = UCHARAT(p++);
4522 ender = toCTRL(ender);
4524 case '0': case '1': case '2': case '3':case '4':
4525 case '5': case '6': case '7': case '8':case '9':
4527 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
4530 ender = grok_oct(p, &numlen, &flags, NULL);
4540 FAIL("Trailing \\");
4543 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4544 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
4545 goto normal_default;
4550 if (UTF8_IS_START(*p) && UTF) {
4552 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
4553 &numlen, UTF8_ALLOW_DEFAULT);
4560 if (RExC_flags & PMf_EXTENDED)
4561 p = regwhite(p, RExC_end);
4563 /* Prime the casefolded buffer. */
4564 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
4566 if (ISMULT2(p)) { /* Back off on ?+*. */
4571 /* Emit all the Unicode characters. */
4573 for (foldbuf = tmpbuf;
4575 foldlen -= numlen) {
4576 ender = utf8_to_uvchr(foldbuf, &numlen);
4578 const STRLEN unilen = reguni(pRExC_state, ender, s);
4581 /* In EBCDIC the numlen
4582 * and unilen can differ. */
4584 if (numlen >= foldlen)
4588 break; /* "Can't happen." */
4592 const STRLEN unilen = reguni(pRExC_state, ender, s);
4601 REGC((char)ender, s++);
4607 /* Emit all the Unicode characters. */
4609 for (foldbuf = tmpbuf;
4611 foldlen -= numlen) {
4612 ender = utf8_to_uvchr(foldbuf, &numlen);
4614 const STRLEN unilen = reguni(pRExC_state, ender, s);
4617 /* In EBCDIC the numlen
4618 * and unilen can differ. */
4620 if (numlen >= foldlen)
4628 const STRLEN unilen = reguni(pRExC_state, ender, s);
4637 REGC((char)ender, s++);
4641 Set_Node_Cur_Length(ret); /* MJD */
4642 nextchar(pRExC_state);
4644 /* len is STRLEN which is unsigned, need to copy to signed */
4647 vFAIL("Internal disaster");
4651 if (len == 1 && UNI_IS_INVARIANT(ender))
4655 RExC_size += STR_SZ(len);
4658 RExC_emit += STR_SZ(len);
4664 /* If the encoding pragma is in effect recode the text of
4665 * any EXACT-kind nodes. */
4666 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4667 const STRLEN oldlen = STR_LEN(ret);
4668 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4672 if (sv_utf8_downgrade(sv, TRUE)) {
4673 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4674 const STRLEN newlen = SvCUR(sv);
4679 GET_RE_DEBUG_FLAGS_DECL;
4680 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
4681 (int)oldlen, STRING(ret),
4683 Copy(s, STRING(ret), newlen, char);
4684 STR_LEN(ret) += newlen - oldlen;
4685 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4687 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4695 S_regwhite(char *p, const char *e)
4700 else if (*p == '#') {
4703 } while (p < e && *p != '\n');
4711 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4712 Character classes ([:foo:]) can also be negated ([:^foo:]).
4713 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4714 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
4715 but trigger failures because they are currently unimplemented. */
4717 #define POSIXCC_DONE(c) ((c) == ':')
4718 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4719 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4722 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
4725 I32 namedclass = OOB_NAMEDCLASS;
4727 if (value == '[' && RExC_parse + 1 < RExC_end &&
4728 /* I smell either [: or [= or [. -- POSIX has been here, right? */
4729 POSIXCC(UCHARAT(RExC_parse))) {
4730 const char c = UCHARAT(RExC_parse);
4731 char* const s = RExC_parse++;
4733 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
4735 if (RExC_parse == RExC_end)
4736 /* Grandfather lone [:, [=, [. */
4739 const char* const t = RExC_parse++; /* skip over the c */
4742 if (UCHARAT(RExC_parse) == ']') {
4743 const char *posixcc = s + 1;
4744 RExC_parse++; /* skip over the ending ] */
4747 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4748 const I32 skip = t - posixcc;
4750 /* Initially switch on the length of the name. */
4753 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
4754 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4757 /* Names all of length 5. */
4758 /* alnum alpha ascii blank cntrl digit graph lower
4759 print punct space upper */
4760 /* Offset 4 gives the best switch position. */
4761 switch (posixcc[4]) {
4763 if (memEQ(posixcc, "alph", 4)) /* alpha */
4764 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4767 if (memEQ(posixcc, "spac", 4)) /* space */
4768 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4771 if (memEQ(posixcc, "grap", 4)) /* graph */
4772 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4775 if (memEQ(posixcc, "asci", 4)) /* ascii */
4776 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
4779 if (memEQ(posixcc, "blan", 4)) /* blank */
4780 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4783 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
4784 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4787 if (memEQ(posixcc, "alnu", 4)) /* alnum */
4788 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4791 if (memEQ(posixcc, "lowe", 4)) /* lower */
4792 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4793 else if (memEQ(posixcc, "uppe", 4)) /* upper */
4794 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4797 if (memEQ(posixcc, "digi", 4)) /* digit */
4798 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4799 else if (memEQ(posixcc, "prin", 4)) /* print */
4800 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4801 else if (memEQ(posixcc, "punc", 4)) /* punct */
4802 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4807 if (memEQ(posixcc, "xdigit", 6))
4808 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
4812 if (namedclass == OOB_NAMEDCLASS)
4813 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4815 assert (posixcc[skip] == ':');
4816 assert (posixcc[skip+1] == ']');
4817 } else if (!SIZE_ONLY) {
4818 /* [[=foo=]] and [[.foo.]] are still future. */
4820 /* adjust RExC_parse so the warning shows after
4822 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
4824 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4827 /* Maternal grandfather:
4828 * "[:" ending in ":" but not in ":]" */
4838 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
4841 if (POSIXCC(UCHARAT(RExC_parse))) {
4842 const char *s = RExC_parse;
4843 const char c = *s++;
4847 if (*s && c == *s && s[1] == ']') {
4848 if (ckWARN(WARN_REGEXP))
4850 "POSIX syntax [%c %c] belongs inside character classes",
4853 /* [[=foo=]] and [[.foo.]] are still future. */
4854 if (POSIXCC_NOTYET(c)) {
4855 /* adjust RExC_parse so the error shows after
4857 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
4859 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4867 parse a class specification and produce either an ANYOF node that
4868 matches the pattern. If the pattern matches a single char only and
4869 that char is < 256 then we produce an EXACT node instead.
4872 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
4875 register UV value = 0;
4876 register UV nextvalue;
4877 register IV prevvalue = OOB_UNICODE;
4878 register IV range = 0;
4879 register regnode *ret;
4882 char *rangebegin = NULL;
4883 bool need_class = 0;
4886 bool optimize_invert = TRUE;
4887 AV* unicode_alternate = NULL;
4889 UV literal_endpoint = 0;
4891 UV stored = 0; /* number of chars stored in the class */
4893 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
4894 case we need to change the emitted regop to an EXACT. */
4895 GET_RE_DEBUG_FLAGS_DECL;
4896 DEBUG_PARSE("clas");
4898 /* Assume we are going to generate an ANYOF node. */
4899 ret = reganode(pRExC_state, ANYOF, 0);
4902 ANYOF_FLAGS(ret) = 0;
4904 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
4908 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4912 RExC_size += ANYOF_SKIP;
4913 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4916 RExC_emit += ANYOF_SKIP;
4918 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4920 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
4921 ANYOF_BITMAP_ZERO(ret);
4922 listsv = newSVpvs("# comment\n");
4925 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4927 if (!SIZE_ONLY && POSIXCC(nextvalue))
4928 checkposixcc(pRExC_state);
4930 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4931 if (UCHARAT(RExC_parse) == ']')
4934 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
4938 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4941 rangebegin = RExC_parse;
4943 value = utf8n_to_uvchr((U8*)RExC_parse,
4944 RExC_end - RExC_parse,
4945 &numlen, UTF8_ALLOW_DEFAULT);
4946 RExC_parse += numlen;
4949 value = UCHARAT(RExC_parse++);
4951 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4952 if (value == '[' && POSIXCC(nextvalue))
4953 namedclass = regpposixcc(pRExC_state, value);
4954 else if (value == '\\') {
4956 value = utf8n_to_uvchr((U8*)RExC_parse,
4957 RExC_end - RExC_parse,
4958 &numlen, UTF8_ALLOW_DEFAULT);
4959 RExC_parse += numlen;
4962 value = UCHARAT(RExC_parse++);
4963 /* Some compilers cannot handle switching on 64-bit integer
4964 * values, therefore value cannot be an UV. Yes, this will
4965 * be a problem later if we want switch on Unicode.
4966 * A similar issue a little bit later when switching on
4967 * namedclass. --jhi */
4968 switch ((I32)value) {
4969 case 'w': namedclass = ANYOF_ALNUM; break;
4970 case 'W': namedclass = ANYOF_NALNUM; break;
4971 case 's': namedclass = ANYOF_SPACE; break;
4972 case 'S': namedclass = ANYOF_NSPACE; break;
4973 case 'd': namedclass = ANYOF_DIGIT; break;
4974 case 'D': namedclass = ANYOF_NDIGIT; break;
4979 if (RExC_parse >= RExC_end)
4980 vFAIL2("Empty \\%c{}", (U8)value);
4981 if (*RExC_parse == '{') {
4982 const U8 c = (U8)value;
4983 e = strchr(RExC_parse++, '}');
4985 vFAIL2("Missing right brace on \\%c{}", c);
4986 while (isSPACE(UCHARAT(RExC_parse)))
4988 if (e == RExC_parse)
4989 vFAIL2("Empty \\%c{}", c);
4991 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4999 if (UCHARAT(RExC_parse) == '^') {
5002 value = value == 'p' ? 'P' : 'p'; /* toggle */
5003 while (isSPACE(UCHARAT(RExC_parse))) {
5008 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5009 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5012 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5013 namedclass = ANYOF_MAX; /* no official name, but it's named */
5016 case 'n': value = '\n'; break;
5017 case 'r': value = '\r'; break;
5018 case 't': value = '\t'; break;
5019 case 'f': value = '\f'; break;
5020 case 'b': value = '\b'; break;
5021 case 'e': value = ASCII_TO_NATIVE('\033');break;
5022 case 'a': value = ASCII_TO_NATIVE('\007');break;
5024 if (*RExC_parse == '{') {
5025 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5026 | PERL_SCAN_DISALLOW_PREFIX;
5027 char * const e = strchr(RExC_parse++, '}');
5029 vFAIL("Missing right brace on \\x{}");
5031 numlen = e - RExC_parse;
5032 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5036 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5038 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5039 RExC_parse += numlen;
5043 value = UCHARAT(RExC_parse++);
5044 value = toCTRL(value);
5046 case '0': case '1': case '2': case '3': case '4':
5047 case '5': case '6': case '7': case '8': case '9':
5051 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5052 RExC_parse += numlen;
5056 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5058 "Unrecognized escape \\%c in character class passed through",
5062 } /* end of \blah */
5068 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5070 if (!SIZE_ONLY && !need_class)
5071 ANYOF_CLASS_ZERO(ret);
5075 /* a bad range like a-\d, a-[:digit:] ? */
5078 if (ckWARN(WARN_REGEXP)) {
5080 RExC_parse >= rangebegin ?
5081 RExC_parse - rangebegin : 0;
5083 "False [] range \"%*.*s\"",
5086 if (prevvalue < 256) {
5087 ANYOF_BITMAP_SET(ret, prevvalue);
5088 ANYOF_BITMAP_SET(ret, '-');
5091 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5092 Perl_sv_catpvf(aTHX_ listsv,
5093 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5097 range = 0; /* this was not a true range */
5101 const char *what = NULL;
5104 if (namedclass > OOB_NAMEDCLASS)
5105 optimize_invert = FALSE;
5106 /* Possible truncation here but in some 64-bit environments
5107 * the compiler gets heartburn about switch on 64-bit values.
5108 * A similar issue a little earlier when switching on value.
5110 switch ((I32)namedclass) {
5113 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5115 for (value = 0; value < 256; value++)
5117 ANYOF_BITMAP_SET(ret, value);
5124 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5126 for (value = 0; value < 256; value++)
5127 if (!isALNUM(value))
5128 ANYOF_BITMAP_SET(ret, value);
5135 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5137 for (value = 0; value < 256; value++)
5138 if (isALNUMC(value))
5139 ANYOF_BITMAP_SET(ret, value);
5146 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5148 for (value = 0; value < 256; value++)
5149 if (!isALNUMC(value))
5150 ANYOF_BITMAP_SET(ret, value);
5157 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5159 for (value = 0; value < 256; value++)
5161 ANYOF_BITMAP_SET(ret, value);
5168 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5170 for (value = 0; value < 256; value++)
5171 if (!isALPHA(value))
5172 ANYOF_BITMAP_SET(ret, value);
5179 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5182 for (value = 0; value < 128; value++)
5183 ANYOF_BITMAP_SET(ret, value);
5185 for (value = 0; value < 256; value++) {
5187 ANYOF_BITMAP_SET(ret, value);
5196 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5199 for (value = 128; value < 256; value++)
5200 ANYOF_BITMAP_SET(ret, value);
5202 for (value = 0; value < 256; value++) {
5203 if (!isASCII(value))
5204 ANYOF_BITMAP_SET(ret, value);
5213 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5215 for (value = 0; value < 256; value++)
5217 ANYOF_BITMAP_SET(ret, value);
5224 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5226 for (value = 0; value < 256; value++)
5227 if (!isBLANK(value))
5228 ANYOF_BITMAP_SET(ret, value);
5235 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5237 for (value = 0; value < 256; value++)
5239 ANYOF_BITMAP_SET(ret, value);
5246 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5248 for (value = 0; value < 256; value++)
5249 if (!isCNTRL(value))
5250 ANYOF_BITMAP_SET(ret, value);
5257 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5259 /* consecutive digits assumed */
5260 for (value = '0'; value <= '9'; value++)
5261 ANYOF_BITMAP_SET(ret, value);
5268 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5270 /* consecutive digits assumed */
5271 for (value = 0; value < '0'; value++)
5272 ANYOF_BITMAP_SET(ret, value);
5273 for (value = '9' + 1; value < 256; value++)
5274 ANYOF_BITMAP_SET(ret, value);
5281 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5283 for (value = 0; value < 256; value++)
5285 ANYOF_BITMAP_SET(ret, value);
5292 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5294 for (value = 0; value < 256; value++)
5295 if (!isGRAPH(value))
5296 ANYOF_BITMAP_SET(ret, value);
5303 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5305 for (value = 0; value < 256; value++)
5307 ANYOF_BITMAP_SET(ret, value);
5314 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5316 for (value = 0; value < 256; value++)
5317 if (!isLOWER(value))
5318 ANYOF_BITMAP_SET(ret, value);
5325 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5327 for (value = 0; value < 256; value++)
5329 ANYOF_BITMAP_SET(ret, value);
5336 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5338 for (value = 0; value < 256; value++)
5339 if (!isPRINT(value))
5340 ANYOF_BITMAP_SET(ret, value);
5347 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5349 for (value = 0; value < 256; value++)
5350 if (isPSXSPC(value))
5351 ANYOF_BITMAP_SET(ret, value);
5358 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5360 for (value = 0; value < 256; value++)
5361 if (!isPSXSPC(value))
5362 ANYOF_BITMAP_SET(ret, value);
5369 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5371 for (value = 0; value < 256; value++)
5373 ANYOF_BITMAP_SET(ret, value);
5380 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5382 for (value = 0; value < 256; value++)
5383 if (!isPUNCT(value))
5384 ANYOF_BITMAP_SET(ret, value);
5391 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5393 for (value = 0; value < 256; value++)
5395 ANYOF_BITMAP_SET(ret, value);
5402 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5404 for (value = 0; value < 256; value++)
5405 if (!isSPACE(value))
5406 ANYOF_BITMAP_SET(ret, value);
5413 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5415 for (value = 0; value < 256; value++)
5417 ANYOF_BITMAP_SET(ret, value);
5424 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5426 for (value = 0; value < 256; value++)
5427 if (!isUPPER(value))
5428 ANYOF_BITMAP_SET(ret, value);
5435 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5437 for (value = 0; value < 256; value++)
5438 if (isXDIGIT(value))
5439 ANYOF_BITMAP_SET(ret, value);
5446 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5448 for (value = 0; value < 256; value++)
5449 if (!isXDIGIT(value))
5450 ANYOF_BITMAP_SET(ret, value);
5456 /* this is to handle \p and \P */
5459 vFAIL("Invalid [::] class");
5463 /* Strings such as "+utf8::isWord\n" */
5464 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5467 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
5470 } /* end of namedclass \blah */
5473 if (prevvalue > (IV)value) /* b-a */ {
5474 const int w = RExC_parse - rangebegin;
5475 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
5476 range = 0; /* not a valid range */
5480 prevvalue = value; /* save the beginning of the range */
5481 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5482 RExC_parse[1] != ']') {
5485 /* a bad range like \w-, [:word:]- ? */
5486 if (namedclass > OOB_NAMEDCLASS) {
5487 if (ckWARN(WARN_REGEXP)) {
5489 RExC_parse >= rangebegin ?
5490 RExC_parse - rangebegin : 0;
5492 "False [] range \"%*.*s\"",
5496 ANYOF_BITMAP_SET(ret, '-');
5498 range = 1; /* yeah, it's a range! */
5499 continue; /* but do it the next time */
5503 /* now is the next time */
5504 stored += (value - prevvalue + 1);
5506 if (prevvalue < 256) {
5507 const IV ceilvalue = value < 256 ? value : 255;
5510 /* In EBCDIC [\x89-\x91] should include
5511 * the \x8e but [i-j] should not. */
5512 if (literal_endpoint == 2 &&
5513 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5514 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
5516 if (isLOWER(prevvalue)) {
5517 for (i = prevvalue; i <= ceilvalue; i++)
5519 ANYOF_BITMAP_SET(ret, i);
5521 for (i = prevvalue; i <= ceilvalue; i++)
5523 ANYOF_BITMAP_SET(ret, i);
5528 for (i = prevvalue; i <= ceilvalue; i++)
5529 ANYOF_BITMAP_SET(ret, i);
5531 if (value > 255 || UTF) {
5532 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5533 const UV natvalue = NATIVE_TO_UNI(value);
5535 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5536 if (prevnatvalue < natvalue) { /* what about > ? */
5537 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
5538 prevnatvalue, natvalue);
5540 else if (prevnatvalue == natvalue) {
5541 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
5543 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
5545 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
5547 /* If folding and foldable and a single
5548 * character, insert also the folded version
5549 * to the charclass. */
5551 if (foldlen == (STRLEN)UNISKIP(f))
5552 Perl_sv_catpvf(aTHX_ listsv,
5555 /* Any multicharacter foldings
5556 * require the following transform:
5557 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5558 * where E folds into "pq" and F folds
5559 * into "rst", all other characters
5560 * fold to single characters. We save
5561 * away these multicharacter foldings,
5562 * to be later saved as part of the
5563 * additional "s" data. */
5566 if (!unicode_alternate)
5567 unicode_alternate = newAV();
5568 sv = newSVpvn((char*)foldbuf, foldlen);
5570 av_push(unicode_alternate, sv);
5574 /* If folding and the value is one of the Greek
5575 * sigmas insert a few more sigmas to make the
5576 * folding rules of the sigmas to work right.
5577 * Note that not all the possible combinations
5578 * are handled here: some of them are handled
5579 * by the standard folding rules, and some of
5580 * them (literal or EXACTF cases) are handled
5581 * during runtime in regexec.c:S_find_byclass(). */
5582 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5583 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5584 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
5585 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5586 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5588 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5589 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
5590 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
5595 literal_endpoint = 0;
5599 range = 0; /* this range (if it was one) is done now */
5603 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
5605 RExC_size += ANYOF_CLASS_ADD_SKIP;
5607 RExC_emit += ANYOF_CLASS_ADD_SKIP;
5613 /****** !SIZE_ONLY AFTER HERE *********/
5615 if( stored == 1 && value < 256
5616 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5618 /* optimize single char class to an EXACT node
5619 but *only* when its not a UTF/high char */
5620 RExC_emit = orig_emit;
5621 ret = reg_node(pRExC_state,
5622 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5623 *STRING(ret)= (char)value;
5625 RExC_emit += STR_SZ(1);
5628 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
5629 if ( /* If the only flag is folding (plus possibly inversion). */
5630 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5632 for (value = 0; value < 256; ++value) {
5633 if (ANYOF_BITMAP_TEST(ret, value)) {
5634 UV fold = PL_fold[value];
5637 ANYOF_BITMAP_SET(ret, fold);
5640 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
5643 /* optimize inverted simple patterns (e.g. [^a-z]) */
5644 if (optimize_invert &&
5645 /* If the only flag is inversion. */
5646 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
5647 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
5648 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
5649 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
5652 AV * const av = newAV();
5654 /* The 0th element stores the character class description
5655 * in its textual form: used later (regexec.c:Perl_regclass_swash())
5656 * to initialize the appropriate swash (which gets stored in
5657 * the 1st element), and also useful for dumping the regnode.
5658 * The 2nd element stores the multicharacter foldings,
5659 * used later (regexec.c:S_reginclass()). */
5660 av_store(av, 0, listsv);
5661 av_store(av, 1, NULL);
5662 av_store(av, 2, (SV*)unicode_alternate);
5663 rv = newRV_noinc((SV*)av);
5664 n = add_data(pRExC_state, 1, "s");
5665 RExC_rx->data->data[n] = (void*)rv;
5672 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
5674 char* const retval = RExC_parse++;
5677 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5678 RExC_parse[2] == '#') {
5679 while (*RExC_parse != ')') {
5680 if (RExC_parse == RExC_end)
5681 FAIL("Sequence (?#... not terminated");
5687 if (RExC_flags & PMf_EXTENDED) {
5688 if (isSPACE(*RExC_parse)) {
5692 else if (*RExC_parse == '#') {
5693 while (RExC_parse < RExC_end)
5694 if (*RExC_parse++ == '\n') break;
5703 - reg_node - emit a node
5705 STATIC regnode * /* Location. */
5706 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
5709 register regnode *ptr;
5710 regnode * const ret = RExC_emit;
5713 SIZE_ALIGN(RExC_size);
5717 NODE_ALIGN_FILL(ret);
5719 FILL_ADVANCE_NODE(ptr, op);
5720 if (RExC_offsets) { /* MJD */
5721 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
5722 "reg_node", __LINE__,
5724 RExC_emit - RExC_emit_start > RExC_offsets[0]
5725 ? "Overwriting end of array!\n" : "OK",
5726 RExC_emit - RExC_emit_start,
5727 RExC_parse - RExC_start,
5729 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
5738 - reganode - emit a node with an argument
5740 STATIC regnode * /* Location. */
5741 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
5744 register regnode *ptr;
5745 regnode * const ret = RExC_emit;
5748 SIZE_ALIGN(RExC_size);
5753 NODE_ALIGN_FILL(ret);
5755 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
5756 if (RExC_offsets) { /* MJD */
5757 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5761 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5762 "Overwriting end of array!\n" : "OK",
5763 RExC_emit - RExC_emit_start,
5764 RExC_parse - RExC_start,
5766 Set_Cur_Node_Offset;
5775 - reguni - emit (if appropriate) a Unicode character
5778 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
5781 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
5785 - reginsert - insert an operator in front of already-emitted operand
5787 * Means relocating the operand.
5790 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
5793 register regnode *src;
5794 register regnode *dst;
5795 register regnode *place;
5796 const int offset = regarglen[(U8)op];
5798 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
5801 RExC_size += NODE_STEP_REGNODE + offset;
5806 RExC_emit += NODE_STEP_REGNODE + offset;
5808 while (src > opnd) {
5809 StructCopy(--src, --dst, regnode);
5810 if (RExC_offsets) { /* MJD 20010112 */
5811 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
5815 dst - RExC_emit_start > RExC_offsets[0]
5816 ? "Overwriting end of array!\n" : "OK",
5817 src - RExC_emit_start,
5818 dst - RExC_emit_start,
5820 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5821 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
5826 place = opnd; /* Op node, where operand used to be. */
5827 if (RExC_offsets) { /* MJD */
5828 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
5832 place - RExC_emit_start > RExC_offsets[0]
5833 ? "Overwriting end of array!\n" : "OK",
5834 place - RExC_emit_start,
5835 RExC_parse - RExC_start,
5837 Set_Node_Offset(place, RExC_parse);
5838 Set_Node_Length(place, 1);
5840 src = NEXTOPER(place);
5841 FILL_ADVANCE_NODE(place, op);
5842 Zero(src, offset, regnode);
5846 - regtail - set the next-pointer at the end of a node chain of p to val.
5847 - SEE ALSO: regtail_study
5849 /* TODO: All three parms should be const */
5851 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
5854 register regnode *scan;
5855 GET_RE_DEBUG_FLAGS_DECL;
5860 /* Find last node. */
5863 regnode * const temp = regnext(scan);
5865 SV * const mysv=sv_newmortal();
5866 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
5867 regprop(RExC_rx, mysv, scan);
5868 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
5869 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
5876 if (reg_off_by_arg[OP(scan)]) {
5877 ARG_SET(scan, val - scan);
5880 NEXT_OFF(scan) = val - scan;
5885 - regtail_study - set the next-pointer at the end of a node chain of p to val.
5886 - Look for optimizable sequences at the same time.
5887 - currently only looks for EXACT chains.
5889 /* TODO: All four parms should be const */
5891 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
5894 register regnode *scan;
5896 GET_RE_DEBUG_FLAGS_DECL;
5901 /* Find last node. */
5905 regnode * const temp = regnext(scan);
5911 if( exact == PSEUDO )
5913 else if ( exact != OP(scan) )
5922 SV * const mysv=sv_newmortal();
5923 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
5924 regprop(RExC_rx, mysv, scan);
5925 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
5926 SvPV_nolen_const(mysv),
5928 REG_NODE_NUM(scan));
5935 if (reg_off_by_arg[OP(scan)]) {
5936 ARG_SET(scan, val - scan);
5939 NEXT_OFF(scan) = val - scan;
5946 - regcurly - a little FSA that accepts {\d+,?\d*}
5949 S_regcurly(register const char *s)
5968 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
5971 Perl_regdump(pTHX_ const regexp *r)
5975 SV * const sv = sv_newmortal();
5977 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
5979 /* Header fields of interest. */
5980 if (r->anchored_substr)
5981 PerlIO_printf(Perl_debug_log,
5982 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
5984 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
5985 SvPVX_const(r->anchored_substr),
5987 SvTAIL(r->anchored_substr) ? "$" : "",
5988 (IV)r->anchored_offset);
5989 else if (r->anchored_utf8)
5990 PerlIO_printf(Perl_debug_log,
5991 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
5993 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5994 SvPVX_const(r->anchored_utf8),
5996 SvTAIL(r->anchored_utf8) ? "$" : "",
5997 (IV)r->anchored_offset);
5998 if (r->float_substr)
5999 PerlIO_printf(Perl_debug_log,
6000 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6002 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
6003 SvPVX_const(r->float_substr),
6005 SvTAIL(r->float_substr) ? "$" : "",
6006 (IV)r->float_min_offset, (UV)r->float_max_offset);
6007 else if (r->float_utf8)
6008 PerlIO_printf(Perl_debug_log,
6009 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
6011 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
6012 SvPVX_const(r->float_utf8),
6014 SvTAIL(r->float_utf8) ? "$" : "",
6015 (IV)r->float_min_offset, (UV)r->float_max_offset);
6016 if (r->check_substr || r->check_utf8)
6017 PerlIO_printf(Perl_debug_log,
6018 r->check_substr == r->float_substr
6019 && r->check_utf8 == r->float_utf8
6020 ? "(checking floating" : "(checking anchored");
6021 if (r->reganch & ROPT_NOSCAN)
6022 PerlIO_printf(Perl_debug_log, " noscan");
6023 if (r->reganch & ROPT_CHECK_ALL)
6024 PerlIO_printf(Perl_debug_log, " isall");
6025 if (r->check_substr || r->check_utf8)
6026 PerlIO_printf(Perl_debug_log, ") ");
6028 if (r->regstclass) {
6029 regprop(r, sv, r->regstclass);
6030 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6032 if (r->reganch & ROPT_ANCH) {
6033 PerlIO_printf(Perl_debug_log, "anchored");
6034 if (r->reganch & ROPT_ANCH_BOL)
6035 PerlIO_printf(Perl_debug_log, "(BOL)");
6036 if (r->reganch & ROPT_ANCH_MBOL)
6037 PerlIO_printf(Perl_debug_log, "(MBOL)");
6038 if (r->reganch & ROPT_ANCH_SBOL)
6039 PerlIO_printf(Perl_debug_log, "(SBOL)");
6040 if (r->reganch & ROPT_ANCH_GPOS)
6041 PerlIO_printf(Perl_debug_log, "(GPOS)");
6042 PerlIO_putc(Perl_debug_log, ' ');
6044 if (r->reganch & ROPT_GPOS_SEEN)
6045 PerlIO_printf(Perl_debug_log, "GPOS ");
6046 if (r->reganch & ROPT_SKIP)
6047 PerlIO_printf(Perl_debug_log, "plus ");
6048 if (r->reganch & ROPT_IMPLICIT)
6049 PerlIO_printf(Perl_debug_log, "implicit ");
6050 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6051 if (r->reganch & ROPT_EVAL_SEEN)
6052 PerlIO_printf(Perl_debug_log, "with eval ");
6053 PerlIO_printf(Perl_debug_log, "\n");
6055 const U32 len = r->offsets[0];
6056 GET_RE_DEBUG_FLAGS_DECL;
6059 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
6060 for (i = 1; i <= len; i++)
6061 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
6062 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
6063 PerlIO_printf(Perl_debug_log, "\n");
6067 PERL_UNUSED_CONTEXT;
6069 #endif /* DEBUGGING */
6073 - regprop - printable representation of opcode
6076 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6082 sv_setpvn(sv, "", 0);
6083 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6084 /* It would be nice to FAIL() here, but this may be called from
6085 regexec.c, and it would be hard to supply pRExC_state. */
6086 Perl_croak(aTHX_ "Corrupted regexp opcode");
6087 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6089 k = PL_regkind[OP(o)];
6092 SV * const dsv = sv_2mortal(newSVpvs(""));
6093 /* Using is_utf8_string() is a crude hack but it may
6094 * be the best for now since we have no flag "this EXACTish
6095 * node was UTF-8" --jhi */
6096 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
6097 const char * const s = do_utf8 ?
6098 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6099 UNI_DISPLAY_REGEX) :
6101 const int len = do_utf8 ?
6104 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6108 } else if (k == TRIE) {
6109 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6110 /* print the details of the trie in dumpuntil instead, as
6111 * prog->data isn't available here */
6112 } else if (k == CURLY) {
6113 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6114 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6115 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6117 else if (k == WHILEM && o->flags) /* Ordinal/of */
6118 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6119 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6120 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6121 else if (k == LOGICAL)
6122 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6123 else if (k == ANYOF) {
6124 int i, rangestart = -1;
6125 const U8 flags = ANYOF_FLAGS(o);
6127 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6128 static const char * const anyofs[] = {
6161 if (flags & ANYOF_LOCALE)
6162 sv_catpvs(sv, "{loc}");
6163 if (flags & ANYOF_FOLD)
6164 sv_catpvs(sv, "{i}");
6165 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6166 if (flags & ANYOF_INVERT)
6168 for (i = 0; i <= 256; i++) {
6169 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6170 if (rangestart == -1)
6172 } else if (rangestart != -1) {
6173 if (i <= rangestart + 3)
6174 for (; rangestart < i; rangestart++)
6175 put_byte(sv, rangestart);
6177 put_byte(sv, rangestart);
6179 put_byte(sv, i - 1);
6185 if (o->flags & ANYOF_CLASS)
6186 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6187 if (ANYOF_CLASS_TEST(o,i))
6188 sv_catpv(sv, anyofs[i]);
6190 if (flags & ANYOF_UNICODE)
6191 sv_catpvs(sv, "{unicode}");
6192 else if (flags & ANYOF_UNICODE_ALL)
6193 sv_catpvs(sv, "{unicode_all}");
6197 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6201 U8 s[UTF8_MAXBYTES_CASE+1];
6203 for (i = 0; i <= 256; i++) { /* just the first 256 */
6204 uvchr_to_utf8(s, i);
6206 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6207 if (rangestart == -1)
6209 } else if (rangestart != -1) {
6210 if (i <= rangestart + 3)
6211 for (; rangestart < i; rangestart++) {
6212 const U8 * const e = uvchr_to_utf8(s,rangestart);
6214 for(p = s; p < e; p++)
6218 const U8 *e = uvchr_to_utf8(s,rangestart);
6220 for (p = s; p < e; p++)
6223 e = uvchr_to_utf8(s, i-1);
6224 for (p = s; p < e; p++)
6231 sv_catpvs(sv, "..."); /* et cetera */
6235 char *s = savesvpv(lv);
6236 char * const origs = s;
6238 while (*s && *s != '\n')
6242 const char * const t = ++s;
6260 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6262 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6263 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
6265 PERL_UNUSED_CONTEXT;
6266 PERL_UNUSED_ARG(sv);
6268 #endif /* DEBUGGING */
6272 Perl_re_intuit_string(pTHX_ regexp *prog)
6273 { /* Assume that RE_INTUIT is set */
6275 GET_RE_DEBUG_FLAGS_DECL;
6276 PERL_UNUSED_CONTEXT;
6280 const char * const s = SvPV_nolen_const(prog->check_substr
6281 ? prog->check_substr : prog->check_utf8);
6283 if (!PL_colorset) reginitcolors();
6284 PerlIO_printf(Perl_debug_log,
6285 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6287 prog->check_substr ? "" : "utf8 ",
6288 PL_colors[5],PL_colors[0],
6291 (strlen(s) > 60 ? "..." : ""));
6294 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6298 Perl_pregfree(pTHX_ struct regexp *r)
6302 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6304 GET_RE_DEBUG_FLAGS_DECL;
6306 if (!r || (--r->refcnt > 0))
6308 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
6309 const char * const s = (r->reganch & ROPT_UTF8)
6310 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
6311 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
6312 const int len = SvCUR(dsv);
6315 PerlIO_printf(Perl_debug_log,
6316 "%sFreeing REx:%s %s%*.*s%s%s\n",
6317 PL_colors[4],PL_colors[5],PL_colors[0],
6320 len > 60 ? "..." : "");
6323 /* gcov results gave these as non-null 100% of the time, so there's no
6324 optimisation in checking them before calling Safefree */
6325 Safefree(r->precomp);
6326 Safefree(r->offsets); /* 20010421 MJD */
6327 RX_MATCH_COPY_FREE(r);
6328 #ifdef PERL_OLD_COPY_ON_WRITE
6330 SvREFCNT_dec(r->saved_copy);
6333 if (r->anchored_substr)
6334 SvREFCNT_dec(r->anchored_substr);
6335 if (r->anchored_utf8)
6336 SvREFCNT_dec(r->anchored_utf8);
6337 if (r->float_substr)
6338 SvREFCNT_dec(r->float_substr);
6340 SvREFCNT_dec(r->float_utf8);
6341 Safefree(r->substrs);
6344 int n = r->data->count;
6345 PAD* new_comppad = NULL;
6350 /* If you add a ->what type here, update the comment in regcomp.h */
6351 switch (r->data->what[n]) {
6353 SvREFCNT_dec((SV*)r->data->data[n]);
6356 Safefree(r->data->data[n]);
6359 new_comppad = (AV*)r->data->data[n];
6362 if (new_comppad == NULL)
6363 Perl_croak(aTHX_ "panic: pregfree comppad");
6364 PAD_SAVE_LOCAL(old_comppad,
6365 /* Watch out for global destruction's random ordering. */
6366 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6369 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6372 op_free((OP_4tree*)r->data->data[n]);
6374 PAD_RESTORE_LOCAL(old_comppad);
6375 SvREFCNT_dec((SV*)new_comppad);
6382 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
6385 refcount = --trie->refcount;
6388 Safefree(trie->charmap);
6389 if (trie->widecharmap)
6390 SvREFCNT_dec((SV*)trie->widecharmap);
6391 Safefree(trie->states);
6392 Safefree(trie->trans);
6394 Safefree(trie->bitmap);
6397 SvREFCNT_dec((SV*)trie->words);
6398 if (trie->revcharmap)
6399 SvREFCNT_dec((SV*)trie->revcharmap);
6401 Safefree(r->data->data[n]); /* do this last!!!! */
6406 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
6409 Safefree(r->data->what);
6412 Safefree(r->startp);
6417 #ifndef PERL_IN_XSUB_RE
6419 - regnext - dig the "next" pointer out of a node
6422 Perl_regnext(pTHX_ register regnode *p)
6425 register I32 offset;
6427 if (p == &PL_regdummy)
6430 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6439 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
6442 STRLEN l1 = strlen(pat1);
6443 STRLEN l2 = strlen(pat2);
6446 const char *message;
6452 Copy(pat1, buf, l1 , char);
6453 Copy(pat2, buf + l1, l2 , char);
6454 buf[l1 + l2] = '\n';
6455 buf[l1 + l2 + 1] = '\0';
6457 /* ANSI variant takes additional second argument */
6458 va_start(args, pat2);
6462 msv = vmess(buf, &args);
6464 message = SvPV_const(msv,l1);
6467 Copy(message, buf, l1 , char);
6468 buf[l1-1] = '\0'; /* Overwrite \n */
6469 Perl_croak(aTHX_ "%s", buf);
6472 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6474 #ifndef PERL_IN_XSUB_RE
6476 Perl_save_re_context(pTHX)
6480 struct re_save_state *state;
6482 SAVEVPTR(PL_curcop);
6483 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6485 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6486 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6487 SSPUSHINT(SAVEt_RE_STATE);
6489 Copy(&PL_reg_state, state, 1, struct re_save_state);
6491 PL_reg_start_tmp = 0;
6492 PL_reg_start_tmpl = 0;
6493 PL_reg_oldsaved = NULL;
6494 PL_reg_oldsavedlen = 0;
6496 PL_reg_leftiter = 0;
6497 PL_reg_poscache = NULL;
6498 PL_reg_poscache_size = 0;
6499 #ifdef PERL_OLD_COPY_ON_WRITE
6503 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6505 const REGEXP * const rx = PM_GETRE(PL_curpm);
6508 for (i = 1; i <= rx->nparens; i++) {
6509 char digits[TYPE_CHARS(long)];
6510 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
6511 GV *const *const gvp
6512 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6515 GV * const gv = *gvp;
6516 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6526 clear_re(pTHX_ void *r)
6529 ReREFCNT_dec((regexp *)r);
6535 S_put_byte(pTHX_ SV *sv, int c)
6537 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6538 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6539 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6540 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6542 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6546 #define CLEAR_OPTSTART \
6547 if (optstart) STMT_START { \
6548 PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart); \
6552 #define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6554 STATIC const regnode *
6555 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6556 const regnode *last, SV* sv, I32 l)
6559 register U8 op = EXACT; /* Arbitrary non-END op. */
6560 register const regnode *next;
6561 const regnode *optstart= NULL;
6562 GET_RE_DEBUG_FLAGS_DECL;
6564 while (op != END && (!last || node < last)) {
6565 /* While that wasn't END last time... */
6571 next = regnext((regnode *)node);
6573 if (OP(node) == OPTIMIZED) {
6574 if (!optstart && (SvIV(re_debug_flags) & RE_DEBUG_OPTIMISE))
6580 regprop(r, sv, node);
6581 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6582 (int)(2*l + 1), "", SvPVX_const(sv));
6584 if (OP(node) != OPTIMIZED) {
6585 if (next == NULL) /* Next ptr. */
6586 PerlIO_printf(Perl_debug_log, "(0)");
6588 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6589 (void)PerlIO_putc(Perl_debug_log, '\n');
6593 if (PL_regkind[(U8)op] == BRANCHJ) {
6594 register const regnode *nnode = (OP(next) == LONGJMP
6595 ? regnext((regnode *)next)
6597 if (last && nnode > last)
6599 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6601 else if (PL_regkind[(U8)op] == BRANCH) {
6602 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
6604 else if ( PL_regkind[(U8)op] == TRIE ) {
6605 const I32 n = ARG(node);
6606 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
6607 const I32 arry_len = av_len(trie->words)+1;
6609 PerlIO_printf(Perl_debug_log,
6610 "%*s[Start:%"UVuf" Words:%d Chars:%d Unique:%d States:%"IVdf" Minlen:%d Maxlen:%d",
6614 TRIE_WORDCOUNT(trie),
6615 (int)TRIE_CHARCOUNT(trie),
6616 trie->uniquecharcount,
6617 (IV)TRIE_LASTSTATE(trie)-1,
6618 trie->minlen, trie->maxlen
6623 sv_setpvn(sv, "", 0);
6624 for (i = 0; i <= 256; i++) {
6625 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6626 if (rangestart == -1)
6628 } else if (rangestart != -1) {
6629 if (i <= rangestart + 3)
6630 for (; rangestart < i; rangestart++)
6631 put_byte(sv, rangestart);
6633 put_byte(sv, rangestart);
6635 put_byte(sv, i - 1);
6640 PerlIO_printf(Perl_debug_log, " Start-Class:%s]\n", SvPVX_const(sv));
6642 PerlIO_printf(Perl_debug_log, " No Start-Class]\n");
6644 for (word_idx=0; word_idx < arry_len; word_idx++) {
6645 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
6647 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6650 SvPV_nolen_const(*elem_ptr),
6655 PerlIO_printf(Perl_debug_log, "(0)\n");
6657 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6663 node = NEXTOPER(node);
6664 node += regarglen[(U8)op];
6667 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6668 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6669 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6671 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6672 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6675 else if ( op == PLUS || op == STAR) {
6676 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6678 else if (op == ANYOF) {
6679 /* arglen 1 + class block */
6680 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6681 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6682 node = NEXTOPER(node);
6684 else if (PL_regkind[(U8)op] == EXACT) {
6685 /* Literal string, where present. */
6686 node += NODE_SZ_STR(node) - 1;
6687 node = NEXTOPER(node);
6690 node = NEXTOPER(node);
6691 node += regarglen[(U8)op];
6693 if (op == CURLYX || op == OPEN)
6695 else if (op == WHILEM)
6702 #endif /* DEBUGGING */
6706 * c-indentation-style: bsd
6708 * indent-tabs-mode: t
6711 * ex: set ts=8 sts=4 sw=4 noet: