5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
174 #define FULL_TRIE_STUDY
177 /* Length of a variant. */
179 typedef struct scan_data_t {
185 I32 last_end; /* min value, <0 unless valid. */
188 SV **longest; /* Either &l_fixed, or &l_float. */
192 I32 offset_float_min;
193 I32 offset_float_max;
197 struct regnode_charclass_class *start_class;
201 * Forward declarations for pregcomp()'s friends.
204 static const scan_data_t zero_scan_data =
205 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
207 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
208 #define SF_BEFORE_SEOL 0x0001
209 #define SF_BEFORE_MEOL 0x0002
210 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
211 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
214 # define SF_FIX_SHIFT_EOL (0+2)
215 # define SF_FL_SHIFT_EOL (0+4)
217 # define SF_FIX_SHIFT_EOL (+2)
218 # define SF_FL_SHIFT_EOL (+4)
221 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
222 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
224 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
225 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
226 #define SF_IS_INF 0x0040
227 #define SF_HAS_PAR 0x0080
228 #define SF_IN_PAR 0x0100
229 #define SF_HAS_EVAL 0x0200
230 #define SCF_DO_SUBSTR 0x0400
231 #define SCF_DO_STCLASS_AND 0x0800
232 #define SCF_DO_STCLASS_OR 0x1000
233 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
234 #define SCF_WHILEM_VISITED_POS 0x2000
236 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
239 #define UTF (RExC_utf8 != 0)
240 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
241 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
243 #define OOB_UNICODE 12345678
244 #define OOB_NAMEDCLASS -1
246 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
247 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250 /* length of regex to show in messages that don't mark a position within */
251 #define RegexLengthToShowInErrorMessages 127
254 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
255 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
256 * op/pragma/warn/regcomp.
258 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
259 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
261 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
264 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
265 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 #define FAIL(msg) STMT_START { \
269 const char *ellipses = ""; \
270 IV len = RExC_end - RExC_precomp; \
273 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
274 if (len > RegexLengthToShowInErrorMessages) { \
275 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
276 len = RegexLengthToShowInErrorMessages - 10; \
279 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
280 msg, (int)len, RExC_precomp, ellipses); \
284 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
286 #define Simple_vFAIL(m) STMT_START { \
287 const IV offset = RExC_parse - RExC_precomp; \
288 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
289 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
293 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
295 #define vFAIL(m) STMT_START { \
297 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
302 * Like Simple_vFAIL(), but accepts two arguments.
304 #define Simple_vFAIL2(m,a1) STMT_START { \
305 const IV offset = RExC_parse - RExC_precomp; \
306 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
307 (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
313 #define vFAIL2(m,a1) STMT_START { \
315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
316 Simple_vFAIL2(m, a1); \
321 * Like Simple_vFAIL(), but accepts three arguments.
323 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
324 const IV offset = RExC_parse - RExC_precomp; \
325 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
326 (int)offset, RExC_precomp, RExC_precomp + offset); \
330 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
332 #define vFAIL3(m,a1,a2) STMT_START { \
334 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
335 Simple_vFAIL3(m, a1, a2); \
339 * Like Simple_vFAIL(), but accepts four arguments.
341 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
342 const IV offset = RExC_parse - RExC_precomp; \
343 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
344 (int)offset, RExC_precomp, RExC_precomp + offset); \
347 #define vWARN(loc,m) STMT_START { \
348 const IV offset = loc - RExC_precomp; \
349 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
350 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
353 #define vWARNdep(loc,m) STMT_START { \
354 const IV offset = loc - RExC_precomp; \
355 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
356 "%s" REPORT_LOCATION, \
357 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
361 #define vWARN2(loc, m, a1) STMT_START { \
362 const IV offset = loc - RExC_precomp; \
363 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
364 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
367 #define vWARN3(loc, m, a1, a2) STMT_START { \
368 const IV offset = loc - RExC_precomp; \
369 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
370 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
373 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
374 const IV offset = loc - RExC_precomp; \
375 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
376 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
379 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
380 const IV offset = loc - RExC_precomp; \
381 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
382 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 /* Allow for side effects in s */
387 #define REGC(c,s) STMT_START { \
388 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
391 /* Macros for recording node offsets. 20001227 mjd@plover.com
392 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
393 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
394 * Element 0 holds the number n.
395 * Position is 1 indexed.
398 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
400 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
401 __LINE__, (node), (int)(byte))); \
403 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
405 RExC_offsets[2*(node)-1] = (byte); \
410 #define Set_Node_Offset(node,byte) \
411 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
412 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
414 #define Set_Node_Length_To_R(node,len) STMT_START { \
416 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
417 __LINE__, (int)(node), (int)(len))); \
419 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
421 RExC_offsets[2*(node)] = (len); \
426 #define Set_Node_Length(node,len) \
427 Set_Node_Length_To_R((node)-RExC_emit_start, len)
428 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
429 #define Set_Node_Cur_Length(node) \
430 Set_Node_Length(node, RExC_parse - parse_start)
432 /* Get offsets and lengths */
433 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
434 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
436 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
438 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
442 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
443 #define EXPERIMENTAL_INPLACESCAN
446 static void clear_re(pTHX_ void *r);
448 /* Mark that we cannot extend a found fixed substring at this point.
449 Update the longest found anchored substring and the longest found
450 floating substrings if needed. */
453 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
455 const STRLEN l = CHR_SVLEN(data->last_found);
456 const STRLEN old_l = CHR_SVLEN(*data->longest);
458 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
459 SvSetMagicSV(*data->longest, data->last_found);
460 if (*data->longest == data->longest_fixed) {
461 data->offset_fixed = l ? data->last_start_min : data->pos_min;
462 if (data->flags & SF_BEFORE_EOL)
464 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
466 data->flags &= ~SF_FIX_BEFORE_EOL;
469 data->offset_float_min = l ? data->last_start_min : data->pos_min;
470 data->offset_float_max = (l
471 ? data->last_start_max
472 : data->pos_min + data->pos_delta);
473 if ((U32)data->offset_float_max > (U32)I32_MAX)
474 data->offset_float_max = I32_MAX;
475 if (data->flags & SF_BEFORE_EOL)
477 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
479 data->flags &= ~SF_FL_BEFORE_EOL;
482 SvCUR_set(data->last_found, 0);
484 SV * const sv = data->last_found;
485 if (SvUTF8(sv) && SvMAGICAL(sv)) {
486 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
492 data->flags &= ~SF_BEFORE_EOL;
495 /* Can match anything (initialization) */
497 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
499 ANYOF_CLASS_ZERO(cl);
500 ANYOF_BITMAP_SETALL(cl);
501 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
503 cl->flags |= ANYOF_LOCALE;
506 /* Can match anything (initialization) */
508 S_cl_is_anything(const struct regnode_charclass_class *cl)
512 for (value = 0; value <= ANYOF_MAX; value += 2)
513 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
515 if (!(cl->flags & ANYOF_UNICODE_ALL))
517 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
522 /* Can match anything (initialization) */
524 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
526 Zero(cl, 1, struct regnode_charclass_class);
528 cl_anything(pRExC_state, cl);
532 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
534 Zero(cl, 1, struct regnode_charclass_class);
536 cl_anything(pRExC_state, cl);
538 cl->flags |= ANYOF_LOCALE;
541 /* 'And' a given class with another one. Can create false positives */
542 /* We assume that cl is not inverted */
544 S_cl_and(struct regnode_charclass_class *cl,
545 const struct regnode_charclass_class *and_with)
547 if (!(and_with->flags & ANYOF_CLASS)
548 && !(cl->flags & ANYOF_CLASS)
549 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
550 && !(and_with->flags & ANYOF_FOLD)
551 && !(cl->flags & ANYOF_FOLD)) {
554 if (and_with->flags & ANYOF_INVERT)
555 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
556 cl->bitmap[i] &= ~and_with->bitmap[i];
558 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
559 cl->bitmap[i] &= and_with->bitmap[i];
560 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
561 if (!(and_with->flags & ANYOF_EOS))
562 cl->flags &= ~ANYOF_EOS;
564 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
565 !(and_with->flags & ANYOF_INVERT)) {
566 cl->flags &= ~ANYOF_UNICODE_ALL;
567 cl->flags |= ANYOF_UNICODE;
568 ARG_SET(cl, ARG(and_with));
570 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
571 !(and_with->flags & ANYOF_INVERT))
572 cl->flags &= ~ANYOF_UNICODE_ALL;
573 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
574 !(and_with->flags & ANYOF_INVERT))
575 cl->flags &= ~ANYOF_UNICODE;
578 /* 'OR' a given class with another one. Can create false positives */
579 /* We assume that cl is not inverted */
581 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
583 if (or_with->flags & ANYOF_INVERT) {
585 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
586 * <= (B1 | !B2) | (CL1 | !CL2)
587 * which is wasteful if CL2 is small, but we ignore CL2:
588 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
589 * XXXX Can we handle case-fold? Unclear:
590 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
591 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
593 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
594 && !(or_with->flags & ANYOF_FOLD)
595 && !(cl->flags & ANYOF_FOLD) ) {
598 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
599 cl->bitmap[i] |= ~or_with->bitmap[i];
600 } /* XXXX: logic is complicated otherwise */
602 cl_anything(pRExC_state, cl);
605 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
606 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
607 && (!(or_with->flags & ANYOF_FOLD)
608 || (cl->flags & ANYOF_FOLD)) ) {
611 /* OR char bitmap and class bitmap separately */
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= or_with->bitmap[i];
614 if (or_with->flags & ANYOF_CLASS) {
615 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
616 cl->classflags[i] |= or_with->classflags[i];
617 cl->flags |= ANYOF_CLASS;
620 else { /* XXXX: logic is complicated, leave it along for a moment. */
621 cl_anything(pRExC_state, cl);
624 if (or_with->flags & ANYOF_EOS)
625 cl->flags |= ANYOF_EOS;
627 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
628 ARG(cl) != ARG(or_with)) {
629 cl->flags |= ANYOF_UNICODE_ALL;
630 cl->flags &= ~ANYOF_UNICODE;
632 if (or_with->flags & ANYOF_UNICODE_ALL) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
638 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
639 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
640 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
641 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
647 dump_trie_interim_list(trie,next_alloc)
648 dump_trie_interim_table(trie,next_alloc)
650 These routines dump out a trie in a somewhat readable format.
651 The _interim_ variants are used for debugging the interim
652 tables that are used to generate the final compressed
653 representation which is what dump_trie expects.
655 Part of the reason for their existance is to provide a form
656 of documentation as to how the different representations function.
662 Dumps the final compressed table form of the trie to Perl_debug_log.
663 Used for debugging make_trie().
667 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
670 SV *sv=sv_newmortal();
671 int colwidth= trie->widecharmap ? 6 : 4;
672 GET_RE_DEBUG_FLAGS_DECL;
675 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
676 (int)depth * 2 + 2,"",
677 "Match","Base","Ofs" );
679 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
680 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
682 PerlIO_printf( Perl_debug_log, "%*s",
684 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
685 PL_colors[0], PL_colors[1],
686 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
687 PERL_PV_ESCAPE_FIRSTCHAR
692 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
693 (int)depth * 2 + 2,"");
695 for( state = 0 ; state < trie->uniquecharcount ; state++ )
696 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
697 PerlIO_printf( Perl_debug_log, "\n");
699 for( state = 1 ; state < trie->laststate ; state++ ) {
700 const U32 base = trie->states[ state ].trans.base;
702 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
704 if ( trie->states[ state ].wordnum ) {
705 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
707 PerlIO_printf( Perl_debug_log, "%6s", "" );
710 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
715 while( ( base + ofs < trie->uniquecharcount ) ||
716 ( base + ofs - trie->uniquecharcount < trie->lasttrans
717 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
720 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
722 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
723 if ( ( base + ofs >= trie->uniquecharcount ) &&
724 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
725 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
727 PerlIO_printf( Perl_debug_log, "%*"UVXf,
729 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
731 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
735 PerlIO_printf( Perl_debug_log, "]");
738 PerlIO_printf( Perl_debug_log, "\n" );
742 dump_trie_interim_list(trie,next_alloc)
743 Dumps a fully constructed but uncompressed trie in list form.
744 List tries normally only are used for construction when the number of
745 possible chars (trie->uniquecharcount) is very high.
746 Used for debugging make_trie().
749 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
752 SV *sv=sv_newmortal();
753 int colwidth= trie->widecharmap ? 6 : 4;
754 GET_RE_DEBUG_FLAGS_DECL;
755 /* print out the table precompression. */
756 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
757 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
758 "------:-----+-----------------\n" );
760 for( state=1 ; state < next_alloc ; state ++ ) {
763 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
764 (int)depth * 2 + 2,"", (UV)state );
765 if ( ! trie->states[ state ].wordnum ) {
766 PerlIO_printf( Perl_debug_log, "%5s| ","");
768 PerlIO_printf( Perl_debug_log, "W%4x| ",
769 trie->states[ state ].wordnum
772 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
773 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
775 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
777 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
778 PL_colors[0], PL_colors[1],
779 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
780 PERL_PV_ESCAPE_FIRSTCHAR
782 TRIE_LIST_ITEM(state,charid).forid,
783 (UV)TRIE_LIST_ITEM(state,charid).newstate
787 PerlIO_printf( Perl_debug_log, "\n");
792 dump_trie_interim_table(trie,next_alloc)
793 Dumps a fully constructed but uncompressed trie in table form.
794 This is the normal DFA style state transition table, with a few
795 twists to facilitate compression later.
796 Used for debugging make_trie().
799 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
803 SV *sv=sv_newmortal();
804 int colwidth= trie->widecharmap ? 6 : 4;
805 GET_RE_DEBUG_FLAGS_DECL;
808 print out the table precompression so that we can do a visual check
809 that they are identical.
812 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
814 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
815 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
817 PerlIO_printf( Perl_debug_log, "%*s",
819 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
820 PL_colors[0], PL_colors[1],
821 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
822 PERL_PV_ESCAPE_FIRSTCHAR
828 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
830 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
831 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
834 PerlIO_printf( Perl_debug_log, "\n" );
836 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
838 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
839 (int)depth * 2 + 2,"",
840 (UV)TRIE_NODENUM( state ) );
842 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
843 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
845 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
847 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
849 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
850 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
852 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
853 trie->states[ TRIE_NODENUM( state ) ].wordnum );
860 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
861 startbranch: the first branch in the whole branch sequence
862 first : start branch of sequence of branch-exact nodes.
863 May be the same as startbranch
864 last : Thing following the last branch.
865 May be the same as tail.
866 tail : item following the branch sequence
867 count : words in the sequence
868 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
871 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
873 A trie is an N'ary tree where the branches are determined by digital
874 decomposition of the key. IE, at the root node you look up the 1st character and
875 follow that branch repeat until you find the end of the branches. Nodes can be
876 marked as "accepting" meaning they represent a complete word. Eg:
880 would convert into the following structure. Numbers represent states, letters
881 following numbers represent valid transitions on the letter from that state, if
882 the number is in square brackets it represents an accepting state, otherwise it
883 will be in parenthesis.
885 +-h->+-e->[3]-+-r->(8)-+-s->[9]
889 (1) +-i->(6)-+-s->[7]
891 +-s->(3)-+-h->(4)-+-e->[5]
893 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
895 This shows that when matching against the string 'hers' we will begin at state 1
896 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
897 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
898 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
899 single traverse. We store a mapping from accepting to state to which word was
900 matched, and then when we have multiple possibilities we try to complete the
901 rest of the regex in the order in which they occured in the alternation.
903 The only prior NFA like behaviour that would be changed by the TRIE support is
904 the silent ignoring of duplicate alternations which are of the form:
906 / (DUPE|DUPE) X? (?{ ... }) Y /x
908 Thus EVAL blocks follwing a trie may be called a different number of times with
909 and without the optimisation. With the optimisations dupes will be silently
910 ignored. This inconsistant behaviour of EVAL type nodes is well established as
911 the following demonstrates:
913 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
915 which prints out 'word' three times, but
917 'words'=~/(word|word|word)(?{ print $1 })S/
919 which doesnt print it out at all. This is due to other optimisations kicking in.
921 Example of what happens on a structural level:
923 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
925 1: CURLYM[1] {1,32767}(18)
936 This would be optimizable with startbranch=5, first=5, last=16, tail=16
937 and should turn into:
939 1: CURLYM[1] {1,32767}(18)
941 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
949 Cases where tail != last would be like /(?foo|bar)baz/:
959 which would be optimizable with startbranch=1, first=1, last=7, tail=8
960 and would end up looking like:
963 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
970 d = uvuni_to_utf8_flags(d, uv, 0);
972 is the recommended Unicode-aware way of saying
977 #define TRIE_STORE_REVCHAR \
979 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
980 if (UTF) SvUTF8_on(tmp); \
981 av_push( TRIE_REVCHARMAP(trie), tmp ); \
984 #define TRIE_READ_CHAR STMT_START { \
988 if ( foldlen > 0 ) { \
989 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
994 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
995 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
996 foldlen -= UNISKIP( uvc ); \
997 scan = foldbuf + UNISKIP( uvc ); \
1000 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1010 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1011 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1012 TRIE_LIST_LEN( state ) *= 2; \
1013 Renew( trie->states[ state ].trans.list, \
1014 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1016 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1017 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1018 TRIE_LIST_CUR( state )++; \
1021 #define TRIE_LIST_NEW(state) STMT_START { \
1022 Newxz( trie->states[ state ].trans.list, \
1023 4, reg_trie_trans_le ); \
1024 TRIE_LIST_CUR( state ) = 1; \
1025 TRIE_LIST_LEN( state ) = 4; \
1028 #define TRIE_HANDLE_WORD(state) STMT_START { \
1029 U16 dupe= trie->states[ state ].wordnum; \
1030 regnode * const noper_next = regnext( noper ); \
1032 if (trie->wordlen) \
1033 trie->wordlen[ curword ] = wordlen; \
1035 /* store the word for dumping */ \
1037 if (OP(noper) != NOTHING) \
1038 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1040 tmp = newSVpvn( "", 0 ); \
1041 if ( UTF ) SvUTF8_on( tmp ); \
1042 av_push( trie->words, tmp ); \
1047 if ( noper_next < tail ) { \
1049 Newxz( trie->jump, word_count + 1, U16); \
1050 trie->jump[curword] = (U16)(tail - noper_next); \
1052 jumper = noper_next; \
1054 nextbranch= regnext(cur); \
1058 /* So it's a dupe. This means we need to maintain a */\
1059 /* linked-list from the first to the next. */\
1060 /* we only allocate the nextword buffer when there */\
1061 /* a dupe, so first time we have to do the allocation */\
1062 if (!trie->nextword) \
1063 Newxz( trie->nextword, word_count + 1, U16); \
1064 while ( trie->nextword[dupe] ) \
1065 dupe= trie->nextword[dupe]; \
1066 trie->nextword[dupe]= curword; \
1068 /* we haven't inserted this word yet. */ \
1069 trie->states[ state ].wordnum = curword; \
1074 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1075 ( ( base + charid >= ucharcount \
1076 && base + charid < ubound \
1077 && state == trie->trans[ base - ucharcount + charid ].check \
1078 && trie->trans[ base - ucharcount + charid ].next ) \
1079 ? trie->trans[ base - ucharcount + charid ].next \
1080 : ( state==1 ? special : 0 ) \
1084 #define MADE_JUMP_TRIE 2
1085 #define MADE_EXACT_TRIE 4
1088 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1091 /* first pass, loop through and scan words */
1092 reg_trie_data *trie;
1094 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1099 regnode *jumper = NULL;
1100 regnode *nextbranch = NULL;
1101 /* we just use folder as a flag in utf8 */
1102 const U8 * const folder = ( flags == EXACTF
1104 : ( flags == EXACTFL
1110 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1111 SV *re_trie_maxbuff;
1113 /* these are only used during construction but are useful during
1114 * debugging so we store them in the struct when debugging.
1116 STRLEN trie_charcount=0;
1117 AV *trie_revcharmap;
1119 GET_RE_DEBUG_FLAGS_DECL;
1121 PERL_UNUSED_ARG(depth);
1124 Newxz( trie, 1, reg_trie_data );
1126 trie->startstate = 1;
1127 trie->wordcount = word_count;
1128 RExC_rx->data->data[ data_slot ] = (void*)trie;
1129 Newxz( trie->charmap, 256, U16 );
1130 if (!(UTF && folder))
1131 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1133 trie->words = newAV();
1135 TRIE_REVCHARMAP(trie) = newAV();
1137 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1138 if (!SvIOK(re_trie_maxbuff)) {
1139 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1142 PerlIO_printf( Perl_debug_log,
1143 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1144 (int)depth * 2 + 2, "",
1145 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1146 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1149 /* -- First loop and Setup --
1151 We first traverse the branches and scan each word to determine if it
1152 contains widechars, and how many unique chars there are, this is
1153 important as we have to build a table with at least as many columns as we
1156 We use an array of integers to represent the character codes 0..255
1157 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1158 native representation of the character value as the key and IV's for the
1161 *TODO* If we keep track of how many times each character is used we can
1162 remap the columns so that the table compression later on is more
1163 efficient in terms of memory by ensuring most common value is in the
1164 middle and the least common are on the outside. IMO this would be better
1165 than a most to least common mapping as theres a decent chance the most
1166 common letter will share a node with the least common, meaning the node
1167 will not be compressable. With a middle is most common approach the worst
1168 case is when we have the least common nodes twice.
1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1173 regnode * const noper = NEXTOPER( cur );
1174 const U8 *uc = (U8*)STRING( noper );
1175 const U8 * const e = uc + STR_LEN( noper );
1177 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1178 const U8 *scan = (U8*)NULL;
1179 U32 wordlen = 0; /* required init */
1182 if (OP(noper) == NOTHING) {
1187 TRIE_BITMAP_SET(trie,*uc);
1188 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1190 for ( ; uc < e ; uc += len ) {
1191 TRIE_CHARCOUNT(trie)++;
1195 if ( !trie->charmap[ uvc ] ) {
1196 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1198 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1203 if ( !trie->widecharmap )
1204 trie->widecharmap = newHV();
1206 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1209 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1211 if ( !SvTRUE( *svpp ) ) {
1212 sv_setiv( *svpp, ++trie->uniquecharcount );
1217 if( cur == first ) {
1220 } else if (chars < trie->minlen) {
1222 } else if (chars > trie->maxlen) {
1226 } /* end first pass */
1227 DEBUG_TRIE_COMPILE_r(
1228 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1229 (int)depth * 2 + 2,"",
1230 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1231 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1232 (int)trie->minlen, (int)trie->maxlen )
1234 Newxz( trie->wordlen, word_count, U32 );
1237 We now know what we are dealing with in terms of unique chars and
1238 string sizes so we can calculate how much memory a naive
1239 representation using a flat table will take. If it's over a reasonable
1240 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1241 conservative but potentially much slower representation using an array
1244 At the end we convert both representations into the same compressed
1245 form that will be used in regexec.c for matching with. The latter
1246 is a form that cannot be used to construct with but has memory
1247 properties similar to the list form and access properties similar
1248 to the table form making it both suitable for fast searches and
1249 small enough that its feasable to store for the duration of a program.
1251 See the comment in the code where the compressed table is produced
1252 inplace from the flat tabe representation for an explanation of how
1253 the compression works.
1258 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1260 Second Pass -- Array Of Lists Representation
1262 Each state will be represented by a list of charid:state records
1263 (reg_trie_trans_le) the first such element holds the CUR and LEN
1264 points of the allocated array. (See defines above).
1266 We build the initial structure using the lists, and then convert
1267 it into the compressed table form which allows faster lookups
1268 (but cant be modified once converted).
1271 STRLEN transcount = 1;
1273 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1277 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1279 regnode * const noper = NEXTOPER( cur );
1280 U8 *uc = (U8*)STRING( noper );
1281 const U8 * const e = uc + STR_LEN( noper );
1282 U32 state = 1; /* required init */
1283 U16 charid = 0; /* sanity init */
1284 U8 *scan = (U8*)NULL; /* sanity init */
1285 STRLEN foldlen = 0; /* required init */
1286 U32 wordlen = 0; /* required init */
1287 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1289 if (OP(noper) != NOTHING) {
1290 for ( ; uc < e ; uc += len ) {
1295 charid = trie->charmap[ uvc ];
1297 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1301 charid=(U16)SvIV( *svpp );
1304 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1311 if ( !trie->states[ state ].trans.list ) {
1312 TRIE_LIST_NEW( state );
1314 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1315 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1316 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1321 newstate = next_alloc++;
1322 TRIE_LIST_PUSH( state, charid, newstate );
1327 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1331 TRIE_HANDLE_WORD(state);
1333 } /* end second pass */
1335 trie->laststate = next_alloc;
1336 Renew( trie->states, next_alloc, reg_trie_state );
1338 /* and now dump it out before we compress it */
1339 DEBUG_TRIE_COMPILE_MORE_r(
1340 dump_trie_interim_list(trie,next_alloc,depth+1)
1343 Newxz( trie->trans, transcount ,reg_trie_trans );
1350 for( state=1 ; state < next_alloc ; state ++ ) {
1354 DEBUG_TRIE_COMPILE_MORE_r(
1355 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1359 if (trie->states[state].trans.list) {
1360 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1364 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1365 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1366 if ( forid < minid ) {
1368 } else if ( forid > maxid ) {
1372 if ( transcount < tp + maxid - minid + 1) {
1374 Renew( trie->trans, transcount, reg_trie_trans );
1375 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1377 base = trie->uniquecharcount + tp - minid;
1378 if ( maxid == minid ) {
1380 for ( ; zp < tp ; zp++ ) {
1381 if ( ! trie->trans[ zp ].next ) {
1382 base = trie->uniquecharcount + zp - minid;
1383 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1384 trie->trans[ zp ].check = state;
1390 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1391 trie->trans[ tp ].check = state;
1396 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1397 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1398 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1399 trie->trans[ tid ].check = state;
1401 tp += ( maxid - minid + 1 );
1403 Safefree(trie->states[ state ].trans.list);
1406 DEBUG_TRIE_COMPILE_MORE_r(
1407 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1410 trie->states[ state ].trans.base=base;
1412 trie->lasttrans = tp + 1;
1416 Second Pass -- Flat Table Representation.
1418 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1419 We know that we will need Charcount+1 trans at most to store the data
1420 (one row per char at worst case) So we preallocate both structures
1421 assuming worst case.
1423 We then construct the trie using only the .next slots of the entry
1426 We use the .check field of the first entry of the node temporarily to
1427 make compression both faster and easier by keeping track of how many non
1428 zero fields are in the node.
1430 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1433 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1434 number representing the first entry of the node, and state as a
1435 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1436 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1437 are 2 entrys per node. eg:
1445 The table is internally in the right hand, idx form. However as we also
1446 have to deal with the states array which is indexed by nodenum we have to
1447 use TRIE_NODENUM() to convert.
1452 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1454 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1455 next_alloc = trie->uniquecharcount + 1;
1458 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1460 regnode * const noper = NEXTOPER( cur );
1461 const U8 *uc = (U8*)STRING( noper );
1462 const U8 * const e = uc + STR_LEN( noper );
1464 U32 state = 1; /* required init */
1466 U16 charid = 0; /* sanity init */
1467 U32 accept_state = 0; /* sanity init */
1468 U8 *scan = (U8*)NULL; /* sanity init */
1470 STRLEN foldlen = 0; /* required init */
1471 U32 wordlen = 0; /* required init */
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1474 if ( OP(noper) != NOTHING ) {
1475 for ( ; uc < e ; uc += len ) {
1480 charid = trie->charmap[ uvc ];
1482 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1483 charid = svpp ? (U16)SvIV(*svpp) : 0;
1487 if ( !trie->trans[ state + charid ].next ) {
1488 trie->trans[ state + charid ].next = next_alloc;
1489 trie->trans[ state ].check++;
1490 next_alloc += trie->uniquecharcount;
1492 state = trie->trans[ state + charid ].next;
1494 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1496 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1499 accept_state = TRIE_NODENUM( state );
1500 TRIE_HANDLE_WORD(accept_state);
1502 } /* end second pass */
1504 /* and now dump it out before we compress it */
1505 DEBUG_TRIE_COMPILE_MORE_r(
1506 dump_trie_interim_table(trie,next_alloc,depth+1)
1511 * Inplace compress the table.*
1513 For sparse data sets the table constructed by the trie algorithm will
1514 be mostly 0/FAIL transitions or to put it another way mostly empty.
1515 (Note that leaf nodes will not contain any transitions.)
1517 This algorithm compresses the tables by eliminating most such
1518 transitions, at the cost of a modest bit of extra work during lookup:
1520 - Each states[] entry contains a .base field which indicates the
1521 index in the state[] array wheres its transition data is stored.
1523 - If .base is 0 there are no valid transitions from that node.
1525 - If .base is nonzero then charid is added to it to find an entry in
1528 -If trans[states[state].base+charid].check!=state then the
1529 transition is taken to be a 0/Fail transition. Thus if there are fail
1530 transitions at the front of the node then the .base offset will point
1531 somewhere inside the previous nodes data (or maybe even into a node
1532 even earlier), but the .check field determines if the transition is
1536 The following process inplace converts the table to the compressed
1537 table: We first do not compress the root node 1,and mark its all its
1538 .check pointers as 1 and set its .base pointer as 1 as well. This
1539 allows to do a DFA construction from the compressed table later, and
1540 ensures that any .base pointers we calculate later are greater than
1543 - We set 'pos' to indicate the first entry of the second node.
1545 - We then iterate over the columns of the node, finding the first and
1546 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1547 and set the .check pointers accordingly, and advance pos
1548 appropriately and repreat for the next node. Note that when we copy
1549 the next pointers we have to convert them from the original
1550 NODEIDX form to NODENUM form as the former is not valid post
1553 - If a node has no transitions used we mark its base as 0 and do not
1554 advance the pos pointer.
1556 - If a node only has one transition we use a second pointer into the
1557 structure to fill in allocated fail transitions from other states.
1558 This pointer is independent of the main pointer and scans forward
1559 looking for null transitions that are allocated to a state. When it
1560 finds one it writes the single transition into the "hole". If the
1561 pointer doesnt find one the single transition is appended as normal.
1563 - Once compressed we can Renew/realloc the structures to release the
1566 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1567 specifically Fig 3.47 and the associated pseudocode.
1571 const U32 laststate = TRIE_NODENUM( next_alloc );
1574 trie->laststate = laststate;
1576 for ( state = 1 ; state < laststate ; state++ ) {
1578 const U32 stateidx = TRIE_NODEIDX( state );
1579 const U32 o_used = trie->trans[ stateidx ].check;
1580 U32 used = trie->trans[ stateidx ].check;
1581 trie->trans[ stateidx ].check = 0;
1583 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1584 if ( flag || trie->trans[ stateidx + charid ].next ) {
1585 if ( trie->trans[ stateidx + charid ].next ) {
1587 for ( ; zp < pos ; zp++ ) {
1588 if ( ! trie->trans[ zp ].next ) {
1592 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1593 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1594 trie->trans[ zp ].check = state;
1595 if ( ++zp > pos ) pos = zp;
1602 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1604 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1605 trie->trans[ pos ].check = state;
1610 trie->lasttrans = pos + 1;
1611 Renew( trie->states, laststate + 1, reg_trie_state);
1612 DEBUG_TRIE_COMPILE_MORE_r(
1613 PerlIO_printf( Perl_debug_log,
1614 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1615 (int)depth * 2 + 2,"",
1616 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1619 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1622 } /* end table compress */
1624 /* resize the trans array to remove unused space */
1625 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1627 /* and now dump out the compressed format */
1628 DEBUG_TRIE_COMPILE_r(
1629 dump_trie(trie,depth+1)
1632 { /* Modify the program and insert the new TRIE node*/
1634 U8 nodetype =(U8)(flags & 0xFF);
1643 This means we convert either the first branch or the first Exact,
1644 depending on whether the thing following (in 'last') is a branch
1645 or not and whther first is the startbranch (ie is it a sub part of
1646 the alternation or is it the whole thing.)
1647 Assuming its a sub part we conver the EXACT otherwise we convert
1648 the whole branch sequence, including the first.
1650 /* Find the node we are going to overwrite */
1651 if ( first == startbranch && OP( last ) != BRANCH ) {
1652 /* whole branch chain */
1655 const regnode *nop = NEXTOPER( convert );
1656 mjd_offset= Node_Offset((nop));
1657 mjd_nodelen= Node_Length((nop));
1660 /* branch sub-chain */
1661 convert = NEXTOPER( first );
1662 NEXT_OFF( first ) = (U16)(last - first);
1664 mjd_offset= Node_Offset((convert));
1665 mjd_nodelen= Node_Length((convert));
1669 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1670 (int)depth * 2 + 2, "",
1671 (UV)mjd_offset, (UV)mjd_nodelen)
1674 /* But first we check to see if there is a common prefix we can
1675 split out as an EXACT and put in front of the TRIE node. */
1676 trie->startstate= 1;
1677 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1680 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1681 (int)depth * 2 + 2, "",
1682 (UV)trie->laststate)
1684 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
1688 const U32 base = trie->states[ state ].trans.base;
1690 if ( trie->states[state].wordnum )
1693 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1694 if ( ( base + ofs >= trie->uniquecharcount ) &&
1695 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1696 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1698 if ( ++count > 1 ) {
1699 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1700 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1701 if ( state == 1 ) break;
1703 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1705 PerlIO_printf(Perl_debug_log,
1706 "%*sNew Start State=%"UVuf" Class: [",
1707 (int)depth * 2 + 2, "",
1710 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1711 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1713 TRIE_BITMAP_SET(trie,*ch);
1715 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1717 PerlIO_printf(Perl_debug_log, (char*)ch)
1721 TRIE_BITMAP_SET(trie,*ch);
1723 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1724 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1730 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1731 const char *ch = SvPV_nolen_const( *tmp );
1733 PerlIO_printf( Perl_debug_log,
1734 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1735 (int)depth * 2 + 2, "",
1736 (UV)state, (UV)idx, ch)
1739 OP( convert ) = nodetype;
1740 str=STRING(convert);
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1755 regnode *n = convert+NODE_SZ_STR(convert);
1756 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1757 trie->startstate = state;
1758 trie->minlen -= (state - 1);
1759 trie->maxlen -= (state - 1);
1761 regnode *fix = convert;
1763 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1764 while( ++fix < n ) {
1765 Set_Node_Offset_Length(fix, 0, 0);
1771 NEXT_OFF(convert) = (U16)(tail - convert);
1775 if ( trie->maxlen ) {
1776 NEXT_OFF( convert ) = (U16)(tail - convert);
1777 ARG_SET( convert, data_slot );
1778 /* Store the offset to the first unabsorbed branch in
1779 jump[0], which is otherwise unused by the jump logic.
1780 We use this when dumping a trie and during optimisation. */
1782 trie->jump[0] = (U16)(tail - nextbranch);
1786 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1787 ((char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1789 OP( convert ) = TRIEC;
1790 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1791 Safefree(trie->bitmap);
1794 OP( convert ) = TRIE;
1796 /* store the type in the flags */
1797 convert->flags = nodetype;
1798 /* XXX We really should free up the resource in trie now, as we wont use them */
1800 /* needed for dumping*/
1802 regnode *optimize = convert
1804 + regarglen[ OP( convert ) ];
1805 regnode *opt = convert;
1806 while (++opt<optimize) {
1807 Set_Node_Offset_Length(opt,0,0);
1810 Try to clean up some of the debris left after the
1813 while( optimize < jumper ) {
1814 mjd_nodelen += Node_Length((optimize));
1815 OP( optimize ) = OPTIMIZED;
1816 Set_Node_Offset_Length(optimize,0,0);
1819 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1821 } /* end node insert */
1823 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1827 : trie->startstate>1
1833 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1835 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1837 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1838 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1841 We find the fail state for each state in the trie, this state is the longest proper
1842 suffix of the current states 'word' that is also a proper prefix of another word in our
1843 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1844 the DFA not to have to restart after its tried and failed a word at a given point, it
1845 simply continues as though it had been matching the other word in the first place.
1847 'abcdgu'=~/abcdefg|cdgu/
1848 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1849 fail, which would bring use to the state representing 'd' in the second word where we would
1850 try 'g' and succeed, prodceding to match 'cdgu'.
1852 /* add a fail transition */
1853 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1855 const U32 ucharcount = trie->uniquecharcount;
1856 const U32 numstates = trie->laststate;
1857 const U32 ubound = trie->lasttrans + ucharcount;
1861 U32 base = trie->states[ 1 ].trans.base;
1864 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1865 GET_RE_DEBUG_FLAGS_DECL;
1867 PERL_UNUSED_ARG(depth);
1871 ARG_SET( stclass, data_slot );
1872 Newxz( aho, 1, reg_ac_data );
1873 RExC_rx->data->data[ data_slot ] = (void*)aho;
1875 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1876 (trie->laststate+1)*sizeof(reg_trie_state));
1877 Newxz( q, numstates, U32);
1878 Newxz( aho->fail, numstates, U32 );
1881 /* initialize fail[0..1] to be 1 so that we always have
1882 a valid final fail state */
1883 fail[ 0 ] = fail[ 1 ] = 1;
1885 for ( charid = 0; charid < ucharcount ; charid++ ) {
1886 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1888 q[ q_write ] = newstate;
1889 /* set to point at the root */
1890 fail[ q[ q_write++ ] ]=1;
1893 while ( q_read < q_write) {
1894 const U32 cur = q[ q_read++ % numstates ];
1895 base = trie->states[ cur ].trans.base;
1897 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1898 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1900 U32 fail_state = cur;
1903 fail_state = fail[ fail_state ];
1904 fail_base = aho->states[ fail_state ].trans.base;
1905 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1907 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1908 fail[ ch_state ] = fail_state;
1909 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1911 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1913 q[ q_write++ % numstates] = ch_state;
1917 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
1918 when we fail in state 1, this allows us to use the
1919 charclass scan to find a valid start char. This is based on the principle
1920 that theres a good chance the string being searched contains lots of stuff
1921 that cant be a start char.
1923 fail[ 0 ] = fail[ 1 ] = 0;
1924 DEBUG_TRIE_COMPILE_r({
1925 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
1926 for( q_read=1; q_read<numstates; q_read++ ) {
1927 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
1929 PerlIO_printf(Perl_debug_log, "\n");
1932 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1937 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1938 * These need to be revisited when a newer toolchain becomes available.
1940 #if defined(__sparc64__) && defined(__GNUC__)
1941 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1942 # undef SPARC64_GCC_WORKAROUND
1943 # define SPARC64_GCC_WORKAROUND 1
1947 #define DEBUG_PEEP(str,scan,depth) \
1948 DEBUG_OPTIMISE_r({ \
1949 SV * const mysv=sv_newmortal(); \
1950 regnode *Next = regnext(scan); \
1951 regprop(RExC_rx, mysv, scan); \
1952 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1953 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1954 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1957 #define JOIN_EXACT(scan,min,flags) \
1958 if (PL_regkind[OP(scan)] == EXACT) \
1959 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1962 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1963 /* Merge several consecutive EXACTish nodes into one. */
1964 regnode *n = regnext(scan);
1966 regnode *next = scan + NODE_SZ_STR(scan);
1970 regnode *stop = scan;
1971 GET_RE_DEBUG_FLAGS_DECL;
1973 PERL_UNUSED_ARG(depth);
1975 #ifndef EXPERIMENTAL_INPLACESCAN
1976 PERL_UNUSED_ARG(flags);
1977 PERL_UNUSED_ARG(val);
1979 DEBUG_PEEP("join",scan,depth);
1981 /* Skip NOTHING, merge EXACT*. */
1983 ( PL_regkind[OP(n)] == NOTHING ||
1984 (stringok && (OP(n) == OP(scan))))
1986 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1988 if (OP(n) == TAIL || n > next)
1990 if (PL_regkind[OP(n)] == NOTHING) {
1991 DEBUG_PEEP("skip:",n,depth);
1992 NEXT_OFF(scan) += NEXT_OFF(n);
1993 next = n + NODE_STEP_REGNODE;
2000 else if (stringok) {
2001 const unsigned int oldl = STR_LEN(scan);
2002 regnode * const nnext = regnext(n);
2004 DEBUG_PEEP("merg",n,depth);
2007 if (oldl + STR_LEN(n) > U8_MAX)
2009 NEXT_OFF(scan) += NEXT_OFF(n);
2010 STR_LEN(scan) += STR_LEN(n);
2011 next = n + NODE_SZ_STR(n);
2012 /* Now we can overwrite *n : */
2013 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2021 #ifdef EXPERIMENTAL_INPLACESCAN
2022 if (flags && !NEXT_OFF(n)) {
2023 DEBUG_PEEP("atch", val, depth);
2024 if (reg_off_by_arg[OP(n)]) {
2025 ARG_SET(n, val - n);
2028 NEXT_OFF(n) = val - n;
2035 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2037 Two problematic code points in Unicode casefolding of EXACT nodes:
2039 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2040 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2046 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2047 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2049 This means that in case-insensitive matching (or "loose matching",
2050 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2051 length of the above casefolded versions) can match a target string
2052 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2053 This would rather mess up the minimum length computation.
2055 What we'll do is to look for the tail four bytes, and then peek
2056 at the preceding two bytes to see whether we need to decrease
2057 the minimum length by four (six minus two).
2059 Thanks to the design of UTF-8, there cannot be false matches:
2060 A sequence of valid UTF-8 bytes cannot be a subsequence of
2061 another valid sequence of UTF-8 bytes.
2064 char * const s0 = STRING(scan), *s, *t;
2065 char * const s1 = s0 + STR_LEN(scan) - 1;
2066 char * const s2 = s1 - 4;
2067 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2068 const char t0[] = "\xaf\x49\xaf\x42";
2070 const char t0[] = "\xcc\x88\xcc\x81";
2072 const char * const t1 = t0 + 3;
2075 s < s2 && (t = ninstr(s, s1, t0, t1));
2078 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2079 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2081 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2082 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2090 n = scan + NODE_SZ_STR(scan);
2092 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2099 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2103 /* REx optimizer. Converts nodes into quickier variants "in place".
2104 Finds fixed substrings. */
2106 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2107 to the position after last scanned or to NULL. */
2112 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2113 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2114 /* scanp: Start here (read-write). */
2115 /* deltap: Write maxlen-minlen here. */
2116 /* last: Stop before this one. */
2119 I32 min = 0, pars = 0, code;
2120 regnode *scan = *scanp, *next;
2122 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2123 int is_inf_internal = 0; /* The studied chunk is infinite */
2124 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2125 scan_data_t data_fake;
2126 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2127 SV *re_trie_maxbuff = NULL;
2128 regnode *first_non_open = scan;
2131 GET_RE_DEBUG_FLAGS_DECL;
2133 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2136 while (first_non_open && OP(first_non_open) == OPEN)
2137 first_non_open=regnext(first_non_open);
2141 while (scan && OP(scan) != END && scan < last) {
2142 /* Peephole optimizer: */
2143 DEBUG_PEEP("Peep",scan,depth);
2145 JOIN_EXACT(scan,&min,0);
2147 /* Follow the next-chain of the current node and optimize
2148 away all the NOTHINGs from it. */
2149 if (OP(scan) != CURLYX) {
2150 const int max = (reg_off_by_arg[OP(scan)]
2152 /* I32 may be smaller than U16 on CRAYs! */
2153 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2154 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2158 /* Skip NOTHING and LONGJMP. */
2159 while ((n = regnext(n))
2160 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2161 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2162 && off + noff < max)
2164 if (reg_off_by_arg[OP(scan)])
2167 NEXT_OFF(scan) = off;
2172 /* The principal pseudo-switch. Cannot be a switch, since we
2173 look into several different things. */
2174 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2175 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2176 next = regnext(scan);
2178 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2180 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2181 /* NOTE - There is similar code to this block below for handling
2182 TRIE nodes on a re-study. If you change stuff here check there
2184 I32 max1 = 0, min1 = I32_MAX, num = 0;
2185 struct regnode_charclass_class accum;
2186 regnode * const startbranch=scan;
2188 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2189 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
2190 if (flags & SCF_DO_STCLASS)
2191 cl_init_zero(pRExC_state, &accum);
2193 while (OP(scan) == code) {
2194 I32 deltanext, minnext, f = 0, fake;
2195 struct regnode_charclass_class this_class;
2198 data_fake.flags = 0;
2200 data_fake.whilem_c = data->whilem_c;
2201 data_fake.last_closep = data->last_closep;
2204 data_fake.last_closep = &fake;
2205 next = regnext(scan);
2206 scan = NEXTOPER(scan);
2208 scan = NEXTOPER(scan);
2209 if (flags & SCF_DO_STCLASS) {
2210 cl_init(pRExC_state, &this_class);
2211 data_fake.start_class = &this_class;
2212 f = SCF_DO_STCLASS_AND;
2214 if (flags & SCF_WHILEM_VISITED_POS)
2215 f |= SCF_WHILEM_VISITED_POS;
2217 /* we suppose the run is continuous, last=next...*/
2218 minnext = study_chunk(pRExC_state, &scan, &deltanext,
2219 next, &data_fake, f,depth+1);
2222 if (max1 < minnext + deltanext)
2223 max1 = minnext + deltanext;
2224 if (deltanext == I32_MAX)
2225 is_inf = is_inf_internal = 1;
2227 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2230 if (data_fake.flags & SF_HAS_EVAL)
2231 data->flags |= SF_HAS_EVAL;
2232 data->whilem_c = data_fake.whilem_c;
2234 if (flags & SCF_DO_STCLASS)
2235 cl_or(pRExC_state, &accum, &this_class);
2236 if (code == SUSPEND)
2239 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2241 if (flags & SCF_DO_SUBSTR) {
2242 data->pos_min += min1;
2243 data->pos_delta += max1 - min1;
2244 if (max1 != min1 || is_inf)
2245 data->longest = &(data->longest_float);
2248 delta += max1 - min1;
2249 if (flags & SCF_DO_STCLASS_OR) {
2250 cl_or(pRExC_state, data->start_class, &accum);
2252 cl_and(data->start_class, &and_with);
2253 flags &= ~SCF_DO_STCLASS;
2256 else if (flags & SCF_DO_STCLASS_AND) {
2258 cl_and(data->start_class, &accum);
2259 flags &= ~SCF_DO_STCLASS;
2262 /* Switch to OR mode: cache the old value of
2263 * data->start_class */
2264 StructCopy(data->start_class, &and_with,
2265 struct regnode_charclass_class);
2266 flags &= ~SCF_DO_STCLASS_AND;
2267 StructCopy(&accum, data->start_class,
2268 struct regnode_charclass_class);
2269 flags |= SCF_DO_STCLASS_OR;
2270 data->start_class->flags |= ANYOF_EOS;
2274 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2277 Assuming this was/is a branch we are dealing with: 'scan' now
2278 points at the item that follows the branch sequence, whatever
2279 it is. We now start at the beginning of the sequence and look
2286 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2288 If we can find such a subseqence we need to turn the first
2289 element into a trie and then add the subsequent branch exact
2290 strings to the trie.
2294 1. patterns where the whole set of branch can be converted.
2296 2. patterns where only a subset can be converted.
2298 In case 1 we can replace the whole set with a single regop
2299 for the trie. In case 2 we need to keep the start and end
2302 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2303 becomes BRANCH TRIE; BRANCH X;
2305 There is an additional case, that being where there is a
2306 common prefix, which gets split out into an EXACT like node
2307 preceding the TRIE node.
2309 If x(1..n)==tail then we can do a simple trie, if not we make
2310 a "jump" trie, such that when we match the appropriate word
2311 we "jump" to the appopriate tail node. Essentailly we turn
2312 a nested if into a case structure of sorts.
2317 if (!re_trie_maxbuff) {
2318 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2319 if (!SvIOK(re_trie_maxbuff))
2320 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2322 if ( SvIV(re_trie_maxbuff)>=0 ) {
2324 regnode *first = (regnode *)NULL;
2325 regnode *last = (regnode *)NULL;
2326 regnode *tail = scan;
2331 SV * const mysv = sv_newmortal(); /* for dumping */
2333 /* var tail is used because there may be a TAIL
2334 regop in the way. Ie, the exacts will point to the
2335 thing following the TAIL, but the last branch will
2336 point at the TAIL. So we advance tail. If we
2337 have nested (?:) we may have to move through several
2341 while ( OP( tail ) == TAIL ) {
2342 /* this is the TAIL generated by (?:) */
2343 tail = regnext( tail );
2348 regprop(RExC_rx, mysv, tail );
2349 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2350 (int)depth * 2 + 2, "",
2351 "Looking for TRIE'able sequences. Tail node is: ",
2352 SvPV_nolen_const( mysv )
2358 step through the branches, cur represents each
2359 branch, noper is the first thing to be matched
2360 as part of that branch and noper_next is the
2361 regnext() of that node. if noper is an EXACT
2362 and noper_next is the same as scan (our current
2363 position in the regex) then the EXACT branch is
2364 a possible optimization target. Once we have
2365 two or more consequetive such branches we can
2366 create a trie of the EXACT's contents and stich
2367 it in place. If the sequence represents all of
2368 the branches we eliminate the whole thing and
2369 replace it with a single TRIE. If it is a
2370 subsequence then we need to stitch it in. This
2371 means the first branch has to remain, and needs
2372 to be repointed at the item on the branch chain
2373 following the last branch optimized. This could
2374 be either a BRANCH, in which case the
2375 subsequence is internal, or it could be the
2376 item following the branch sequence in which
2377 case the subsequence is at the end.
2381 /* dont use tail as the end marker for this traverse */
2382 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2383 regnode * const noper = NEXTOPER( cur );
2384 regnode * const noper_next = regnext( noper );
2387 regprop(RExC_rx, mysv, cur);
2388 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2389 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2391 regprop(RExC_rx, mysv, noper);
2392 PerlIO_printf( Perl_debug_log, " -> %s",
2393 SvPV_nolen_const(mysv));
2396 regprop(RExC_rx, mysv, noper_next );
2397 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2398 SvPV_nolen_const(mysv));
2400 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2401 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2403 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2404 : PL_regkind[ OP( noper ) ] == EXACT )
2405 || OP(noper) == NOTHING )
2407 && noper_next == tail
2412 if ( !first || optype == NOTHING ) {
2413 if (!first) first = cur;
2414 optype = OP( noper );
2420 make_trie( pRExC_state,
2421 startbranch, first, cur, tail, count,
2424 if ( PL_regkind[ OP( noper ) ] == EXACT
2426 && noper_next == tail
2431 optype = OP( noper );
2441 regprop(RExC_rx, mysv, cur);
2442 PerlIO_printf( Perl_debug_log,
2443 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2444 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2448 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2449 #ifdef TRIE_STUDY_OPT
2450 if ( ((made == MADE_EXACT_TRIE &&
2451 startbranch == first)
2452 || ( first_non_open == first )) &&
2454 flags |= SCF_TRIE_RESTUDY;
2462 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2463 scan = NEXTOPER(NEXTOPER(scan));
2464 } else /* single branch is optimized. */
2465 scan = NEXTOPER(scan);
2468 else if (OP(scan) == EXACT) {
2469 I32 l = STR_LEN(scan);
2472 const U8 * const s = (U8*)STRING(scan);
2473 l = utf8_length(s, s + l);
2474 uc = utf8_to_uvchr(s, NULL);
2476 uc = *((U8*)STRING(scan));
2479 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2480 /* The code below prefers earlier match for fixed
2481 offset, later match for variable offset. */
2482 if (data->last_end == -1) { /* Update the start info. */
2483 data->last_start_min = data->pos_min;
2484 data->last_start_max = is_inf
2485 ? I32_MAX : data->pos_min + data->pos_delta;
2487 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2489 SvUTF8_on(data->last_found);
2491 SV * const sv = data->last_found;
2492 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2493 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2494 if (mg && mg->mg_len >= 0)
2495 mg->mg_len += utf8_length((U8*)STRING(scan),
2496 (U8*)STRING(scan)+STR_LEN(scan));
2498 data->last_end = data->pos_min + l;
2499 data->pos_min += l; /* As in the first entry. */
2500 data->flags &= ~SF_BEFORE_EOL;
2502 if (flags & SCF_DO_STCLASS_AND) {
2503 /* Check whether it is compatible with what we know already! */
2507 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2508 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2509 && (!(data->start_class->flags & ANYOF_FOLD)
2510 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2513 ANYOF_CLASS_ZERO(data->start_class);
2514 ANYOF_BITMAP_ZERO(data->start_class);
2516 ANYOF_BITMAP_SET(data->start_class, uc);
2517 data->start_class->flags &= ~ANYOF_EOS;
2519 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2521 else if (flags & SCF_DO_STCLASS_OR) {
2522 /* false positive possible if the class is case-folded */
2524 ANYOF_BITMAP_SET(data->start_class, uc);
2526 data->start_class->flags |= ANYOF_UNICODE_ALL;
2527 data->start_class->flags &= ~ANYOF_EOS;
2528 cl_and(data->start_class, &and_with);
2530 flags &= ~SCF_DO_STCLASS;
2532 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2533 I32 l = STR_LEN(scan);
2534 UV uc = *((U8*)STRING(scan));
2536 /* Search for fixed substrings supports EXACT only. */
2537 if (flags & SCF_DO_SUBSTR) {
2539 scan_commit(pRExC_state, data);
2542 const U8 * const s = (U8 *)STRING(scan);
2543 l = utf8_length(s, s + l);
2544 uc = utf8_to_uvchr(s, NULL);
2547 if (flags & SCF_DO_SUBSTR)
2549 if (flags & SCF_DO_STCLASS_AND) {
2550 /* Check whether it is compatible with what we know already! */
2554 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2555 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2556 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2558 ANYOF_CLASS_ZERO(data->start_class);
2559 ANYOF_BITMAP_ZERO(data->start_class);
2561 ANYOF_BITMAP_SET(data->start_class, uc);
2562 data->start_class->flags &= ~ANYOF_EOS;
2563 data->start_class->flags |= ANYOF_FOLD;
2564 if (OP(scan) == EXACTFL)
2565 data->start_class->flags |= ANYOF_LOCALE;
2568 else if (flags & SCF_DO_STCLASS_OR) {
2569 if (data->start_class->flags & ANYOF_FOLD) {
2570 /* false positive possible if the class is case-folded.
2571 Assume that the locale settings are the same... */
2573 ANYOF_BITMAP_SET(data->start_class, uc);
2574 data->start_class->flags &= ~ANYOF_EOS;
2576 cl_and(data->start_class, &and_with);
2578 flags &= ~SCF_DO_STCLASS;
2580 else if (strchr((const char*)PL_varies,OP(scan))) {
2581 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2582 I32 f = flags, pos_before = 0;
2583 regnode * const oscan = scan;
2584 struct regnode_charclass_class this_class;
2585 struct regnode_charclass_class *oclass = NULL;
2586 I32 next_is_eval = 0;
2588 switch (PL_regkind[OP(scan)]) {
2589 case WHILEM: /* End of (?:...)* . */
2590 scan = NEXTOPER(scan);
2593 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2594 next = NEXTOPER(scan);
2595 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2597 maxcount = REG_INFTY;
2598 next = regnext(scan);
2599 scan = NEXTOPER(scan);
2603 if (flags & SCF_DO_SUBSTR)
2608 if (flags & SCF_DO_STCLASS) {
2610 maxcount = REG_INFTY;
2611 next = regnext(scan);
2612 scan = NEXTOPER(scan);
2615 is_inf = is_inf_internal = 1;
2616 scan = regnext(scan);
2617 if (flags & SCF_DO_SUBSTR) {
2618 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
2619 data->longest = &(data->longest_float);
2621 goto optimize_curly_tail;
2623 mincount = ARG1(scan);
2624 maxcount = ARG2(scan);
2625 next = regnext(scan);
2626 if (OP(scan) == CURLYX) {
2627 I32 lp = (data ? *(data->last_closep) : 0);
2628 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2630 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2631 next_is_eval = (OP(scan) == EVAL);
2633 if (flags & SCF_DO_SUBSTR) {
2634 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
2635 pos_before = data->pos_min;
2639 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2641 data->flags |= SF_IS_INF;
2643 if (flags & SCF_DO_STCLASS) {
2644 cl_init(pRExC_state, &this_class);
2645 oclass = data->start_class;
2646 data->start_class = &this_class;
2647 f |= SCF_DO_STCLASS_AND;
2648 f &= ~SCF_DO_STCLASS_OR;
2650 /* These are the cases when once a subexpression
2651 fails at a particular position, it cannot succeed
2652 even after backtracking at the enclosing scope.
2654 XXXX what if minimal match and we are at the
2655 initial run of {n,m}? */
2656 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2657 f &= ~SCF_WHILEM_VISITED_POS;
2659 /* This will finish on WHILEM, setting scan, or on NULL: */
2660 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
2662 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2664 if (flags & SCF_DO_STCLASS)
2665 data->start_class = oclass;
2666 if (mincount == 0 || minnext == 0) {
2667 if (flags & SCF_DO_STCLASS_OR) {
2668 cl_or(pRExC_state, data->start_class, &this_class);
2670 else if (flags & SCF_DO_STCLASS_AND) {
2671 /* Switch to OR mode: cache the old value of
2672 * data->start_class */
2673 StructCopy(data->start_class, &and_with,
2674 struct regnode_charclass_class);
2675 flags &= ~SCF_DO_STCLASS_AND;
2676 StructCopy(&this_class, data->start_class,
2677 struct regnode_charclass_class);
2678 flags |= SCF_DO_STCLASS_OR;
2679 data->start_class->flags |= ANYOF_EOS;
2681 } else { /* Non-zero len */
2682 if (flags & SCF_DO_STCLASS_OR) {
2683 cl_or(pRExC_state, data->start_class, &this_class);
2684 cl_and(data->start_class, &and_with);
2686 else if (flags & SCF_DO_STCLASS_AND)
2687 cl_and(data->start_class, &this_class);
2688 flags &= ~SCF_DO_STCLASS;
2690 if (!scan) /* It was not CURLYX, but CURLY. */
2692 if ( /* ? quantifier ok, except for (?{ ... }) */
2693 (next_is_eval || !(mincount == 0 && maxcount == 1))
2694 && (minnext == 0) && (deltanext == 0)
2695 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2696 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2697 && ckWARN(WARN_REGEXP))
2700 "Quantifier unexpected on zero-length expression");
2703 min += minnext * mincount;
2704 is_inf_internal |= ((maxcount == REG_INFTY
2705 && (minnext + deltanext) > 0)
2706 || deltanext == I32_MAX);
2707 is_inf |= is_inf_internal;
2708 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2710 /* Try powerful optimization CURLYX => CURLYN. */
2711 if ( OP(oscan) == CURLYX && data
2712 && data->flags & SF_IN_PAR
2713 && !(data->flags & SF_HAS_EVAL)
2714 && !deltanext && minnext == 1 ) {
2715 /* Try to optimize to CURLYN. */
2716 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2717 regnode * const nxt1 = nxt;
2724 if (!strchr((const char*)PL_simple,OP(nxt))
2725 && !(PL_regkind[OP(nxt)] == EXACT
2726 && STR_LEN(nxt) == 1))
2732 if (OP(nxt) != CLOSE)
2734 /* Now we know that nxt2 is the only contents: */
2735 oscan->flags = (U8)ARG(nxt);
2737 OP(nxt1) = NOTHING; /* was OPEN. */
2739 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2740 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2741 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2742 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2743 OP(nxt + 1) = OPTIMIZED; /* was count. */
2744 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2749 /* Try optimization CURLYX => CURLYM. */
2750 if ( OP(oscan) == CURLYX && data
2751 && !(data->flags & SF_HAS_PAR)
2752 && !(data->flags & SF_HAS_EVAL)
2753 && !deltanext /* atom is fixed width */
2754 && minnext != 0 /* CURLYM can't handle zero width */
2756 /* XXXX How to optimize if data == 0? */
2757 /* Optimize to a simpler form. */
2758 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2762 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2763 && (OP(nxt2) != WHILEM))
2765 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2766 /* Need to optimize away parenths. */
2767 if (data->flags & SF_IN_PAR) {
2768 /* Set the parenth number. */
2769 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2771 if (OP(nxt) != CLOSE)
2772 FAIL("Panic opt close");
2773 oscan->flags = (U8)ARG(nxt);
2774 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2775 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2777 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2778 OP(nxt + 1) = OPTIMIZED; /* was count. */
2779 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2780 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2783 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2784 regnode *nnxt = regnext(nxt1);
2787 if (reg_off_by_arg[OP(nxt1)])
2788 ARG_SET(nxt1, nxt2 - nxt1);
2789 else if (nxt2 - nxt1 < U16_MAX)
2790 NEXT_OFF(nxt1) = nxt2 - nxt1;
2792 OP(nxt) = NOTHING; /* Cannot beautify */
2797 /* Optimize again: */
2798 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
2804 else if ((OP(oscan) == CURLYX)
2805 && (flags & SCF_WHILEM_VISITED_POS)
2806 /* See the comment on a similar expression above.
2807 However, this time it not a subexpression
2808 we care about, but the expression itself. */
2809 && (maxcount == REG_INFTY)
2810 && data && ++data->whilem_c < 16) {
2811 /* This stays as CURLYX, we can put the count/of pair. */
2812 /* Find WHILEM (as in regexec.c) */
2813 regnode *nxt = oscan + NEXT_OFF(oscan);
2815 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2817 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2818 | (RExC_whilem_seen << 4)); /* On WHILEM */
2820 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2822 if (flags & SCF_DO_SUBSTR) {
2823 SV *last_str = NULL;
2824 int counted = mincount != 0;
2826 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2827 #if defined(SPARC64_GCC_WORKAROUND)
2830 const char *s = NULL;
2833 if (pos_before >= data->last_start_min)
2836 b = data->last_start_min;
2839 s = SvPV_const(data->last_found, l);
2840 old = b - data->last_start_min;
2843 I32 b = pos_before >= data->last_start_min
2844 ? pos_before : data->last_start_min;
2846 const char * const s = SvPV_const(data->last_found, l);
2847 I32 old = b - data->last_start_min;
2851 old = utf8_hop((U8*)s, old) - (U8*)s;
2854 /* Get the added string: */
2855 last_str = newSVpvn(s + old, l);
2857 SvUTF8_on(last_str);
2858 if (deltanext == 0 && pos_before == b) {
2859 /* What was added is a constant string */
2861 SvGROW(last_str, (mincount * l) + 1);
2862 repeatcpy(SvPVX(last_str) + l,
2863 SvPVX_const(last_str), l, mincount - 1);
2864 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2865 /* Add additional parts. */
2866 SvCUR_set(data->last_found,
2867 SvCUR(data->last_found) - l);
2868 sv_catsv(data->last_found, last_str);
2870 SV * sv = data->last_found;
2872 SvUTF8(sv) && SvMAGICAL(sv) ?
2873 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2874 if (mg && mg->mg_len >= 0)
2875 mg->mg_len += CHR_SVLEN(last_str);
2877 data->last_end += l * (mincount - 1);
2880 /* start offset must point into the last copy */
2881 data->last_start_min += minnext * (mincount - 1);
2882 data->last_start_max += is_inf ? I32_MAX
2883 : (maxcount - 1) * (minnext + data->pos_delta);
2886 /* It is counted once already... */
2887 data->pos_min += minnext * (mincount - counted);
2888 data->pos_delta += - counted * deltanext +
2889 (minnext + deltanext) * maxcount - minnext * mincount;
2890 if (mincount != maxcount) {
2891 /* Cannot extend fixed substrings found inside
2893 scan_commit(pRExC_state,data);
2894 if (mincount && last_str) {
2895 SV * const sv = data->last_found;
2896 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2897 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2901 sv_setsv(sv, last_str);
2902 data->last_end = data->pos_min;
2903 data->last_start_min =
2904 data->pos_min - CHR_SVLEN(last_str);
2905 data->last_start_max = is_inf
2907 : data->pos_min + data->pos_delta
2908 - CHR_SVLEN(last_str);
2910 data->longest = &(data->longest_float);
2912 SvREFCNT_dec(last_str);
2914 if (data && (fl & SF_HAS_EVAL))
2915 data->flags |= SF_HAS_EVAL;
2916 optimize_curly_tail:
2917 if (OP(oscan) != CURLYX) {
2918 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
2920 NEXT_OFF(oscan) += NEXT_OFF(next);
2923 default: /* REF and CLUMP only? */
2924 if (flags & SCF_DO_SUBSTR) {
2925 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2926 data->longest = &(data->longest_float);
2928 is_inf = is_inf_internal = 1;
2929 if (flags & SCF_DO_STCLASS_OR)
2930 cl_anything(pRExC_state, data->start_class);
2931 flags &= ~SCF_DO_STCLASS;
2935 else if (strchr((const char*)PL_simple,OP(scan))) {
2938 if (flags & SCF_DO_SUBSTR) {
2939 scan_commit(pRExC_state,data);
2943 if (flags & SCF_DO_STCLASS) {
2944 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2946 /* Some of the logic below assumes that switching
2947 locale on will only add false positives. */
2948 switch (PL_regkind[OP(scan)]) {
2952 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2953 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2954 cl_anything(pRExC_state, data->start_class);
2957 if (OP(scan) == SANY)
2959 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2960 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2961 || (data->start_class->flags & ANYOF_CLASS));
2962 cl_anything(pRExC_state, data->start_class);
2964 if (flags & SCF_DO_STCLASS_AND || !value)
2965 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2968 if (flags & SCF_DO_STCLASS_AND)
2969 cl_and(data->start_class,
2970 (struct regnode_charclass_class*)scan);
2972 cl_or(pRExC_state, data->start_class,
2973 (struct regnode_charclass_class*)scan);
2976 if (flags & SCF_DO_STCLASS_AND) {
2977 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2978 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2979 for (value = 0; value < 256; value++)
2980 if (!isALNUM(value))
2981 ANYOF_BITMAP_CLEAR(data->start_class, value);
2985 if (data->start_class->flags & ANYOF_LOCALE)
2986 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2988 for (value = 0; value < 256; value++)
2990 ANYOF_BITMAP_SET(data->start_class, value);
2995 if (flags & SCF_DO_STCLASS_AND) {
2996 if (data->start_class->flags & ANYOF_LOCALE)
2997 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3001 data->start_class->flags |= ANYOF_LOCALE;
3005 if (flags & SCF_DO_STCLASS_AND) {
3006 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3007 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3008 for (value = 0; value < 256; value++)
3010 ANYOF_BITMAP_CLEAR(data->start_class, value);
3014 if (data->start_class->flags & ANYOF_LOCALE)
3015 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3017 for (value = 0; value < 256; value++)
3018 if (!isALNUM(value))
3019 ANYOF_BITMAP_SET(data->start_class, value);
3024 if (flags & SCF_DO_STCLASS_AND) {
3025 if (data->start_class->flags & ANYOF_LOCALE)
3026 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3029 data->start_class->flags |= ANYOF_LOCALE;
3030 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3034 if (flags & SCF_DO_STCLASS_AND) {
3035 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3036 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3037 for (value = 0; value < 256; value++)
3038 if (!isSPACE(value))
3039 ANYOF_BITMAP_CLEAR(data->start_class, value);
3043 if (data->start_class->flags & ANYOF_LOCALE)
3044 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3046 for (value = 0; value < 256; value++)
3048 ANYOF_BITMAP_SET(data->start_class, value);
3053 if (flags & SCF_DO_STCLASS_AND) {
3054 if (data->start_class->flags & ANYOF_LOCALE)
3055 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3058 data->start_class->flags |= ANYOF_LOCALE;
3059 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3063 if (flags & SCF_DO_STCLASS_AND) {
3064 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3065 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3066 for (value = 0; value < 256; value++)
3068 ANYOF_BITMAP_CLEAR(data->start_class, value);
3072 if (data->start_class->flags & ANYOF_LOCALE)
3073 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3075 for (value = 0; value < 256; value++)
3076 if (!isSPACE(value))
3077 ANYOF_BITMAP_SET(data->start_class, value);
3082 if (flags & SCF_DO_STCLASS_AND) {
3083 if (data->start_class->flags & ANYOF_LOCALE) {
3084 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3085 for (value = 0; value < 256; value++)
3086 if (!isSPACE(value))
3087 ANYOF_BITMAP_CLEAR(data->start_class, value);
3091 data->start_class->flags |= ANYOF_LOCALE;
3092 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3096 if (flags & SCF_DO_STCLASS_AND) {
3097 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3098 for (value = 0; value < 256; value++)
3099 if (!isDIGIT(value))
3100 ANYOF_BITMAP_CLEAR(data->start_class, value);
3103 if (data->start_class->flags & ANYOF_LOCALE)
3104 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3106 for (value = 0; value < 256; value++)
3108 ANYOF_BITMAP_SET(data->start_class, value);
3113 if (flags & SCF_DO_STCLASS_AND) {
3114 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3115 for (value = 0; value < 256; value++)
3117 ANYOF_BITMAP_CLEAR(data->start_class, value);
3120 if (data->start_class->flags & ANYOF_LOCALE)
3121 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3123 for (value = 0; value < 256; value++)
3124 if (!isDIGIT(value))
3125 ANYOF_BITMAP_SET(data->start_class, value);
3130 if (flags & SCF_DO_STCLASS_OR)
3131 cl_and(data->start_class, &and_with);
3132 flags &= ~SCF_DO_STCLASS;
3135 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3136 data->flags |= (OP(scan) == MEOL
3140 else if ( PL_regkind[OP(scan)] == BRANCHJ
3141 /* Lookbehind, or need to calculate parens/evals/stclass: */
3142 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3143 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3144 /* Lookahead/lookbehind */
3145 I32 deltanext, minnext, fake = 0;
3147 struct regnode_charclass_class intrnl;
3150 data_fake.flags = 0;
3152 data_fake.whilem_c = data->whilem_c;
3153 data_fake.last_closep = data->last_closep;
3156 data_fake.last_closep = &fake;
3157 if ( flags & SCF_DO_STCLASS && !scan->flags
3158 && OP(scan) == IFMATCH ) { /* Lookahead */
3159 cl_init(pRExC_state, &intrnl);
3160 data_fake.start_class = &intrnl;
3161 f |= SCF_DO_STCLASS_AND;
3163 if (flags & SCF_WHILEM_VISITED_POS)
3164 f |= SCF_WHILEM_VISITED_POS;
3165 next = regnext(scan);
3166 nscan = NEXTOPER(NEXTOPER(scan));
3167 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
3170 vFAIL("Variable length lookbehind not implemented");
3172 else if (minnext > (I32)U8_MAX) {
3173 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3175 scan->flags = (U8)minnext;
3178 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3180 if (data_fake.flags & SF_HAS_EVAL)
3181 data->flags |= SF_HAS_EVAL;
3182 data->whilem_c = data_fake.whilem_c;
3184 if (f & SCF_DO_STCLASS_AND) {
3185 const int was = (data->start_class->flags & ANYOF_EOS);
3187 cl_and(data->start_class, &intrnl);
3189 data->start_class->flags |= ANYOF_EOS;
3192 else if (OP(scan) == OPEN) {
3195 else if (OP(scan) == CLOSE) {
3196 if ((I32)ARG(scan) == is_par) {
3197 next = regnext(scan);
3199 if ( next && (OP(next) != WHILEM) && next < last)
3200 is_par = 0; /* Disable optimization */
3203 *(data->last_closep) = ARG(scan);
3205 else if (OP(scan) == EVAL) {
3207 data->flags |= SF_HAS_EVAL;
3209 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3210 if (flags & SCF_DO_SUBSTR) {
3211 scan_commit(pRExC_state,data);
3212 data->longest = &(data->longest_float);
3214 is_inf = is_inf_internal = 1;
3215 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3216 cl_anything(pRExC_state, data->start_class);
3217 flags &= ~SCF_DO_STCLASS;
3219 #ifdef TRIE_STUDY_OPT
3220 #ifdef FULL_TRIE_STUDY
3221 else if (PL_regkind[OP(scan)] == TRIE) {
3222 /* NOTE - There is similar code to this block above for handling
3223 BRANCH nodes on the initial study. If you change stuff here
3225 regnode *tail= regnext(scan);
3226 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3227 I32 max1 = 0, min1 = I32_MAX;
3228 struct regnode_charclass_class accum;
3230 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3231 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
3232 if (flags & SCF_DO_STCLASS)
3233 cl_init_zero(pRExC_state, &accum);
3239 const regnode *nextbranch= NULL;
3242 for ( word=1 ; word <= trie->wordcount ; word++)
3244 I32 deltanext=0, minnext=0, f = 0, fake;
3245 struct regnode_charclass_class this_class;
3247 data_fake.flags = 0;
3249 data_fake.whilem_c = data->whilem_c;
3250 data_fake.last_closep = data->last_closep;
3253 data_fake.last_closep = &fake;
3255 if (flags & SCF_DO_STCLASS) {
3256 cl_init(pRExC_state, &this_class);
3257 data_fake.start_class = &this_class;
3258 f = SCF_DO_STCLASS_AND;
3260 if (flags & SCF_WHILEM_VISITED_POS)
3261 f |= SCF_WHILEM_VISITED_POS;
3263 if (trie->jump[word]) {
3265 nextbranch = tail - trie->jump[0];
3266 scan= tail - trie->jump[word];
3267 /* We go from the jump point to the branch that follows
3268 it. Note this means we need the vestigal unused branches
3269 even though they arent otherwise used.
3271 minnext = study_chunk(pRExC_state, &scan, &deltanext,
3272 (regnode *)nextbranch, &data_fake, f,depth+1);
3274 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3275 nextbranch= regnext((regnode*)nextbranch);
3277 if (min1 > (I32)(minnext + trie->minlen))
3278 min1 = minnext + trie->minlen;
3279 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3280 max1 = minnext + deltanext + trie->maxlen;
3281 if (deltanext == I32_MAX)
3282 is_inf = is_inf_internal = 1;
3284 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3288 if (data_fake.flags & SF_HAS_EVAL)
3289 data->flags |= SF_HAS_EVAL;
3290 data->whilem_c = data_fake.whilem_c;
3292 if (flags & SCF_DO_STCLASS)
3293 cl_or(pRExC_state, &accum, &this_class);
3296 if (flags & SCF_DO_SUBSTR) {
3297 data->pos_min += min1;
3298 data->pos_delta += max1 - min1;
3299 if (max1 != min1 || is_inf)
3300 data->longest = &(data->longest_float);
3303 delta += max1 - min1;
3304 if (flags & SCF_DO_STCLASS_OR) {
3305 cl_or(pRExC_state, data->start_class, &accum);
3307 cl_and(data->start_class, &and_with);
3308 flags &= ~SCF_DO_STCLASS;
3311 else if (flags & SCF_DO_STCLASS_AND) {
3313 cl_and(data->start_class, &accum);
3314 flags &= ~SCF_DO_STCLASS;
3317 /* Switch to OR mode: cache the old value of
3318 * data->start_class */
3319 StructCopy(data->start_class, &and_with,
3320 struct regnode_charclass_class);
3321 flags &= ~SCF_DO_STCLASS_AND;
3322 StructCopy(&accum, data->start_class,
3323 struct regnode_charclass_class);
3324 flags |= SCF_DO_STCLASS_OR;
3325 data->start_class->flags |= ANYOF_EOS;
3332 else if (PL_regkind[OP(scan)] == TRIE) {
3333 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3336 min += trie->minlen;
3337 delta += (trie->maxlen - trie->minlen);
3338 flags &= ~SCF_DO_STCLASS; /* xxx */
3339 if (flags & SCF_DO_SUBSTR) {
3340 scan_commit(pRExC_state,data); /* Cannot expect anything... */
3341 data->pos_min += trie->minlen;
3342 data->pos_delta += (trie->maxlen - trie->minlen);
3343 if (trie->maxlen != trie->minlen)
3344 data->longest = &(data->longest_float);
3346 if (trie->jump) /* no more substrings -- for now /grr*/
3347 flags &= ~SCF_DO_SUBSTR;
3349 #endif /* old or new */
3350 #endif /* TRIE_STUDY_OPT */
3351 /* Else: zero-length, ignore. */
3352 scan = regnext(scan);
3357 *deltap = is_inf_internal ? I32_MAX : delta;
3358 if (flags & SCF_DO_SUBSTR && is_inf)
3359 data->pos_delta = I32_MAX - data->pos_min;
3360 if (is_par > (I32)U8_MAX)
3362 if (is_par && pars==1 && data) {
3363 data->flags |= SF_IN_PAR;
3364 data->flags &= ~SF_HAS_PAR;
3366 else if (pars && data) {
3367 data->flags |= SF_HAS_PAR;
3368 data->flags &= ~SF_IN_PAR;
3370 if (flags & SCF_DO_STCLASS_OR)
3371 cl_and(data->start_class, &and_with);
3372 if (flags & SCF_TRIE_RESTUDY)
3373 data->flags |= SCF_TRIE_RESTUDY;
3378 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3380 if (RExC_rx->data) {
3381 Renewc(RExC_rx->data,
3382 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3383 char, struct reg_data);
3384 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3385 RExC_rx->data->count += n;
3388 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3389 char, struct reg_data);
3390 Newx(RExC_rx->data->what, n, U8);
3391 RExC_rx->data->count = n;
3393 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3394 return RExC_rx->data->count - n;
3397 #ifndef PERL_IN_XSUB_RE
3399 Perl_reginitcolors(pTHX)
3402 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3404 char *t = savepv(s);
3408 t = strchr(t, '\t');
3414 PL_colors[i] = t = (char *)"";
3419 PL_colors[i++] = (char *)"";
3426 #ifdef TRIE_STUDY_OPT
3427 #define CHECK_RESTUDY_GOTO \
3429 (data.flags & SCF_TRIE_RESTUDY) \
3433 #define CHECK_RESTUDY_GOTO
3436 - pregcomp - compile a regular expression into internal code
3438 * We can't allocate space until we know how big the compiled form will be,
3439 * but we can't compile it (and thus know how big it is) until we've got a
3440 * place to put the code. So we cheat: we compile it twice, once with code
3441 * generation turned off and size counting turned on, and once "for real".
3442 * This also means that we don't allocate space until we are sure that the
3443 * thing really will compile successfully, and we never have to move the
3444 * code and thus invalidate pointers into it. (Note that it has to be in
3445 * one piece because free() must be able to free it all.) [NB: not true in perl]
3447 * Beware that the optimization-preparation code in here knows about some
3448 * of the structure of the compiled regexp. [I'll say.]
3451 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3462 RExC_state_t RExC_state;
3463 RExC_state_t * const pRExC_state = &RExC_state;
3464 #ifdef TRIE_STUDY_OPT
3466 RExC_state_t copyRExC_state;
3469 GET_RE_DEBUG_FLAGS_DECL;
3472 FAIL("NULL regexp argument");
3474 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3477 DEBUG_r(if (!PL_colorset) reginitcolors());
3479 SV *dsv= sv_newmortal();
3480 RE_PV_QUOTED_DECL(s, RExC_utf8,
3481 dsv, RExC_precomp, (xend - exp), 60);
3482 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3483 PL_colors[4],PL_colors[5],s);
3485 RExC_flags = pm->op_pmflags;
3489 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3490 RExC_seen_evals = 0;
3493 /* First pass: determine size, legality. */
3500 RExC_emit = &PL_regdummy;
3501 RExC_whilem_seen = 0;
3502 #if 0 /* REGC() is (currently) a NOP at the first pass.
3503 * Clever compilers notice this and complain. --jhi */
3504 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3506 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3507 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3508 RExC_precomp = NULL;
3511 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3512 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3513 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3516 RExC_lastparse=NULL;
3520 /* Small enough for pointer-storage convention?
3521 If extralen==0, this means that we will not need long jumps. */
3522 if (RExC_size >= 0x10000L && RExC_extralen)
3523 RExC_size += RExC_extralen;
3526 if (RExC_whilem_seen > 15)
3527 RExC_whilem_seen = 15;
3529 /* Allocate space and initialize. */
3530 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3533 FAIL("Regexp out of space");
3536 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3537 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3540 r->prelen = xend - exp;
3541 r->precomp = savepvn(RExC_precomp, r->prelen);
3543 #ifdef PERL_OLD_COPY_ON_WRITE
3544 r->saved_copy = NULL;
3546 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3547 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3548 r->lastparen = 0; /* mg.c reads this. */
3550 r->substrs = 0; /* Useful during FAIL. */
3551 r->startp = 0; /* Useful during FAIL. */
3552 r->endp = 0; /* Useful during FAIL. */
3554 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3556 r->offsets[0] = RExC_size;
3558 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3559 "%s %"UVuf" bytes for offset annotations.\n",
3560 r->offsets ? "Got" : "Couldn't get",
3561 (UV)((2*RExC_size+1) * sizeof(U32))));
3565 /* Second pass: emit code. */
3566 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3571 RExC_emit_start = r->program;
3572 RExC_emit = r->program;
3573 /* Store the count of eval-groups for security checks: */
3574 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
3575 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3577 if (reg(pRExC_state, 0, &flags,1) == NULL)
3579 /* XXXX To minimize changes to RE engine we always allocate
3580 3-units-long substrs field. */
3581 Newx(r->substrs, 1, struct reg_substr_data);
3584 minlen=sawplus=sawopen=0;
3585 Zero(r->substrs, 1, struct reg_substr_data);
3586 StructCopy(&zero_scan_data, &data, scan_data_t);
3588 #ifdef TRIE_STUDY_OPT
3590 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3591 RExC_state=copyRExC_state;
3592 if (data.longest_fixed)
3593 SvREFCNT_dec(data.longest_fixed);
3594 if (data.longest_float)
3595 SvREFCNT_dec(data.longest_float);
3596 if (data.last_found)
3597 SvREFCNT_dec(data.last_found);
3599 copyRExC_state=RExC_state;
3602 /* Dig out information for optimizations. */
3603 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3604 pm->op_pmflags = RExC_flags;
3606 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3607 r->regstclass = NULL;
3608 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3609 r->reganch |= ROPT_NAUGHTY;
3610 scan = r->program + 1; /* First BRANCH. */
3612 /* XXXX Should not we check for something else? Usually it is OPEN1... */
3613 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3615 STRLEN longest_float_length, longest_fixed_length;
3616 struct regnode_charclass_class ch_class; /* pointed to by data */
3618 I32 last_close = 0; /* pointed to by data */
3621 /* Skip introductions and multiplicators >= 1. */
3622 while ((OP(first) == OPEN && (sawopen = 1)) ||
3623 /* An OR of *one* alternative - should not happen now. */
3624 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3625 /* for now we can't handle lookbehind IFMATCH*/
3626 (OP(first) == IFMATCH && !first->flags) ||
3627 (OP(first) == PLUS) ||
3628 (OP(first) == MINMOD) ||
3629 /* An {n,m} with n>0 */
3630 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3633 if (OP(first) == PLUS)
3636 first += regarglen[OP(first)];
3637 if (OP(first) == IFMATCH) {
3638 first = NEXTOPER(first);
3639 first += EXTRA_STEP_2ARGS;
3640 } else /* XXX possible optimisation for /(?=)/ */
3641 first = NEXTOPER(first);
3644 /* Starting-point info. */
3646 DEBUG_PEEP("first:",first,0);
3647 /* Ignore EXACT as we deal with it later. */
3648 if (PL_regkind[OP(first)] == EXACT) {
3649 if (OP(first) == EXACT)
3650 NOOP; /* Empty, get anchored substr later. */
3651 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3652 r->regstclass = first;
3655 else if (PL_regkind[OP(first)] == TRIE &&
3656 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3659 /* this can happen only on restudy */
3660 if ( OP(first) == TRIE ) {
3661 struct regnode_1 *trieop;
3662 Newxz(trieop,1,struct regnode_1);
3663 StructCopy(first,trieop,struct regnode_1);
3664 trie_op=(regnode *)trieop;
3666 struct regnode_charclass *trieop;
3667 Newxz(trieop,1,struct regnode_charclass);
3668 StructCopy(first,trieop,struct regnode_charclass);
3669 trie_op=(regnode *)trieop;
3671 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3672 r->regstclass = trie_op;
3675 else if (strchr((const char*)PL_simple,OP(first)))
3676 r->regstclass = first;
3677 else if (PL_regkind[OP(first)] == BOUND ||
3678 PL_regkind[OP(first)] == NBOUND)
3679 r->regstclass = first;
3680 else if (PL_regkind[OP(first)] == BOL) {
3681 r->reganch |= (OP(first) == MBOL
3683 : (OP(first) == SBOL
3686 first = NEXTOPER(first);
3689 else if (OP(first) == GPOS) {
3690 r->reganch |= ROPT_ANCH_GPOS;
3691 first = NEXTOPER(first);
3694 else if (!sawopen && (OP(first) == STAR &&
3695 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3696 !(r->reganch & ROPT_ANCH) )
3698 /* turn .* into ^.* with an implied $*=1 */
3700 (OP(NEXTOPER(first)) == REG_ANY)
3703 r->reganch |= type | ROPT_IMPLICIT;
3704 first = NEXTOPER(first);
3707 if (sawplus && (!sawopen || !RExC_sawback)
3708 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3709 /* x+ must match at the 1st pos of run of x's */
3710 r->reganch |= ROPT_SKIP;
3712 /* Scan is after the zeroth branch, first is atomic matcher. */
3713 #ifdef TRIE_STUDY_OPT
3716 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3717 (IV)(first - scan + 1))
3721 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3722 (IV)(first - scan + 1))
3728 * If there's something expensive in the r.e., find the
3729 * longest literal string that must appear and make it the
3730 * regmust. Resolve ties in favor of later strings, since
3731 * the regstart check works with the beginning of the r.e.
3732 * and avoiding duplication strengthens checking. Not a
3733 * strong reason, but sufficient in the absence of others.
3734 * [Now we resolve ties in favor of the earlier string if
3735 * it happens that c_offset_min has been invalidated, since the
3736 * earlier string may buy us something the later one won't.]
3740 data.longest_fixed = newSVpvs("");
3741 data.longest_float = newSVpvs("");
3742 data.last_found = newSVpvs("");
3743 data.longest = &(data.longest_fixed);
3745 if (!r->regstclass) {
3746 cl_init(pRExC_state, &ch_class);
3747 data.start_class = &ch_class;
3748 stclass_flag = SCF_DO_STCLASS_AND;
3749 } else /* XXXX Check for BOUND? */
3751 data.last_closep = &last_close;
3753 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
3754 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3760 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3761 && data.last_start_min == 0 && data.last_end > 0
3762 && !RExC_seen_zerolen
3763 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3764 r->reganch |= ROPT_CHECK_ALL;
3765 scan_commit(pRExC_state, &data);
3766 SvREFCNT_dec(data.last_found);
3768 longest_float_length = CHR_SVLEN(data.longest_float);
3769 if (longest_float_length
3770 || (data.flags & SF_FL_BEFORE_EOL
3771 && (!(data.flags & SF_FL_BEFORE_MEOL)
3772 || (RExC_flags & PMf_MULTILINE)))) {
3775 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
3776 && data.offset_fixed == data.offset_float_min
3777 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3778 goto remove_float; /* As in (a)+. */
3780 if (SvUTF8(data.longest_float)) {
3781 r->float_utf8 = data.longest_float;
3782 r->float_substr = NULL;
3784 r->float_substr = data.longest_float;
3785 r->float_utf8 = NULL;
3787 r->float_min_offset = data.offset_float_min;
3788 r->float_max_offset = data.offset_float_max;
3789 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3790 && (!(data.flags & SF_FL_BEFORE_MEOL)
3791 || (RExC_flags & PMf_MULTILINE)));
3792 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
3796 r->float_substr = r->float_utf8 = NULL;
3797 SvREFCNT_dec(data.longest_float);
3798 longest_float_length = 0;
3801 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
3802 if (longest_fixed_length
3803 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3804 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3805 || (RExC_flags & PMf_MULTILINE)))) {
3808 if (SvUTF8(data.longest_fixed)) {
3809 r->anchored_utf8 = data.longest_fixed;
3810 r->anchored_substr = NULL;
3812 r->anchored_substr = data.longest_fixed;
3813 r->anchored_utf8 = NULL;
3815 r->anchored_offset = data.offset_fixed;
3816 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3817 && (!(data.flags & SF_FIX_BEFORE_MEOL)
3818 || (RExC_flags & PMf_MULTILINE)));
3819 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
3822 r->anchored_substr = r->anchored_utf8 = NULL;
3823 SvREFCNT_dec(data.longest_fixed);
3824 longest_fixed_length = 0;
3827 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
3828 r->regstclass = NULL;
3829 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3831 && !(data.start_class->flags & ANYOF_EOS)
3832 && !cl_is_anything(data.start_class))
3834 const I32 n = add_data(pRExC_state, 1, "f");
3836 Newx(RExC_rx->data->data[n], 1,
3837 struct regnode_charclass_class);
3838 StructCopy(data.start_class,
3839 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3840 struct regnode_charclass_class);
3841 r->regstclass = (regnode*)RExC_rx->data->data[n];
3842 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3843 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
3844 regprop(r, sv, (regnode*)data.start_class);
3845 PerlIO_printf(Perl_debug_log,
3846 "synthetic stclass \"%s\".\n",
3847 SvPVX_const(sv));});
3850 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
3851 if (longest_fixed_length > longest_float_length) {
3852 r->check_substr = r->anchored_substr;
3853 r->check_utf8 = r->anchored_utf8;
3854 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3855 if (r->reganch & ROPT_ANCH_SINGLE)
3856 r->reganch |= ROPT_NOSCAN;
3859 r->check_substr = r->float_substr;
3860 r->check_utf8 = r->float_utf8;
3861 r->check_offset_min = data.offset_float_min;
3862 r->check_offset_max = data.offset_float_max;
3864 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3865 This should be changed ASAP! */
3866 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
3867 r->reganch |= RE_USE_INTUIT;
3868 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
3869 r->reganch |= RE_INTUIT_TAIL;
3873 /* Several toplevels. Best we can is to set minlen. */
3875 struct regnode_charclass_class ch_class;
3878 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
3880 scan = r->program + 1;
3881 cl_init(pRExC_state, &ch_class);
3882 data.start_class = &ch_class;
3883 data.last_closep = &last_close;
3885 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3886 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3890 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3891 = r->float_substr = r->float_utf8 = NULL;
3892 if (!(data.start_class->flags & ANYOF_EOS)
3893 && !cl_is_anything(data.start_class))
3895 const I32 n = add_data(pRExC_state, 1, "f");
3897 Newx(RExC_rx->data->data[n], 1,
3898 struct regnode_charclass_class);
3899 StructCopy(data.start_class,
3900 (struct regnode_charclass_class*)RExC_rx->data->data[n],
3901 struct regnode_charclass_class);
3902 r->regstclass = (regnode*)RExC_rx->data->data[n];
3903 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
3904 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
3905 regprop(r, sv, (regnode*)data.start_class);
3906 PerlIO_printf(Perl_debug_log,
3907 "synthetic stclass \"%s\".\n",
3908 SvPVX_const(sv));});
3913 if (RExC_seen & REG_SEEN_GPOS)
3914 r->reganch |= ROPT_GPOS_SEEN;
3915 if (RExC_seen & REG_SEEN_LOOKBEHIND)
3916 r->reganch |= ROPT_LOOKBEHIND_SEEN;
3917 if (RExC_seen & REG_SEEN_EVAL)
3918 r->reganch |= ROPT_EVAL_SEEN;
3919 if (RExC_seen & REG_SEEN_CANY)
3920 r->reganch |= ROPT_CANY_SEEN;
3921 Newxz(r->startp, RExC_npar, I32);
3922 Newxz(r->endp, RExC_npar, I32);
3924 DEBUG_r( RX_DEBUG_on(r) );
3926 PerlIO_printf(Perl_debug_log,"Final program:\n");
3929 DEBUG_OFFSETS_r(if (r->offsets) {
3930 const U32 len = r->offsets[0];
3932 GET_RE_DEBUG_FLAGS_DECL;
3933 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3934 for (i = 1; i <= len; i++) {
3935 if (r->offsets[i*2-1] || r->offsets[i*2])
3936 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3937 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3939 PerlIO_printf(Perl_debug_log, "\n");
3945 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3946 int rem=(int)(RExC_end - RExC_parse); \
3955 if (RExC_lastparse!=RExC_parse) \
3956 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3959 iscut ? "..." : "<" \
3962 PerlIO_printf(Perl_debug_log,"%16s",""); \
3967 num=REG_NODE_NUM(RExC_emit); \
3968 if (RExC_lastnum!=num) \
3969 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3971 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3972 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3973 (int)((depth*2)), "", \
3977 RExC_lastparse=RExC_parse; \
3982 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3983 DEBUG_PARSE_MSG((funcname)); \
3984 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3987 - reg - regular expression, i.e. main body or parenthesized thing
3989 * Caller must absorb opening parenthesis.
3991 * Combining parenthesis handling with the base level of regular expression
3992 * is a trifle forced, but the need to tie the tails of the branches to what
3993 * follows makes it hard to avoid.
3995 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3997 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3999 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4003 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4004 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4007 register regnode *ret; /* Will be the head of the group. */
4008 register regnode *br;
4009 register regnode *lastbr;
4010 register regnode *ender = NULL;
4011 register I32 parno = 0;
4013 const I32 oregflags = RExC_flags;
4014 bool have_branch = 0;
4017 /* for (?g), (?gc), and (?o) warnings; warning
4018 about (?c) will warn about (?g) -- japhy */
4020 #define WASTED_O 0x01
4021 #define WASTED_G 0x02
4022 #define WASTED_C 0x04
4023 #define WASTED_GC (0x02|0x04)
4024 I32 wastedflags = 0x00;
4026 char * parse_start = RExC_parse; /* MJD */
4027 char * const oregcomp_parse = RExC_parse;
4029 GET_RE_DEBUG_FLAGS_DECL;
4030 DEBUG_PARSE("reg ");
4033 *flagp = 0; /* Tentatively. */
4036 /* Make an OPEN node, if parenthesized. */
4038 if (*RExC_parse == '?') { /* (?...) */
4039 U32 posflags = 0, negflags = 0;
4040 U32 *flagsp = &posflags;
4041 bool is_logical = 0;
4042 const char * const seqstart = RExC_parse;
4045 paren = *RExC_parse++;
4046 ret = NULL; /* For look-ahead/behind. */
4048 case '<': /* (?<...) */
4049 RExC_seen |= REG_SEEN_LOOKBEHIND;
4050 if (*RExC_parse == '!')
4052 if (*RExC_parse != '=' && *RExC_parse != '!')
4055 case '=': /* (?=...) */
4056 case '!': /* (?!...) */
4057 RExC_seen_zerolen++;
4058 case ':': /* (?:...) */
4059 case '>': /* (?>...) */
4061 case '$': /* (?$...) */
4062 case '@': /* (?@...) */
4063 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4065 case '#': /* (?#...) */
4066 while (*RExC_parse && *RExC_parse != ')')
4068 if (*RExC_parse != ')')
4069 FAIL("Sequence (?#... not terminated");
4070 nextchar(pRExC_state);
4073 case 'p': /* (?p...) */
4074 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
4075 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
4077 case '?': /* (??...) */
4079 if (*RExC_parse != '{')
4081 paren = *RExC_parse++;
4083 case '{': /* (?{...}) */
4085 I32 count = 1, n = 0;
4087 char *s = RExC_parse;
4089 RExC_seen_zerolen++;
4090 RExC_seen |= REG_SEEN_EVAL;
4091 while (count && (c = *RExC_parse)) {
4102 if (*RExC_parse != ')') {
4104 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4108 OP_4tree *sop, *rop;
4109 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
4112 Perl_save_re_context(aTHX);
4113 rop = sv_compile_2op(sv, &sop, "re", &pad);
4114 sop->op_private |= OPpREFCOUNTED;
4115 /* re_dup will OpREFCNT_inc */
4116 OpREFCNT_set(sop, 1);
4119 n = add_data(pRExC_state, 3, "nop");
4120 RExC_rx->data->data[n] = (void*)rop;
4121 RExC_rx->data->data[n+1] = (void*)sop;
4122 RExC_rx->data->data[n+2] = (void*)pad;
4125 else { /* First pass */
4126 if (PL_reginterp_cnt < ++RExC_seen_evals
4128 /* No compiled RE interpolated, has runtime
4129 components ===> unsafe. */
4130 FAIL("Eval-group not allowed at runtime, use re 'eval'");
4131 if (PL_tainting && PL_tainted)
4132 FAIL("Eval-group in insecure regular expression");
4133 #if PERL_VERSION > 8
4134 if (IN_PERL_COMPILETIME)
4139 nextchar(pRExC_state);
4141 ret = reg_node(pRExC_state, LOGICAL);
4144 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
4145 /* deal with the length of this later - MJD */
4148 ret = reganode(pRExC_state, EVAL, n);
4149 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4150 Set_Node_Offset(ret, parse_start);
4153 case '(': /* (?(?{...})...) and (?(?=...)...) */
4155 if (RExC_parse[0] == '?') { /* (?(?...)) */
4156 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4157 || RExC_parse[1] == '<'
4158 || RExC_parse[1] == '{') { /* Lookahead or eval. */
4161 ret = reg_node(pRExC_state, LOGICAL);
4164 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
4168 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
4171 parno = atoi(RExC_parse++);
4173 while (isDIGIT(*RExC_parse))
4175 ret = reganode(pRExC_state, GROUPP, parno);
4177 if ((c = *nextchar(pRExC_state)) != ')')
4178 vFAIL("Switch condition not recognized");
4180 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4181 br = regbranch(pRExC_state, &flags, 1,depth+1);
4183 br = reganode(pRExC_state, LONGJMP, 0);
4185 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
4186 c = *nextchar(pRExC_state);
4190 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
4191 regbranch(pRExC_state, &flags, 1,depth+1);
4192 REGTAIL(pRExC_state, ret, lastbr);
4195 c = *nextchar(pRExC_state);
4200 vFAIL("Switch (?(condition)... contains too many branches");
4201 ender = reg_node(pRExC_state, TAIL);
4202 REGTAIL(pRExC_state, br, ender);
4204 REGTAIL(pRExC_state, lastbr, ender);
4205 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
4208 REGTAIL(pRExC_state, ret, ender);
4212 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
4216 RExC_parse--; /* for vFAIL to print correctly */
4217 vFAIL("Sequence (? incomplete");
4221 parse_flags: /* (?i) */
4222 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
4223 /* (?g), (?gc) and (?o) are useless here
4224 and must be globally applied -- japhy */
4226 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4227 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4228 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
4229 if (! (wastedflags & wflagbit) ) {
4230 wastedflags |= wflagbit;
4233 "Useless (%s%c) - %suse /%c modifier",
4234 flagsp == &negflags ? "?-" : "?",
4236 flagsp == &negflags ? "don't " : "",
4242 else if (*RExC_parse == 'c') {
4243 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4244 if (! (wastedflags & WASTED_C) ) {
4245 wastedflags |= WASTED_GC;
4248 "Useless (%sc) - %suse /gc modifier",
4249 flagsp == &negflags ? "?-" : "?",
4250 flagsp == &negflags ? "don't " : ""
4255 else { pmflag(flagsp, *RExC_parse); }
4259 if (*RExC_parse == '-') {
4261 wastedflags = 0; /* reset so (?g-c) warns twice */
4265 RExC_flags |= posflags;
4266 RExC_flags &= ~negflags;
4267 if (*RExC_parse == ':') {
4273 if (*RExC_parse != ')') {
4275 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4277 nextchar(pRExC_state);
4285 ret = reganode(pRExC_state, OPEN, parno);
4286 Set_Node_Length(ret, 1); /* MJD */
4287 Set_Node_Offset(ret, RExC_parse); /* MJD */
4294 /* Pick up the branches, linking them together. */
4295 parse_start = RExC_parse; /* MJD */
4296 br = regbranch(pRExC_state, &flags, 1,depth+1);
4297 /* branch_len = (paren != 0); */
4301 if (*RExC_parse == '|') {
4302 if (!SIZE_ONLY && RExC_extralen) {
4303 reginsert(pRExC_state, BRANCHJ, br);
4306 reginsert(pRExC_state, BRANCH, br);
4307 Set_Node_Length(br, paren != 0);
4308 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4312 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4314 else if (paren == ':') {
4315 *flagp |= flags&SIMPLE;
4317 if (is_open) { /* Starts with OPEN. */
4318 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4320 else if (paren != '?') /* Not Conditional */
4322 *flagp |= flags & (SPSTART | HASWIDTH);
4324 while (*RExC_parse == '|') {
4325 if (!SIZE_ONLY && RExC_extralen) {
4326 ender = reganode(pRExC_state, LONGJMP,0);
4327 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4330 RExC_extralen += 2; /* Account for LONGJMP. */
4331 nextchar(pRExC_state);
4332 br = regbranch(pRExC_state, &flags, 0, depth+1);
4336 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4340 *flagp |= flags&SPSTART;
4343 if (have_branch || paren != ':') {
4344 /* Make a closing node, and hook it on the end. */
4347 ender = reg_node(pRExC_state, TAIL);
4350 ender = reganode(pRExC_state, CLOSE, parno);
4351 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4352 Set_Node_Length(ender,1); /* MJD */
4358 *flagp &= ~HASWIDTH;
4361 ender = reg_node(pRExC_state, SUCCEED);
4364 ender = reg_node(pRExC_state, END);
4367 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4369 if (have_branch && !SIZE_ONLY) {
4370 /* Hook the tails of the branches to the closing node. */
4371 for (br = ret; br; br = regnext(br)) {
4372 const U8 op = PL_regkind[OP(br)];
4374 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4376 else if (op == BRANCHJ) {
4377 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4385 static const char parens[] = "=!<,>";
4387 if (paren && (p = strchr(parens, paren))) {
4388 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4389 int flag = (p - parens) > 1;
4392 node = SUSPEND, flag = 0;
4393 reginsert(pRExC_state, node,ret);
4394 Set_Node_Cur_Length(ret);
4395 Set_Node_Offset(ret, parse_start + 1);
4397 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4401 /* Check for proper termination. */
4403 RExC_flags = oregflags;
4404 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4405 RExC_parse = oregcomp_parse;
4406 vFAIL("Unmatched (");
4409 else if (!paren && RExC_parse < RExC_end) {
4410 if (*RExC_parse == ')') {
4412 vFAIL("Unmatched )");
4415 FAIL("Junk on end of regexp"); /* "Can't happen". */
4423 - regbranch - one alternative of an | operator
4425 * Implements the concatenation operator.
4428 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4431 register regnode *ret;
4432 register regnode *chain = NULL;
4433 register regnode *latest;
4434 I32 flags = 0, c = 0;
4435 GET_RE_DEBUG_FLAGS_DECL;
4436 DEBUG_PARSE("brnc");
4440 if (!SIZE_ONLY && RExC_extralen)
4441 ret = reganode(pRExC_state, BRANCHJ,0);
4443 ret = reg_node(pRExC_state, BRANCH);
4444 Set_Node_Length(ret, 1);
4448 if (!first && SIZE_ONLY)
4449 RExC_extralen += 1; /* BRANCHJ */
4451 *flagp = WORST; /* Tentatively. */
4454 nextchar(pRExC_state);
4455 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4457 latest = regpiece(pRExC_state, &flags,depth+1);
4458 if (latest == NULL) {
4459 if (flags & TRYAGAIN)
4463 else if (ret == NULL)
4465 *flagp |= flags&HASWIDTH;
4466 if (chain == NULL) /* First piece. */
4467 *flagp |= flags&SPSTART;
4470 REGTAIL(pRExC_state, chain, latest);
4475 if (chain == NULL) { /* Loop ran zero times. */
4476 chain = reg_node(pRExC_state, NOTHING);
4481 *flagp |= flags&SIMPLE;
4488 - regpiece - something followed by possible [*+?]
4490 * Note that the branching code sequences used for ? and the general cases
4491 * of * and + are somewhat optimized: they use the same NOTHING node as
4492 * both the endmarker for their branch list and the body of the last branch.
4493 * It might seem that this node could be dispensed with entirely, but the
4494 * endmarker role is not redundant.
4497 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4500 register regnode *ret;
4502 register char *next;
4504 const char * const origparse = RExC_parse;
4506 I32 max = REG_INFTY;
4508 const char *maxpos = NULL;
4509 GET_RE_DEBUG_FLAGS_DECL;
4510 DEBUG_PARSE("piec");
4512 ret = regatom(pRExC_state, &flags,depth+1);
4514 if (flags & TRYAGAIN)
4521 if (op == '{' && regcurly(RExC_parse)) {
4523 parse_start = RExC_parse; /* MJD */
4524 next = RExC_parse + 1;
4525 while (isDIGIT(*next) || *next == ',') {
4534 if (*next == '}') { /* got one */
4538 min = atoi(RExC_parse);
4542 maxpos = RExC_parse;
4544 if (!max && *maxpos != '0')
4545 max = REG_INFTY; /* meaning "infinity" */
4546 else if (max >= REG_INFTY)
4547 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4549 nextchar(pRExC_state);
4552 if ((flags&SIMPLE)) {
4553 RExC_naughty += 2 + RExC_naughty / 2;
4554 reginsert(pRExC_state, CURLY, ret);
4555 Set_Node_Offset(ret, parse_start+1); /* MJD */
4556 Set_Node_Cur_Length(ret);
4559 regnode * const w = reg_node(pRExC_state, WHILEM);
4562 REGTAIL(pRExC_state, ret, w);
4563 if (!SIZE_ONLY && RExC_extralen) {
4564 reginsert(pRExC_state, LONGJMP,ret);
4565 reginsert(pRExC_state, NOTHING,ret);
4566 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4568 reginsert(pRExC_state, CURLYX,ret);
4570 Set_Node_Offset(ret, parse_start+1);
4571 Set_Node_Length(ret,
4572 op == '{' ? (RExC_parse - parse_start) : 1);
4574 if (!SIZE_ONLY && RExC_extralen)
4575 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4576 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4578 RExC_whilem_seen++, RExC_extralen += 3;
4579 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4587 if (max && max < min)
4588 vFAIL("Can't do {n,m} with n > m");
4590 ARG1_SET(ret, (U16)min);
4591 ARG2_SET(ret, (U16)max);
4603 #if 0 /* Now runtime fix should be reliable. */
4605 /* if this is reinstated, don't forget to put this back into perldiag:
4607 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4609 (F) The part of the regexp subject to either the * or + quantifier
4610 could match an empty string. The {#} shows in the regular
4611 expression about where the problem was discovered.
4615 if (!(flags&HASWIDTH) && op != '?')
4616 vFAIL("Regexp *+ operand could be empty");
4619 parse_start = RExC_parse;
4620 nextchar(pRExC_state);
4622 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4624 if (op == '*' && (flags&SIMPLE)) {
4625 reginsert(pRExC_state, STAR, ret);
4629 else if (op == '*') {
4633 else if (op == '+' && (flags&SIMPLE)) {
4634 reginsert(pRExC_state, PLUS, ret);
4638 else if (op == '+') {
4642 else if (op == '?') {
4647 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4649 "%.*s matches null string many times",
4650 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4654 if (*RExC_parse == '?') {
4655 nextchar(pRExC_state);
4656 reginsert(pRExC_state, MINMOD, ret);
4657 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4659 if (ISMULT2(RExC_parse)) {
4661 vFAIL("Nested quantifiers");
4668 - regatom - the lowest level
4670 * Optimization: gobbles an entire sequence of ordinary characters so that
4671 * it can turn them into a single node, which is smaller to store and
4672 * faster to run. Backslashed characters are exceptions, each becoming a
4673 * separate node; the code is simpler that way and it's not worth fixing.
4675 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4676 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4679 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4682 register regnode *ret = NULL;
4684 char *parse_start = RExC_parse;
4685 GET_RE_DEBUG_FLAGS_DECL;
4686 DEBUG_PARSE("atom");
4687 *flagp = WORST; /* Tentatively. */
4690 switch (*RExC_parse) {
4692 RExC_seen_zerolen++;
4693 nextchar(pRExC_state);
4694 if (RExC_flags & PMf_MULTILINE)
4695 ret = reg_node(pRExC_state, MBOL);
4696 else if (RExC_flags & PMf_SINGLELINE)
4697 ret = reg_node(pRExC_state, SBOL);
4699 ret = reg_node(pRExC_state, BOL);
4700 Set_Node_Length(ret, 1); /* MJD */
4703 nextchar(pRExC_state);
4705 RExC_seen_zerolen++;
4706 if (RExC_flags & PMf_MULTILINE)
4707 ret = reg_node(pRExC_state, MEOL);
4708 else if (RExC_flags & PMf_SINGLELINE)
4709 ret = reg_node(pRExC_state, SEOL);
4711 ret = reg_node(pRExC_state, EOL);
4712 Set_Node_Length(ret, 1); /* MJD */
4715 nextchar(pRExC_state);
4716 if (RExC_flags & PMf_SINGLELINE)
4717 ret = reg_node(pRExC_state, SANY);
4719 ret = reg_node(pRExC_state, REG_ANY);
4720 *flagp |= HASWIDTH|SIMPLE;
4722 Set_Node_Length(ret, 1); /* MJD */
4726 char * const oregcomp_parse = ++RExC_parse;
4727 ret = regclass(pRExC_state,depth+1);
4728 if (*RExC_parse != ']') {
4729 RExC_parse = oregcomp_parse;
4730 vFAIL("Unmatched [");
4732 nextchar(pRExC_state);
4733 *flagp |= HASWIDTH|SIMPLE;
4734 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
4738 nextchar(pRExC_state);
4739 ret = reg(pRExC_state, 1, &flags,depth+1);
4741 if (flags & TRYAGAIN) {
4742 if (RExC_parse == RExC_end) {
4743 /* Make parent create an empty node if needed. */
4751 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
4755 if (flags & TRYAGAIN) {
4759 vFAIL("Internal urp");
4760 /* Supposed to be caught earlier. */
4763 if (!regcurly(RExC_parse)) {
4772 vFAIL("Quantifier follows nothing");
4775 switch (*++RExC_parse) {
4777 RExC_seen_zerolen++;
4778 ret = reg_node(pRExC_state, SBOL);
4780 nextchar(pRExC_state);
4781 Set_Node_Length(ret, 2); /* MJD */
4784 ret = reg_node(pRExC_state, GPOS);
4785 RExC_seen |= REG_SEEN_GPOS;
4787 nextchar(pRExC_state);
4788 Set_Node_Length(ret, 2); /* MJD */
4791 ret = reg_node(pRExC_state, SEOL);
4793 RExC_seen_zerolen++; /* Do not optimize RE away */
4794 nextchar(pRExC_state);
4797 ret = reg_node(pRExC_state, EOS);
4799 RExC_seen_zerolen++; /* Do not optimize RE away */
4800 nextchar(pRExC_state);
4801 Set_Node_Length(ret, 2); /* MJD */
4804 ret = reg_node(pRExC_state, CANY);
4805 RExC_seen |= REG_SEEN_CANY;
4806 *flagp |= HASWIDTH|SIMPLE;
4807 nextchar(pRExC_state);
4808 Set_Node_Length(ret, 2); /* MJD */
4811 ret = reg_node(pRExC_state, CLUMP);
4813 nextchar(pRExC_state);
4814 Set_Node_Length(ret, 2); /* MJD */
4817 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
4818 *flagp |= HASWIDTH|SIMPLE;
4819 nextchar(pRExC_state);
4820 Set_Node_Length(ret, 2); /* MJD */
4823 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
4824 *flagp |= HASWIDTH|SIMPLE;
4825 nextchar(pRExC_state);
4826 Set_Node_Length(ret, 2); /* MJD */
4829 RExC_seen_zerolen++;
4830 RExC_seen |= REG_SEEN_LOOKBEHIND;
4831 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
4833 nextchar(pRExC_state);
4834 Set_Node_Length(ret, 2); /* MJD */
4837 RExC_seen_zerolen++;
4838 RExC_seen |= REG_SEEN_LOOKBEHIND;
4839 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
4841 nextchar(pRExC_state);
4842 Set_Node_Length(ret, 2); /* MJD */
4845 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
4846 *flagp |= HASWIDTH|SIMPLE;
4847 nextchar(pRExC_state);
4848 Set_Node_Length(ret, 2); /* MJD */
4851 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
4852 *flagp |= HASWIDTH|SIMPLE;
4853 nextchar(pRExC_state);
4854 Set_Node_Length(ret, 2); /* MJD */
4857 ret = reg_node(pRExC_state, DIGIT);
4858 *flagp |= HASWIDTH|SIMPLE;
4859 nextchar(pRExC_state);
4860 Set_Node_Length(ret, 2); /* MJD */
4863 ret = reg_node(pRExC_state, NDIGIT);
4864 *flagp |= HASWIDTH|SIMPLE;
4865 nextchar(pRExC_state);
4866 Set_Node_Length(ret, 2); /* MJD */
4871 char* const oldregxend = RExC_end;
4872 char* parse_start = RExC_parse - 2;
4874 if (RExC_parse[1] == '{') {
4875 /* a lovely hack--pretend we saw [\pX] instead */
4876 RExC_end = strchr(RExC_parse, '}');
4878 const U8 c = (U8)*RExC_parse;
4880 RExC_end = oldregxend;
4881 vFAIL2("Missing right brace on \\%c{}", c);
4886 RExC_end = RExC_parse + 2;
4887 if (RExC_end > oldregxend)
4888 RExC_end = oldregxend;
4892 ret = regclass(pRExC_state,depth+1);
4894 RExC_end = oldregxend;
4897 Set_Node_Offset(ret, parse_start + 2);
4898 Set_Node_Cur_Length(ret);
4899 nextchar(pRExC_state);
4900 *flagp |= HASWIDTH|SIMPLE;
4913 case '1': case '2': case '3': case '4':
4914 case '5': case '6': case '7': case '8': case '9':
4916 const I32 num = atoi(RExC_parse);
4918 if (num > 9 && num >= RExC_npar)
4921 char * const parse_start = RExC_parse - 1; /* MJD */
4922 while (isDIGIT(*RExC_parse))
4925 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
4926 vFAIL("Reference to nonexistent group");
4928 ret = reganode(pRExC_state,
4929 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4933 /* override incorrect value set in reganode MJD */
4934 Set_Node_Offset(ret, parse_start+1);
4935 Set_Node_Cur_Length(ret); /* MJD */
4937 nextchar(pRExC_state);
4942 if (RExC_parse >= RExC_end)
4943 FAIL("Trailing \\");
4946 /* Do not generate "unrecognized" warnings here, we fall
4947 back into the quick-grab loop below */
4954 if (RExC_flags & PMf_EXTENDED) {
4955 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4957 if (RExC_parse < RExC_end)
4963 register STRLEN len;
4968 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
4970 parse_start = RExC_parse - 1;
4976 ret = reg_node(pRExC_state,
4977 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
4979 for (len = 0, p = RExC_parse - 1;
4980 len < 127 && p < RExC_end;
4983 char * const oldp = p;
4985 if (RExC_flags & PMf_EXTENDED)
4986 p = regwhite(p, RExC_end);
5033 ender = ASCII_TO_NATIVE('\033');
5037 ender = ASCII_TO_NATIVE('\007');
5042 char* const e = strchr(p, '}');
5046 vFAIL("Missing right brace on \\x{}");
5049 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5050 | PERL_SCAN_DISALLOW_PREFIX;
5051 STRLEN numlen = e - p - 1;
5052 ender = grok_hex(p + 1, &numlen, &flags, NULL);
5059 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5061 ender = grok_hex(p, &numlen, &flags, NULL);
5067 ender = UCHARAT(p++);
5068 ender = toCTRL(ender);
5070 case '0': case '1': case '2': case '3':case '4':
5071 case '5': case '6': case '7': case '8':case '9':
5073 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
5076 ender = grok_oct(p, &numlen, &flags, NULL);
5086 FAIL("Trailing \\");
5089 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
5090 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
5091 goto normal_default;
5096 if (UTF8_IS_START(*p) && UTF) {
5098 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
5099 &numlen, UTF8_ALLOW_DEFAULT);
5106 if (RExC_flags & PMf_EXTENDED)
5107 p = regwhite(p, RExC_end);
5109 /* Prime the casefolded buffer. */
5110 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
5112 if (ISMULT2(p)) { /* Back off on ?+*. */
5117 /* Emit all the Unicode characters. */
5119 for (foldbuf = tmpbuf;
5121 foldlen -= numlen) {
5122 ender = utf8_to_uvchr(foldbuf, &numlen);
5124 const STRLEN unilen = reguni(pRExC_state, ender, s);
5127 /* In EBCDIC the numlen
5128 * and unilen can differ. */
5130 if (numlen >= foldlen)
5134 break; /* "Can't happen." */
5138 const STRLEN unilen = reguni(pRExC_state, ender, s);
5147 REGC((char)ender, s++);
5153 /* Emit all the Unicode characters. */
5155 for (foldbuf = tmpbuf;
5157 foldlen -= numlen) {
5158 ender = utf8_to_uvchr(foldbuf, &numlen);
5160 const STRLEN unilen = reguni(pRExC_state, ender, s);
5163 /* In EBCDIC the numlen
5164 * and unilen can differ. */
5166 if (numlen >= foldlen)
5174 const STRLEN unilen = reguni(pRExC_state, ender, s);
5183 REGC((char)ender, s++);
5187 Set_Node_Cur_Length(ret); /* MJD */
5188 nextchar(pRExC_state);
5190 /* len is STRLEN which is unsigned, need to copy to signed */
5193 vFAIL("Internal disaster");
5197 if (len == 1 && UNI_IS_INVARIANT(ender))
5201 RExC_size += STR_SZ(len);
5204 RExC_emit += STR_SZ(len);
5210 /* If the encoding pragma is in effect recode the text of
5211 * any EXACT-kind nodes. */
5212 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
5213 const STRLEN oldlen = STR_LEN(ret);
5214 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
5218 if (sv_utf8_downgrade(sv, TRUE)) {
5219 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5220 const STRLEN newlen = SvCUR(sv);
5225 GET_RE_DEBUG_FLAGS_DECL;
5226 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
5227 (int)oldlen, STRING(ret),
5229 Copy(s, STRING(ret), newlen, char);
5230 STR_LEN(ret) += newlen - oldlen;
5231 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5233 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5241 S_regwhite(char *p, const char *e)
5246 else if (*p == '#') {
5249 } while (p < e && *p != '\n');
5257 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5258 Character classes ([:foo:]) can also be negated ([:^foo:]).
5259 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5260 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
5261 but trigger failures because they are currently unimplemented. */
5263 #define POSIXCC_DONE(c) ((c) == ':')
5264 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5265 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5268 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5271 I32 namedclass = OOB_NAMEDCLASS;
5273 if (value == '[' && RExC_parse + 1 < RExC_end &&
5274 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5275 POSIXCC(UCHARAT(RExC_parse))) {
5276 const char c = UCHARAT(RExC_parse);
5277 char* const s = RExC_parse++;
5279 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5281 if (RExC_parse == RExC_end)
5282 /* Grandfather lone [:, [=, [. */
5285 const char* const t = RExC_parse++; /* skip over the c */
5288 if (UCHARAT(RExC_parse) == ']') {
5289 const char *posixcc = s + 1;
5290 RExC_parse++; /* skip over the ending ] */
5293 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5294 const I32 skip = t - posixcc;
5296 /* Initially switch on the length of the name. */
5299 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5300 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5303 /* Names all of length 5. */
5304 /* alnum alpha ascii blank cntrl digit graph lower
5305 print punct space upper */
5306 /* Offset 4 gives the best switch position. */
5307 switch (posixcc[4]) {
5309 if (memEQ(posixcc, "alph", 4)) /* alpha */
5310 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5313 if (memEQ(posixcc, "spac", 4)) /* space */
5314 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5317 if (memEQ(posixcc, "grap", 4)) /* graph */
5318 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5321 if (memEQ(posixcc, "asci", 4)) /* ascii */
5322 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5325 if (memEQ(posixcc, "blan", 4)) /* blank */
5326 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5329 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5330 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5333 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5334 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5337 if (memEQ(posixcc, "lowe", 4)) /* lower */
5338 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5339 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5340 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5343 if (memEQ(posixcc, "digi", 4)) /* digit */
5344 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5345 else if (memEQ(posixcc, "prin", 4)) /* print */
5346 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5347 else if (memEQ(posixcc, "punc", 4)) /* punct */
5348 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5353 if (memEQ(posixcc, "xdigit", 6))
5354 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5358 if (namedclass == OOB_NAMEDCLASS)
5359 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5361 assert (posixcc[skip] == ':');
5362 assert (posixcc[skip+1] == ']');
5363 } else if (!SIZE_ONLY) {
5364 /* [[=foo=]] and [[.foo.]] are still future. */
5366 /* adjust RExC_parse so the warning shows after
5368 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5370 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5373 /* Maternal grandfather:
5374 * "[:" ending in ":" but not in ":]" */
5384 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5387 if (POSIXCC(UCHARAT(RExC_parse))) {
5388 const char *s = RExC_parse;
5389 const char c = *s++;
5393 if (*s && c == *s && s[1] == ']') {
5394 if (ckWARN(WARN_REGEXP))
5396 "POSIX syntax [%c %c] belongs inside character classes",
5399 /* [[=foo=]] and [[.foo.]] are still future. */
5400 if (POSIXCC_NOTYET(c)) {
5401 /* adjust RExC_parse so the error shows after
5403 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5405 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5413 parse a class specification and produce either an ANYOF node that
5414 matches the pattern. If the pattern matches a single char only and
5415 that char is < 256 then we produce an EXACT node instead.
5418 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5422 register UV nextvalue;
5423 register IV prevvalue = OOB_UNICODE;
5424 register IV range = 0;
5425 register regnode *ret;
5428 char *rangebegin = NULL;
5429 bool need_class = 0;
5432 bool optimize_invert = TRUE;
5433 AV* unicode_alternate = NULL;
5435 UV literal_endpoint = 0;
5437 UV stored = 0; /* number of chars stored in the class */
5439 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5440 case we need to change the emitted regop to an EXACT. */
5441 const char * orig_parse = RExC_parse;
5442 GET_RE_DEBUG_FLAGS_DECL;
5444 PERL_UNUSED_ARG(depth);
5447 DEBUG_PARSE("clas");
5449 /* Assume we are going to generate an ANYOF node. */
5450 ret = reganode(pRExC_state, ANYOF, 0);
5453 ANYOF_FLAGS(ret) = 0;
5455 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5459 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5463 RExC_size += ANYOF_SKIP;
5464 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5467 RExC_emit += ANYOF_SKIP;
5469 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5471 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5472 ANYOF_BITMAP_ZERO(ret);
5473 listsv = newSVpvs("# comment\n");
5476 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5478 if (!SIZE_ONLY && POSIXCC(nextvalue))
5479 checkposixcc(pRExC_state);
5481 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5482 if (UCHARAT(RExC_parse) == ']')
5485 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5489 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5492 rangebegin = RExC_parse;
5494 value = utf8n_to_uvchr((U8*)RExC_parse,
5495 RExC_end - RExC_parse,
5496 &numlen, UTF8_ALLOW_DEFAULT);
5497 RExC_parse += numlen;
5500 value = UCHARAT(RExC_parse++);
5502 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5503 if (value == '[' && POSIXCC(nextvalue))
5504 namedclass = regpposixcc(pRExC_state, value);
5505 else if (value == '\\') {
5507 value = utf8n_to_uvchr((U8*)RExC_parse,
5508 RExC_end - RExC_parse,
5509 &numlen, UTF8_ALLOW_DEFAULT);
5510 RExC_parse += numlen;
5513 value = UCHARAT(RExC_parse++);
5514 /* Some compilers cannot handle switching on 64-bit integer
5515 * values, therefore value cannot be an UV. Yes, this will
5516 * be a problem later if we want switch on Unicode.
5517 * A similar issue a little bit later when switching on
5518 * namedclass. --jhi */
5519 switch ((I32)value) {
5520 case 'w': namedclass = ANYOF_ALNUM; break;
5521 case 'W': namedclass = ANYOF_NALNUM; break;
5522 case 's': namedclass = ANYOF_SPACE; break;
5523 case 'S': namedclass = ANYOF_NSPACE; break;
5524 case 'd': namedclass = ANYOF_DIGIT; break;
5525 case 'D': namedclass = ANYOF_NDIGIT; break;
5530 if (RExC_parse >= RExC_end)
5531 vFAIL2("Empty \\%c{}", (U8)value);
5532 if (*RExC_parse == '{') {
5533 const U8 c = (U8)value;
5534 e = strchr(RExC_parse++, '}');
5536 vFAIL2("Missing right brace on \\%c{}", c);
5537 while (isSPACE(UCHARAT(RExC_parse)))
5539 if (e == RExC_parse)
5540 vFAIL2("Empty \\%c{}", c);
5542 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5550 if (UCHARAT(RExC_parse) == '^') {
5553 value = value == 'p' ? 'P' : 'p'; /* toggle */
5554 while (isSPACE(UCHARAT(RExC_parse))) {
5559 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5560 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5563 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5564 namedclass = ANYOF_MAX; /* no official name, but it's named */
5567 case 'n': value = '\n'; break;
5568 case 'r': value = '\r'; break;
5569 case 't': value = '\t'; break;
5570 case 'f': value = '\f'; break;
5571 case 'b': value = '\b'; break;
5572 case 'e': value = ASCII_TO_NATIVE('\033');break;
5573 case 'a': value = ASCII_TO_NATIVE('\007');break;
5575 if (*RExC_parse == '{') {
5576 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5577 | PERL_SCAN_DISALLOW_PREFIX;
5578 char * const e = strchr(RExC_parse++, '}');
5580 vFAIL("Missing right brace on \\x{}");
5582 numlen = e - RExC_parse;
5583 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5587 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5589 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5590 RExC_parse += numlen;
5594 value = UCHARAT(RExC_parse++);
5595 value = toCTRL(value);
5597 case '0': case '1': case '2': case '3': case '4':
5598 case '5': case '6': case '7': case '8': case '9':
5602 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5603 RExC_parse += numlen;
5607 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5609 "Unrecognized escape \\%c in character class passed through",
5613 } /* end of \blah */
5619 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5621 if (!SIZE_ONLY && !need_class)
5622 ANYOF_CLASS_ZERO(ret);
5626 /* a bad range like a-\d, a-[:digit:] ? */
5629 if (ckWARN(WARN_REGEXP)) {
5631 RExC_parse >= rangebegin ?
5632 RExC_parse - rangebegin : 0;
5634 "False [] range \"%*.*s\"",
5637 if (prevvalue < 256) {
5638 ANYOF_BITMAP_SET(ret, prevvalue);
5639 ANYOF_BITMAP_SET(ret, '-');
5642 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5643 Perl_sv_catpvf(aTHX_ listsv,
5644 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5648 range = 0; /* this was not a true range */
5652 const char *what = NULL;
5655 if (namedclass > OOB_NAMEDCLASS)
5656 optimize_invert = FALSE;
5657 /* Possible truncation here but in some 64-bit environments
5658 * the compiler gets heartburn about switch on 64-bit values.
5659 * A similar issue a little earlier when switching on value.
5661 switch ((I32)namedclass) {
5664 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5666 for (value = 0; value < 256; value++)
5668 ANYOF_BITMAP_SET(ret, value);
5675 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5677 for (value = 0; value < 256; value++)
5678 if (!isALNUM(value))
5679 ANYOF_BITMAP_SET(ret, value);
5686 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5688 for (value = 0; value < 256; value++)
5689 if (isALNUMC(value))
5690 ANYOF_BITMAP_SET(ret, value);
5697 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5699 for (value = 0; value < 256; value++)
5700 if (!isALNUMC(value))
5701 ANYOF_BITMAP_SET(ret, value);
5708 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5710 for (value = 0; value < 256; value++)
5712 ANYOF_BITMAP_SET(ret, value);
5719 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
5721 for (value = 0; value < 256; value++)
5722 if (!isALPHA(value))
5723 ANYOF_BITMAP_SET(ret, value);
5730 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
5733 for (value = 0; value < 128; value++)
5734 ANYOF_BITMAP_SET(ret, value);
5736 for (value = 0; value < 256; value++) {
5738 ANYOF_BITMAP_SET(ret, value);
5747 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
5750 for (value = 128; value < 256; value++)
5751 ANYOF_BITMAP_SET(ret, value);
5753 for (value = 0; value < 256; value++) {
5754 if (!isASCII(value))
5755 ANYOF_BITMAP_SET(ret, value);
5764 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5766 for (value = 0; value < 256; value++)
5768 ANYOF_BITMAP_SET(ret, value);
5775 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5777 for (value = 0; value < 256; value++)
5778 if (!isBLANK(value))
5779 ANYOF_BITMAP_SET(ret, value);
5786 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
5788 for (value = 0; value < 256; value++)
5790 ANYOF_BITMAP_SET(ret, value);
5797 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
5799 for (value = 0; value < 256; value++)
5800 if (!isCNTRL(value))
5801 ANYOF_BITMAP_SET(ret, value);
5808 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5810 /* consecutive digits assumed */
5811 for (value = '0'; value <= '9'; value++)
5812 ANYOF_BITMAP_SET(ret, value);
5819 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5821 /* consecutive digits assumed */
5822 for (value = 0; value < '0'; value++)
5823 ANYOF_BITMAP_SET(ret, value);
5824 for (value = '9' + 1; value < 256; value++)
5825 ANYOF_BITMAP_SET(ret, value);
5832 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
5834 for (value = 0; value < 256; value++)
5836 ANYOF_BITMAP_SET(ret, value);
5843 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
5845 for (value = 0; value < 256; value++)
5846 if (!isGRAPH(value))
5847 ANYOF_BITMAP_SET(ret, value);
5854 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
5856 for (value = 0; value < 256; value++)
5858 ANYOF_BITMAP_SET(ret, value);
5865 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
5867 for (value = 0; value < 256; value++)
5868 if (!isLOWER(value))
5869 ANYOF_BITMAP_SET(ret, value);
5876 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
5878 for (value = 0; value < 256; value++)
5880 ANYOF_BITMAP_SET(ret, value);
5887 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
5889 for (value = 0; value < 256; value++)
5890 if (!isPRINT(value))
5891 ANYOF_BITMAP_SET(ret, value);
5898 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5900 for (value = 0; value < 256; value++)
5901 if (isPSXSPC(value))
5902 ANYOF_BITMAP_SET(ret, value);
5909 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5911 for (value = 0; value < 256; value++)
5912 if (!isPSXSPC(value))
5913 ANYOF_BITMAP_SET(ret, value);
5920 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
5922 for (value = 0; value < 256; value++)
5924 ANYOF_BITMAP_SET(ret, value);
5931 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
5933 for (value = 0; value < 256; value++)
5934 if (!isPUNCT(value))
5935 ANYOF_BITMAP_SET(ret, value);
5942 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5944 for (value = 0; value < 256; value++)
5946 ANYOF_BITMAP_SET(ret, value);
5953 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5955 for (value = 0; value < 256; value++)
5956 if (!isSPACE(value))
5957 ANYOF_BITMAP_SET(ret, value);
5964 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
5966 for (value = 0; value < 256; value++)
5968 ANYOF_BITMAP_SET(ret, value);
5975 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
5977 for (value = 0; value < 256; value++)
5978 if (!isUPPER(value))
5979 ANYOF_BITMAP_SET(ret, value);
5986 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
5988 for (value = 0; value < 256; value++)
5989 if (isXDIGIT(value))
5990 ANYOF_BITMAP_SET(ret, value);
5997 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
5999 for (value = 0; value < 256; value++)
6000 if (!isXDIGIT(value))
6001 ANYOF_BITMAP_SET(ret, value);
6007 /* this is to handle \p and \P */
6010 vFAIL("Invalid [::] class");
6014 /* Strings such as "+utf8::isWord\n" */
6015 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6018 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
6021 } /* end of namedclass \blah */
6024 if (prevvalue > (IV)value) /* b-a */ {
6025 const int w = RExC_parse - rangebegin;
6026 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
6027 range = 0; /* not a valid range */
6031 prevvalue = value; /* save the beginning of the range */
6032 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6033 RExC_parse[1] != ']') {
6036 /* a bad range like \w-, [:word:]- ? */
6037 if (namedclass > OOB_NAMEDCLASS) {
6038 if (ckWARN(WARN_REGEXP)) {
6040 RExC_parse >= rangebegin ?
6041 RExC_parse - rangebegin : 0;
6043 "False [] range \"%*.*s\"",
6047 ANYOF_BITMAP_SET(ret, '-');
6049 range = 1; /* yeah, it's a range! */
6050 continue; /* but do it the next time */
6054 /* now is the next time */
6055 /*stored += (value - prevvalue + 1);*/
6057 if (prevvalue < 256) {
6058 const IV ceilvalue = value < 256 ? value : 255;
6061 /* In EBCDIC [\x89-\x91] should include
6062 * the \x8e but [i-j] should not. */
6063 if (literal_endpoint == 2 &&
6064 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6065 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
6067 if (isLOWER(prevvalue)) {
6068 for (i = prevvalue; i <= ceilvalue; i++)
6070 ANYOF_BITMAP_SET(ret, i);
6072 for (i = prevvalue; i <= ceilvalue; i++)
6074 ANYOF_BITMAP_SET(ret, i);
6079 for (i = prevvalue; i <= ceilvalue; i++) {
6080 if (!ANYOF_BITMAP_TEST(ret,i)) {
6082 ANYOF_BITMAP_SET(ret, i);
6086 if (value > 255 || UTF) {
6087 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6088 const UV natvalue = NATIVE_TO_UNI(value);
6089 stored+=2; /* can't optimize this class */
6090 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6091 if (prevnatvalue < natvalue) { /* what about > ? */
6092 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
6093 prevnatvalue, natvalue);
6095 else if (prevnatvalue == natvalue) {
6096 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
6098 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
6100 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
6102 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6103 if (RExC_precomp[0] == ':' &&
6104 RExC_precomp[1] == '[' &&
6105 (f == 0xDF || f == 0x92)) {
6106 f = NATIVE_TO_UNI(f);
6109 /* If folding and foldable and a single
6110 * character, insert also the folded version
6111 * to the charclass. */
6113 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6114 if ((RExC_precomp[0] == ':' &&
6115 RExC_precomp[1] == '[' &&
6117 (value == 0xFB05 || value == 0xFB06))) ?
6118 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6119 foldlen == (STRLEN)UNISKIP(f) )
6121 if (foldlen == (STRLEN)UNISKIP(f))
6123 Perl_sv_catpvf(aTHX_ listsv,
6126 /* Any multicharacter foldings
6127 * require the following transform:
6128 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6129 * where E folds into "pq" and F folds
6130 * into "rst", all other characters
6131 * fold to single characters. We save
6132 * away these multicharacter foldings,
6133 * to be later saved as part of the
6134 * additional "s" data. */
6137 if (!unicode_alternate)
6138 unicode_alternate = newAV();
6139 sv = newSVpvn((char*)foldbuf, foldlen);
6141 av_push(unicode_alternate, sv);
6145 /* If folding and the value is one of the Greek
6146 * sigmas insert a few more sigmas to make the
6147 * folding rules of the sigmas to work right.
6148 * Note that not all the possible combinations
6149 * are handled here: some of them are handled
6150 * by the standard folding rules, and some of
6151 * them (literal or EXACTF cases) are handled
6152 * during runtime in regexec.c:S_find_byclass(). */
6153 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6154 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6155 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
6156 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6157 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
6159 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6160 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6161 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
6166 literal_endpoint = 0;
6170 range = 0; /* this range (if it was one) is done now */
6174 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
6176 RExC_size += ANYOF_CLASS_ADD_SKIP;
6178 RExC_emit += ANYOF_CLASS_ADD_SKIP;
6184 /****** !SIZE_ONLY AFTER HERE *********/
6186 if( stored == 1 && value < 256
6187 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6189 /* optimize single char class to an EXACT node
6190 but *only* when its not a UTF/high char */
6191 const char * cur_parse= RExC_parse;
6192 RExC_emit = (regnode *)orig_emit;
6193 RExC_parse = (char *)orig_parse;
6194 ret = reg_node(pRExC_state,
6195 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
6196 RExC_parse = (char *)cur_parse;
6197 *STRING(ret)= (char)value;
6199 RExC_emit += STR_SZ(1);
6202 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
6203 if ( /* If the only flag is folding (plus possibly inversion). */
6204 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6206 for (value = 0; value < 256; ++value) {
6207 if (ANYOF_BITMAP_TEST(ret, value)) {
6208 UV fold = PL_fold[value];
6211 ANYOF_BITMAP_SET(ret, fold);
6214 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
6217 /* optimize inverted simple patterns (e.g. [^a-z]) */
6218 if (optimize_invert &&
6219 /* If the only flag is inversion. */
6220 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
6221 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
6222 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
6223 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
6226 AV * const av = newAV();
6228 /* The 0th element stores the character class description
6229 * in its textual form: used later (regexec.c:Perl_regclass_swash())
6230 * to initialize the appropriate swash (which gets stored in
6231 * the 1st element), and also useful for dumping the regnode.
6232 * The 2nd element stores the multicharacter foldings,
6233 * used later (regexec.c:S_reginclass()). */
6234 av_store(av, 0, listsv);
6235 av_store(av, 1, NULL);
6236 av_store(av, 2, (SV*)unicode_alternate);
6237 rv = newRV_noinc((SV*)av);
6238 n = add_data(pRExC_state, 1, "s");
6239 RExC_rx->data->data[n] = (void*)rv;
6246 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
6248 char* const retval = RExC_parse++;
6251 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6252 RExC_parse[2] == '#') {
6253 while (*RExC_parse != ')') {
6254 if (RExC_parse == RExC_end)
6255 FAIL("Sequence (?#... not terminated");
6261 if (RExC_flags & PMf_EXTENDED) {
6262 if (isSPACE(*RExC_parse)) {
6266 else if (*RExC_parse == '#') {
6267 while (RExC_parse < RExC_end)
6268 if (*RExC_parse++ == '\n') break;
6277 - reg_node - emit a node
6279 STATIC regnode * /* Location. */
6280 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
6283 register regnode *ptr;
6284 regnode * const ret = RExC_emit;
6285 GET_RE_DEBUG_FLAGS_DECL;
6288 SIZE_ALIGN(RExC_size);
6292 NODE_ALIGN_FILL(ret);
6294 FILL_ADVANCE_NODE(ptr, op);
6295 if (RExC_offsets) { /* MJD */
6296 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6297 "reg_node", __LINE__,
6299 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6300 ? "Overwriting end of array!\n" : "OK",
6301 (UV)(RExC_emit - RExC_emit_start),
6302 (UV)(RExC_parse - RExC_start),
6303 (UV)RExC_offsets[0]));
6304 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6313 - reganode - emit a node with an argument
6315 STATIC regnode * /* Location. */
6316 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6319 register regnode *ptr;
6320 regnode * const ret = RExC_emit;
6321 GET_RE_DEBUG_FLAGS_DECL;
6324 SIZE_ALIGN(RExC_size);
6329 NODE_ALIGN_FILL(ret);
6331 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6332 if (RExC_offsets) { /* MJD */
6333 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6337 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6338 "Overwriting end of array!\n" : "OK",
6339 (UV)(RExC_emit - RExC_emit_start),
6340 (UV)(RExC_parse - RExC_start),
6341 (UV)RExC_offsets[0]));
6342 Set_Cur_Node_Offset;
6351 - reguni - emit (if appropriate) a Unicode character
6354 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6357 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6361 - reginsert - insert an operator in front of already-emitted operand
6363 * Means relocating the operand.
6366 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6369 register regnode *src;
6370 register regnode *dst;
6371 register regnode *place;
6372 const int offset = regarglen[(U8)op];
6373 GET_RE_DEBUG_FLAGS_DECL;
6374 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6377 RExC_size += NODE_STEP_REGNODE + offset;
6382 RExC_emit += NODE_STEP_REGNODE + offset;
6384 while (src > opnd) {
6385 StructCopy(--src, --dst, regnode);
6386 if (RExC_offsets) { /* MJD 20010112 */
6387 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6391 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6392 ? "Overwriting end of array!\n" : "OK",
6393 (UV)(src - RExC_emit_start),
6394 (UV)(dst - RExC_emit_start),
6395 (UV)RExC_offsets[0]));
6396 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6397 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6402 place = opnd; /* Op node, where operand used to be. */
6403 if (RExC_offsets) { /* MJD */
6404 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6408 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6409 ? "Overwriting end of array!\n" : "OK",
6410 (UV)(place - RExC_emit_start),
6411 (UV)(RExC_parse - RExC_start),
6412 (UV)RExC_offsets[0]));
6413 Set_Node_Offset(place, RExC_parse);
6414 Set_Node_Length(place, 1);
6416 src = NEXTOPER(place);
6417 FILL_ADVANCE_NODE(place, op);
6418 Zero(src, offset, regnode);
6422 - regtail - set the next-pointer at the end of a node chain of p to val.
6423 - SEE ALSO: regtail_study
6425 /* TODO: All three parms should be const */
6427 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6430 register regnode *scan;
6431 GET_RE_DEBUG_FLAGS_DECL;
6433 PERL_UNUSED_ARG(depth);
6439 /* Find last node. */
6442 regnode * const temp = regnext(scan);
6444 SV * const mysv=sv_newmortal();
6445 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6446 regprop(RExC_rx, mysv, scan);
6447 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6448 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6455 if (reg_off_by_arg[OP(scan)]) {
6456 ARG_SET(scan, val - scan);
6459 NEXT_OFF(scan) = val - scan;
6465 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6466 - Look for optimizable sequences at the same time.
6467 - currently only looks for EXACT chains.
6469 This is expermental code. The idea is to use this routine to perform
6470 in place optimizations on branches and groups as they are constructed,
6471 with the long term intention of removing optimization from study_chunk so
6472 that it is purely analytical.
6474 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6475 to control which is which.
6478 /* TODO: All four parms should be const */
6481 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6484 register regnode *scan;
6486 #ifdef EXPERIMENTAL_INPLACESCAN
6490 GET_RE_DEBUG_FLAGS_DECL;
6496 /* Find last node. */
6500 regnode * const temp = regnext(scan);
6501 #ifdef EXPERIMENTAL_INPLACESCAN
6502 if (PL_regkind[OP(scan)] == EXACT)
6503 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6511 if( exact == PSEUDO )
6513 else if ( exact != OP(scan) )
6522 SV * const mysv=sv_newmortal();
6523 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6524 regprop(RExC_rx, mysv, scan);
6525 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6526 SvPV_nolen_const(mysv),
6528 REG_NODE_NUM(scan));
6535 SV * const mysv_val=sv_newmortal();
6536 DEBUG_PARSE_MSG("");
6537 regprop(RExC_rx, mysv_val, val);
6538 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6539 SvPV_nolen_const(mysv_val),
6544 if (reg_off_by_arg[OP(scan)]) {
6545 ARG_SET(scan, val - scan);
6548 NEXT_OFF(scan) = val - scan;
6556 - regcurly - a little FSA that accepts {\d+,?\d*}
6559 S_regcurly(register const char *s)
6578 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6581 Perl_regdump(pTHX_ const regexp *r)
6585 SV * const sv = sv_newmortal();
6586 SV *dsv= sv_newmortal();
6588 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
6590 /* Header fields of interest. */
6591 if (r->anchored_substr) {
6592 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6593 RE_SV_DUMPLEN(r->anchored_substr), 30);
6594 PerlIO_printf(Perl_debug_log,
6595 "anchored %s%s at %"IVdf" ",
6596 s, RE_SV_TAIL(r->anchored_substr),
6597 (IV)r->anchored_offset);
6598 } else if (r->anchored_utf8) {
6599 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6600 RE_SV_DUMPLEN(r->anchored_utf8), 30);
6601 PerlIO_printf(Perl_debug_log,
6602 "anchored utf8 %s%s at %"IVdf" ",
6603 s, RE_SV_TAIL(r->anchored_utf8),
6604 (IV)r->anchored_offset);
6606 if (r->float_substr) {
6607 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6608 RE_SV_DUMPLEN(r->float_substr), 30);
6609 PerlIO_printf(Perl_debug_log,
6610 "floating %s%s at %"IVdf"..%"UVuf" ",
6611 s, RE_SV_TAIL(r->float_substr),
6612 (IV)r->float_min_offset, (UV)r->float_max_offset);
6613 } else if (r->float_utf8) {
6614 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6615 RE_SV_DUMPLEN(r->float_utf8), 30);
6616 PerlIO_printf(Perl_debug_log,
6617 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6618 s, RE_SV_TAIL(r->float_utf8),
6619 (IV)r->float_min_offset, (UV)r->float_max_offset);
6621 if (r->check_substr || r->check_utf8)
6622 PerlIO_printf(Perl_debug_log,
6624 (r->check_substr == r->float_substr
6625 && r->check_utf8 == r->float_utf8
6626 ? "(checking floating" : "(checking anchored"));
6627 if (r->reganch & ROPT_NOSCAN)
6628 PerlIO_printf(Perl_debug_log, " noscan");
6629 if (r->reganch & ROPT_CHECK_ALL)
6630 PerlIO_printf(Perl_debug_log, " isall");
6631 if (r->check_substr || r->check_utf8)
6632 PerlIO_printf(Perl_debug_log, ") ");
6634 if (r->regstclass) {
6635 regprop(r, sv, r->regstclass);
6636 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
6638 if (r->reganch & ROPT_ANCH) {
6639 PerlIO_printf(Perl_debug_log, "anchored");
6640 if (r->reganch & ROPT_ANCH_BOL)
6641 PerlIO_printf(Perl_debug_log, "(BOL)");
6642 if (r->reganch & ROPT_ANCH_MBOL)
6643 PerlIO_printf(Perl_debug_log, "(MBOL)");
6644 if (r->reganch & ROPT_ANCH_SBOL)
6645 PerlIO_printf(Perl_debug_log, "(SBOL)");
6646 if (r->reganch & ROPT_ANCH_GPOS)
6647 PerlIO_printf(Perl_debug_log, "(GPOS)");
6648 PerlIO_putc(Perl_debug_log, ' ');
6650 if (r->reganch & ROPT_GPOS_SEEN)
6651 PerlIO_printf(Perl_debug_log, "GPOS ");
6652 if (r->reganch & ROPT_SKIP)
6653 PerlIO_printf(Perl_debug_log, "plus ");
6654 if (r->reganch & ROPT_IMPLICIT)
6655 PerlIO_printf(Perl_debug_log, "implicit ");
6656 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6657 if (r->reganch & ROPT_EVAL_SEEN)
6658 PerlIO_printf(Perl_debug_log, "with eval ");
6659 PerlIO_printf(Perl_debug_log, "\n");
6661 PERL_UNUSED_CONTEXT;
6663 #endif /* DEBUGGING */
6667 - regprop - printable representation of opcode
6670 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6676 sv_setpvn(sv, "", 0);
6677 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6678 /* It would be nice to FAIL() here, but this may be called from
6679 regexec.c, and it would be hard to supply pRExC_state. */
6680 Perl_croak(aTHX_ "Corrupted regexp opcode");
6681 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6683 k = PL_regkind[OP(o)];
6686 SV * const dsv = sv_2mortal(newSVpvs(""));
6687 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6688 * is a crude hack but it may be the best for now since
6689 * we have no flag "this EXACTish node was UTF-8"
6691 const char * const s =
6692 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
6693 PL_colors[0], PL_colors[1],
6694 PERL_PV_ESCAPE_UNI_DETECT |
6695 PERL_PV_PRETTY_ELIPSES |
6698 Perl_sv_catpvf(aTHX_ sv, " %s", s );
6699 } else if (k == TRIE) {
6700 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6701 /* print the details of the trie in dumpuntil instead, as
6702 * prog->data isn't available here */
6703 } else if (k == CURLY) {
6704 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
6705 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6706 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
6708 else if (k == WHILEM && o->flags) /* Ordinal/of */
6709 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6710 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
6711 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6712 else if (k == LOGICAL)
6713 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
6714 else if (k == ANYOF) {
6715 int i, rangestart = -1;
6716 const U8 flags = ANYOF_FLAGS(o);
6718 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6719 static const char * const anyofs[] = {
6752 if (flags & ANYOF_LOCALE)
6753 sv_catpvs(sv, "{loc}");
6754 if (flags & ANYOF_FOLD)
6755 sv_catpvs(sv, "{i}");
6756 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
6757 if (flags & ANYOF_INVERT)
6759 for (i = 0; i <= 256; i++) {
6760 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6761 if (rangestart == -1)
6763 } else if (rangestart != -1) {
6764 if (i <= rangestart + 3)
6765 for (; rangestart < i; rangestart++)
6766 put_byte(sv, rangestart);
6768 put_byte(sv, rangestart);
6770 put_byte(sv, i - 1);
6776 if (o->flags & ANYOF_CLASS)
6777 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
6778 if (ANYOF_CLASS_TEST(o,i))
6779 sv_catpv(sv, anyofs[i]);
6781 if (flags & ANYOF_UNICODE)
6782 sv_catpvs(sv, "{unicode}");
6783 else if (flags & ANYOF_UNICODE_ALL)
6784 sv_catpvs(sv, "{unicode_all}");
6788 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
6792 U8 s[UTF8_MAXBYTES_CASE+1];
6794 for (i = 0; i <= 256; i++) { /* just the first 256 */
6795 uvchr_to_utf8(s, i);
6797 if (i < 256 && swash_fetch(sw, s, TRUE)) {
6798 if (rangestart == -1)
6800 } else if (rangestart != -1) {
6801 if (i <= rangestart + 3)
6802 for (; rangestart < i; rangestart++) {
6803 const U8 * const e = uvchr_to_utf8(s,rangestart);
6805 for(p = s; p < e; p++)
6809 const U8 *e = uvchr_to_utf8(s,rangestart);
6811 for (p = s; p < e; p++)
6814 e = uvchr_to_utf8(s, i-1);
6815 for (p = s; p < e; p++)
6822 sv_catpvs(sv, "..."); /* et cetera */
6826 char *s = savesvpv(lv);
6827 char * const origs = s;
6829 while (*s && *s != '\n')
6833 const char * const t = ++s;
6851 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6853 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
6854 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
6856 PERL_UNUSED_CONTEXT;
6857 PERL_UNUSED_ARG(sv);
6859 PERL_UNUSED_ARG(prog);
6860 #endif /* DEBUGGING */
6864 Perl_re_intuit_string(pTHX_ regexp *prog)
6865 { /* Assume that RE_INTUIT is set */
6867 GET_RE_DEBUG_FLAGS_DECL;
6868 PERL_UNUSED_CONTEXT;
6872 const char * const s = SvPV_nolen_const(prog->check_substr
6873 ? prog->check_substr : prog->check_utf8);
6875 if (!PL_colorset) reginitcolors();
6876 PerlIO_printf(Perl_debug_log,
6877 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
6879 prog->check_substr ? "" : "utf8 ",
6880 PL_colors[5],PL_colors[0],
6883 (strlen(s) > 60 ? "..." : ""));
6886 return prog->check_substr ? prog->check_substr : prog->check_utf8;
6890 Perl_pregfree(pTHX_ struct regexp *r)
6894 GET_RE_DEBUG_FLAGS_DECL;
6896 if (!r || (--r->refcnt > 0))
6902 SV *dsv= sv_newmortal();
6903 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
6904 dsv, r->precomp, r->prelen, 60);
6905 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
6906 PL_colors[4],PL_colors[5],s);
6910 /* gcov results gave these as non-null 100% of the time, so there's no
6911 optimisation in checking them before calling Safefree */
6912 Safefree(r->precomp);
6913 Safefree(r->offsets); /* 20010421 MJD */
6914 RX_MATCH_COPY_FREE(r);
6915 #ifdef PERL_OLD_COPY_ON_WRITE
6917 SvREFCNT_dec(r->saved_copy);
6920 if (r->anchored_substr)
6921 SvREFCNT_dec(r->anchored_substr);
6922 if (r->anchored_utf8)
6923 SvREFCNT_dec(r->anchored_utf8);
6924 if (r->float_substr)
6925 SvREFCNT_dec(r->float_substr);
6927 SvREFCNT_dec(r->float_utf8);
6928 Safefree(r->substrs);
6931 int n = r->data->count;
6932 PAD* new_comppad = NULL;
6937 /* If you add a ->what type here, update the comment in regcomp.h */
6938 switch (r->data->what[n]) {
6940 SvREFCNT_dec((SV*)r->data->data[n]);
6943 Safefree(r->data->data[n]);
6946 new_comppad = (AV*)r->data->data[n];
6949 if (new_comppad == NULL)
6950 Perl_croak(aTHX_ "panic: pregfree comppad");
6951 PAD_SAVE_LOCAL(old_comppad,
6952 /* Watch out for global destruction's random ordering. */
6953 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
6956 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6959 op_free((OP_4tree*)r->data->data[n]);
6961 PAD_RESTORE_LOCAL(old_comppad);
6962 SvREFCNT_dec((SV*)new_comppad);
6968 { /* Aho Corasick add-on structure for a trie node.
6969 Used in stclass optimization only */
6971 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6973 refcount = --aho->refcount;
6976 Safefree(aho->states);
6977 Safefree(aho->fail);
6978 aho->trie=NULL; /* not necessary to free this as it is
6979 handled by the 't' case */
6980 Safefree(r->data->data[n]); /* do this last!!!! */
6981 Safefree(r->regstclass);
6987 /* trie structure. */
6989 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6991 refcount = --trie->refcount;
6994 Safefree(trie->charmap);
6995 if (trie->widecharmap)
6996 SvREFCNT_dec((SV*)trie->widecharmap);
6997 Safefree(trie->states);
6998 Safefree(trie->trans);
7000 Safefree(trie->bitmap);
7002 Safefree(trie->wordlen);
7004 Safefree(trie->jump);
7006 Safefree(trie->nextword);
7010 SvREFCNT_dec((SV*)trie->words);
7011 if (trie->revcharmap)
7012 SvREFCNT_dec((SV*)trie->revcharmap);
7015 Safefree(r->data->data[n]); /* do this last!!!! */
7020 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
7023 Safefree(r->data->what);
7026 Safefree(r->startp);
7031 #ifndef PERL_IN_XSUB_RE
7033 - regnext - dig the "next" pointer out of a node
7036 Perl_regnext(pTHX_ register regnode *p)
7039 register I32 offset;
7041 if (p == &PL_regdummy)
7044 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7053 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
7056 STRLEN l1 = strlen(pat1);
7057 STRLEN l2 = strlen(pat2);
7060 const char *message;
7066 Copy(pat1, buf, l1 , char);
7067 Copy(pat2, buf + l1, l2 , char);
7068 buf[l1 + l2] = '\n';
7069 buf[l1 + l2 + 1] = '\0';
7071 /* ANSI variant takes additional second argument */
7072 va_start(args, pat2);
7076 msv = vmess(buf, &args);
7078 message = SvPV_const(msv,l1);
7081 Copy(message, buf, l1 , char);
7082 buf[l1-1] = '\0'; /* Overwrite \n */
7083 Perl_croak(aTHX_ "%s", buf);
7086 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7088 #ifndef PERL_IN_XSUB_RE
7090 Perl_save_re_context(pTHX)
7094 struct re_save_state *state;
7096 SAVEVPTR(PL_curcop);
7097 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
7099 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
7100 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
7101 SSPUSHINT(SAVEt_RE_STATE);
7103 Copy(&PL_reg_state, state, 1, struct re_save_state);
7105 PL_reg_start_tmp = 0;
7106 PL_reg_start_tmpl = 0;
7107 PL_reg_oldsaved = NULL;
7108 PL_reg_oldsavedlen = 0;
7110 PL_reg_leftiter = 0;
7111 PL_reg_poscache = NULL;
7112 PL_reg_poscache_size = 0;
7113 #ifdef PERL_OLD_COPY_ON_WRITE
7117 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
7119 const REGEXP * const rx = PM_GETRE(PL_curpm);
7122 for (i = 1; i <= rx->nparens; i++) {
7123 char digits[TYPE_CHARS(long)];
7124 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
7125 GV *const *const gvp
7126 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
7129 GV * const gv = *gvp;
7130 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
7140 clear_re(pTHX_ void *r)
7143 ReREFCNT_dec((regexp *)r);
7149 S_put_byte(pTHX_ SV *sv, int c)
7151 if (isCNTRL(c) || c == 255 || !isPRINT(c))
7152 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
7153 else if (c == '-' || c == ']' || c == '\\' || c == '^')
7154 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
7156 Perl_sv_catpvf(aTHX_ sv, "%c", c);
7160 #define CLEAR_OPTSTART \
7161 if (optstart) STMT_START { \
7162 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
7166 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
7168 STATIC const regnode *
7169 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
7170 const regnode *last, const regnode *plast,
7171 SV* sv, I32 indent, U32 depth)
7174 register U8 op = PSEUDO; /* Arbitrary non-END op. */
7175 register const regnode *next;
7176 const regnode *optstart= NULL;
7177 GET_RE_DEBUG_FLAGS_DECL;
7179 #ifdef DEBUG_DUMPUNTIL
7180 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
7181 last ? last-start : 0,plast ? plast-start : 0);
7184 if (plast && plast < last)
7187 while (PL_regkind[op] != END && (!last || node < last)) {
7188 /* While that wasn't END last time... */
7194 next = regnext((regnode *)node);
7197 if (OP(node) == OPTIMIZED) {
7198 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
7205 regprop(r, sv, node);
7206 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
7207 (int)(2*indent + 1), "", SvPVX_const(sv));
7209 if (OP(node) != OPTIMIZED) {
7210 if (next == NULL) /* Next ptr. */
7211 PerlIO_printf(Perl_debug_log, "(0)");
7212 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
7213 PerlIO_printf(Perl_debug_log, "(FAIL)");
7215 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
7217 if (PL_regkind[(U8)op] != TRIE)
7218 (void)PerlIO_putc(Perl_debug_log, '\n');
7222 if (PL_regkind[(U8)op] == BRANCHJ) {
7225 register const regnode *nnode = (OP(next) == LONGJMP
7226 ? regnext((regnode *)next)
7228 if (last && nnode > last)
7230 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
7233 else if (PL_regkind[(U8)op] == BRANCH) {
7235 DUMPUNTIL(NEXTOPER(node), next);
7237 else if ( PL_regkind[(U8)op] == TRIE ) {
7238 const I32 n = ARG(node);
7239 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
7240 const regnode *nextbranch= NULL;
7243 DEBUG_TRIE_COMPILE_r(
7244 PerlIO_printf(Perl_debug_log,
7245 " S:%"UVuf"/%"IVdf" W:%d L:%d/%d C:%d/%d ",
7246 (UV)trie->startstate,
7247 (IV)trie->laststate-1,
7248 (int)trie->wordcount,
7251 (int)TRIE_CHARCOUNT(trie),
7252 trie->uniquecharcount
7255 if ( op==TRIEC || trie->bitmap ) {
7257 int rangestart = -1;
7258 U8* bitmap = op==TRIEC ? (U8*)ANYOF_BITMAP(node) : (U8*)TRIE_BITMAP(trie);
7260 sv_setpvn(sv, "", 0);
7261 for (i = 0; i <= 256; i++) {
7262 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7263 if (rangestart == -1)
7265 } else if (rangestart != -1) {
7266 if (i <= rangestart + 3)
7267 for (; rangestart < i; rangestart++)
7268 put_byte(sv, rangestart);
7270 put_byte(sv, rangestart);
7272 put_byte(sv, i - 1);
7277 PerlIO_printf(Perl_debug_log, "[%s]\n", SvPVX_const(sv));
7279 PerlIO_printf(Perl_debug_log, "\n");
7283 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
7284 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
7286 PerlIO_printf(Perl_debug_log, "%*s%s ",
7287 (int)(2*(indent+3)), "",
7288 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
7289 PL_colors[0], PL_colors[1],
7290 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7291 PERL_PV_PRETTY_ELIPSES |
7297 U16 dist= trie->jump[word_idx+1];
7298 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
7301 nextbranch= next - trie->jump[0];
7302 DUMPUNTIL(next - dist, nextbranch);
7304 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
7305 nextbranch= regnext((regnode *)nextbranch);
7307 PerlIO_printf(Perl_debug_log, "\n");
7310 if (last && next > last)
7315 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
7316 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
7317 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
7319 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7321 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
7323 else if ( op == PLUS || op == STAR) {
7324 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
7326 else if (op == ANYOF) {
7327 /* arglen 1 + class block */
7328 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7329 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7330 node = NEXTOPER(node);
7332 else if (PL_regkind[(U8)op] == EXACT) {
7333 /* Literal string, where present. */
7334 node += NODE_SZ_STR(node) - 1;
7335 node = NEXTOPER(node);
7338 node = NEXTOPER(node);
7339 node += regarglen[(U8)op];
7341 if (op == CURLYX || op == OPEN)
7343 else if (op == WHILEM)
7347 #ifdef DEBUG_DUMPUNTIL
7348 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
7350 return last ? last : node;
7353 #endif /* DEBUGGING */
7357 * c-indentation-style: bsd
7359 * indent-tabs-mode: t
7362 * ex: set ts=8 sts=4 sw=4 noet: