5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
174 #define FULL_TRIE_STUDY
179 /* About scan_data_t.
181 During optimisation we recurse through the regexp program performing
182 various inplace (keyhole style) optimisations. In addition study_chunk
183 and scan_commit populate this data structure with information about
184 what strings MUST appear in the pattern. We look for the longest
185 string that must appear for at a fixed location, and we look for the
186 longest string that may appear at a floating location. So for instance
191 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
192 strings (because they follow a .* construct). study_chunk will identify
193 both FOO and BAR as being the longest fixed and floating strings respectively.
195 The strings can be composites, for instance
199 will result in a composite fixed substring 'foo'.
201 For each string some basic information is maintained:
203 - offset or min_offset
204 This is the position the string must appear at, or not before.
205 It also implicitly (when combined with minlenp) tells us how many
206 character must match before the string we are searching.
207 Likewise when combined with minlenp and the length of the string
208 tells us how many characters must appear after the string we have
212 Only used for floating strings. This is the rightmost point that
213 the string can appear at. Ifset to I32 max it indicates that the
214 string can occur infinitely far to the right.
217 A pointer to the minimum length of the pattern that the string
218 was found inside. This is important as in the case of positive
219 lookahead or positive lookbehind we can have multiple patterns
224 The minimum length of the pattern overall is 3, the minimum length
225 of the lookahead part is 3, but the minimum length of the part that
226 will actually match is 1. So 'FOO's minimum length is 3, but the
227 minimum length for the F is 1. This is important as the minimum length
228 is used to determine offsets in front of and behind the string being
229 looked for. Since strings can be composites this is the length of the
230 pattern at the time it was commited with a scan_commit. Note that
231 the length is calculated by study_chunk, so that the minimum lengths
232 are not known until the full pattern has been compiled, thus the
233 pointer to the value.
237 In the case of lookbehind the string being searched for can be
238 offset past the start point of the final matching string.
239 If this value was just blithely removed from the min_offset it would
240 invalidate some of the calculations for how many chars must match
241 before or after (as they are derived from min_offset and minlen and
242 the length of the string being searched for).
243 When the final pattern is compiled and the data is moved from the
244 scan_data_t structure into the regexp structure the information
245 about lookbehind is factored in, with the information that would
246 have been lost precalculated in the end_shift field for the
249 The fields pos_min and pos_delta are used to store the minimum offset
250 and the delta to the maximum offset at the current point in the pattern.
254 typedef struct scan_data_t {
255 /*I32 len_min; unused */
256 /*I32 len_delta; unused */
260 I32 last_end; /* min value, <0 unless valid. */
263 SV **longest; /* Either &l_fixed, or &l_float. */
264 SV *longest_fixed; /* longest fixed string found in pattern */
265 I32 offset_fixed; /* offset where it starts */
266 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
267 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
268 SV *longest_float; /* longest floating string found in pattern */
269 I32 offset_float_min; /* earliest point in string it can appear */
270 I32 offset_float_max; /* latest point in string it can appear */
271 I32 *minlen_float; /* pointer to the minlen relevent to the string */
272 I32 lookbehind_float; /* is the position of the string modified by LB */
276 struct regnode_charclass_class *start_class;
280 * Forward declarations for pregcomp()'s friends.
283 static const scan_data_t zero_scan_data =
284 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
286 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
287 #define SF_BEFORE_SEOL 0x0001
288 #define SF_BEFORE_MEOL 0x0002
289 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
290 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
293 # define SF_FIX_SHIFT_EOL (0+2)
294 # define SF_FL_SHIFT_EOL (0+4)
296 # define SF_FIX_SHIFT_EOL (+2)
297 # define SF_FL_SHIFT_EOL (+4)
300 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
301 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
303 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
304 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
305 #define SF_IS_INF 0x0040
306 #define SF_HAS_PAR 0x0080
307 #define SF_IN_PAR 0x0100
308 #define SF_HAS_EVAL 0x0200
309 #define SCF_DO_SUBSTR 0x0400
310 #define SCF_DO_STCLASS_AND 0x0800
311 #define SCF_DO_STCLASS_OR 0x1000
312 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
313 #define SCF_WHILEM_VISITED_POS 0x2000
315 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
318 #define UTF (RExC_utf8 != 0)
319 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
320 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
322 #define OOB_UNICODE 12345678
323 #define OOB_NAMEDCLASS -1
325 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
326 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
329 /* length of regex to show in messages that don't mark a position within */
330 #define RegexLengthToShowInErrorMessages 127
333 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
334 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
335 * op/pragma/warn/regcomp.
337 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
338 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
340 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
343 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
344 * arg. Show regex, up to a maximum length. If it's too long, chop and add
347 #define FAIL(msg) STMT_START { \
348 const char *ellipses = ""; \
349 IV len = RExC_end - RExC_precomp; \
352 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
353 if (len > RegexLengthToShowInErrorMessages) { \
354 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
355 len = RegexLengthToShowInErrorMessages - 10; \
358 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
359 msg, (int)len, RExC_precomp, ellipses); \
363 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
365 #define Simple_vFAIL(m) STMT_START { \
366 const IV offset = RExC_parse - RExC_precomp; \
367 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
368 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
372 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
374 #define vFAIL(m) STMT_START { \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 * Like Simple_vFAIL(), but accepts two arguments.
383 #define Simple_vFAIL2(m,a1) STMT_START { \
384 const IV offset = RExC_parse - RExC_precomp; \
385 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
386 (int)offset, RExC_precomp, RExC_precomp + offset); \
390 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
392 #define vFAIL2(m,a1) STMT_START { \
394 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
395 Simple_vFAIL2(m, a1); \
400 * Like Simple_vFAIL(), but accepts three arguments.
402 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
403 const IV offset = RExC_parse - RExC_precomp; \
404 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
409 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
411 #define vFAIL3(m,a1,a2) STMT_START { \
413 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
414 Simple_vFAIL3(m, a1, a2); \
418 * Like Simple_vFAIL(), but accepts four arguments.
420 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
421 const IV offset = RExC_parse - RExC_precomp; \
422 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
423 (int)offset, RExC_precomp, RExC_precomp + offset); \
426 #define vWARN(loc,m) STMT_START { \
427 const IV offset = loc - RExC_precomp; \
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
432 #define vWARNdep(loc,m) STMT_START { \
433 const IV offset = loc - RExC_precomp; \
434 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
435 "%s" REPORT_LOCATION, \
436 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
440 #define vWARN2(loc, m, a1) STMT_START { \
441 const IV offset = loc - RExC_precomp; \
442 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
443 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
446 #define vWARN3(loc, m, a1, a2) STMT_START { \
447 const IV offset = loc - RExC_precomp; \
448 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
449 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
452 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
453 const IV offset = loc - RExC_precomp; \
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
455 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
458 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
459 const IV offset = loc - RExC_precomp; \
460 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
461 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
465 /* Allow for side effects in s */
466 #define REGC(c,s) STMT_START { \
467 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
470 /* Macros for recording node offsets. 20001227 mjd@plover.com
471 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
472 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
473 * Element 0 holds the number n.
474 * Position is 1 indexed.
477 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
479 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
480 __LINE__, (node), (int)(byte))); \
482 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
484 RExC_offsets[2*(node)-1] = (byte); \
489 #define Set_Node_Offset(node,byte) \
490 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
491 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
493 #define Set_Node_Length_To_R(node,len) STMT_START { \
495 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
496 __LINE__, (int)(node), (int)(len))); \
498 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
500 RExC_offsets[2*(node)] = (len); \
505 #define Set_Node_Length(node,len) \
506 Set_Node_Length_To_R((node)-RExC_emit_start, len)
507 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
508 #define Set_Node_Cur_Length(node) \
509 Set_Node_Length(node, RExC_parse - parse_start)
511 /* Get offsets and lengths */
512 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
513 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
515 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
517 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
521 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
522 #define EXPERIMENTAL_INPLACESCAN
525 #define DEBUG_STUDYDATA(data,depth) \
526 DEBUG_OPTIMISE_r(if(data){ \
527 PerlIO_printf(Perl_debug_log, \
528 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
529 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
530 (int)(depth)*2, "", \
531 (IV)((data)->pos_min), \
532 (IV)((data)->pos_delta), \
533 (IV)((data)->flags), \
534 (IV)((data)->whilem_c), \
535 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
537 if ((data)->last_found) \
538 PerlIO_printf(Perl_debug_log, \
539 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
540 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
541 SvPVX_const((data)->last_found), \
542 (IV)((data)->last_end), \
543 (IV)((data)->last_start_min), \
544 (IV)((data)->last_start_max), \
545 ((data)->longest && \
546 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
547 SvPVX_const((data)->longest_fixed), \
548 (IV)((data)->offset_fixed), \
549 ((data)->longest && \
550 (data)->longest==&((data)->longest_float)) ? "*" : "", \
551 SvPVX_const((data)->longest_float), \
552 (IV)((data)->offset_float_min), \
553 (IV)((data)->offset_float_max) \
555 PerlIO_printf(Perl_debug_log,"\n"); \
558 static void clear_re(pTHX_ void *r);
560 /* Mark that we cannot extend a found fixed substring at this point.
561 Update the longest found anchored substring and the longest found
562 floating substrings if needed. */
565 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
567 const STRLEN l = CHR_SVLEN(data->last_found);
568 const STRLEN old_l = CHR_SVLEN(*data->longest);
569 GET_RE_DEBUG_FLAGS_DECL;
571 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
572 SvSetMagicSV(*data->longest, data->last_found);
573 if (*data->longest == data->longest_fixed) {
574 data->offset_fixed = l ? data->last_start_min : data->pos_min;
575 if (data->flags & SF_BEFORE_EOL)
577 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
579 data->flags &= ~SF_FIX_BEFORE_EOL;
580 data->minlen_fixed=minlenp;
581 data->lookbehind_fixed=0;
584 data->offset_float_min = l ? data->last_start_min : data->pos_min;
585 data->offset_float_max = (l
586 ? data->last_start_max
587 : data->pos_min + data->pos_delta);
588 if ((U32)data->offset_float_max > (U32)I32_MAX)
589 data->offset_float_max = I32_MAX;
590 if (data->flags & SF_BEFORE_EOL)
592 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
594 data->flags &= ~SF_FL_BEFORE_EOL;
595 data->minlen_float=minlenp;
596 data->lookbehind_float=0;
599 SvCUR_set(data->last_found, 0);
601 SV * const sv = data->last_found;
602 if (SvUTF8(sv) && SvMAGICAL(sv)) {
603 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
609 data->flags &= ~SF_BEFORE_EOL;
610 DEBUG_STUDYDATA(data,0);
613 /* Can match anything (initialization) */
615 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
617 ANYOF_CLASS_ZERO(cl);
618 ANYOF_BITMAP_SETALL(cl);
619 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
621 cl->flags |= ANYOF_LOCALE;
624 /* Can match anything (initialization) */
626 S_cl_is_anything(const struct regnode_charclass_class *cl)
630 for (value = 0; value <= ANYOF_MAX; value += 2)
631 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
633 if (!(cl->flags & ANYOF_UNICODE_ALL))
635 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
640 /* Can match anything (initialization) */
642 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
644 Zero(cl, 1, struct regnode_charclass_class);
646 cl_anything(pRExC_state, cl);
650 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
652 Zero(cl, 1, struct regnode_charclass_class);
654 cl_anything(pRExC_state, cl);
656 cl->flags |= ANYOF_LOCALE;
659 /* 'And' a given class with another one. Can create false positives */
660 /* We assume that cl is not inverted */
662 S_cl_and(struct regnode_charclass_class *cl,
663 const struct regnode_charclass_class *and_with)
665 if (!(and_with->flags & ANYOF_CLASS)
666 && !(cl->flags & ANYOF_CLASS)
667 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
668 && !(and_with->flags & ANYOF_FOLD)
669 && !(cl->flags & ANYOF_FOLD)) {
672 if (and_with->flags & ANYOF_INVERT)
673 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
674 cl->bitmap[i] &= ~and_with->bitmap[i];
676 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
677 cl->bitmap[i] &= and_with->bitmap[i];
678 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
679 if (!(and_with->flags & ANYOF_EOS))
680 cl->flags &= ~ANYOF_EOS;
682 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
683 !(and_with->flags & ANYOF_INVERT)) {
684 cl->flags &= ~ANYOF_UNICODE_ALL;
685 cl->flags |= ANYOF_UNICODE;
686 ARG_SET(cl, ARG(and_with));
688 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
689 !(and_with->flags & ANYOF_INVERT))
690 cl->flags &= ~ANYOF_UNICODE_ALL;
691 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
692 !(and_with->flags & ANYOF_INVERT))
693 cl->flags &= ~ANYOF_UNICODE;
696 /* 'OR' a given class with another one. Can create false positives */
697 /* We assume that cl is not inverted */
699 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
701 if (or_with->flags & ANYOF_INVERT) {
703 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
704 * <= (B1 | !B2) | (CL1 | !CL2)
705 * which is wasteful if CL2 is small, but we ignore CL2:
706 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
707 * XXXX Can we handle case-fold? Unclear:
708 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
709 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
711 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
712 && !(or_with->flags & ANYOF_FOLD)
713 && !(cl->flags & ANYOF_FOLD) ) {
716 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
717 cl->bitmap[i] |= ~or_with->bitmap[i];
718 } /* XXXX: logic is complicated otherwise */
720 cl_anything(pRExC_state, cl);
723 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
724 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
725 && (!(or_with->flags & ANYOF_FOLD)
726 || (cl->flags & ANYOF_FOLD)) ) {
729 /* OR char bitmap and class bitmap separately */
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] |= or_with->bitmap[i];
732 if (or_with->flags & ANYOF_CLASS) {
733 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
734 cl->classflags[i] |= or_with->classflags[i];
735 cl->flags |= ANYOF_CLASS;
738 else { /* XXXX: logic is complicated, leave it along for a moment. */
739 cl_anything(pRExC_state, cl);
742 if (or_with->flags & ANYOF_EOS)
743 cl->flags |= ANYOF_EOS;
745 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
746 ARG(cl) != ARG(or_with)) {
747 cl->flags |= ANYOF_UNICODE_ALL;
748 cl->flags &= ~ANYOF_UNICODE;
750 if (or_with->flags & ANYOF_UNICODE_ALL) {
751 cl->flags |= ANYOF_UNICODE_ALL;
752 cl->flags &= ~ANYOF_UNICODE;
756 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
757 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
758 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
759 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
765 dump_trie_interim_list(trie,next_alloc)
766 dump_trie_interim_table(trie,next_alloc)
768 These routines dump out a trie in a somewhat readable format.
769 The _interim_ variants are used for debugging the interim
770 tables that are used to generate the final compressed
771 representation which is what dump_trie expects.
773 Part of the reason for their existance is to provide a form
774 of documentation as to how the different representations function.
780 Dumps the final compressed table form of the trie to Perl_debug_log.
781 Used for debugging make_trie().
785 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
788 SV *sv=sv_newmortal();
789 int colwidth= trie->widecharmap ? 6 : 4;
790 GET_RE_DEBUG_FLAGS_DECL;
793 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
794 (int)depth * 2 + 2,"",
795 "Match","Base","Ofs" );
797 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
798 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
800 PerlIO_printf( Perl_debug_log, "%*s",
802 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
803 PL_colors[0], PL_colors[1],
804 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
805 PERL_PV_ESCAPE_FIRSTCHAR
810 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
811 (int)depth * 2 + 2,"");
813 for( state = 0 ; state < trie->uniquecharcount ; state++ )
814 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
815 PerlIO_printf( Perl_debug_log, "\n");
817 for( state = 1 ; state < trie->laststate ; state++ ) {
818 const U32 base = trie->states[ state ].trans.base;
820 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
822 if ( trie->states[ state ].wordnum ) {
823 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
825 PerlIO_printf( Perl_debug_log, "%6s", "" );
828 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
833 while( ( base + ofs < trie->uniquecharcount ) ||
834 ( base + ofs - trie->uniquecharcount < trie->lasttrans
835 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
838 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
840 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
841 if ( ( base + ofs >= trie->uniquecharcount ) &&
842 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
843 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
845 PerlIO_printf( Perl_debug_log, "%*"UVXf,
847 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
849 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
853 PerlIO_printf( Perl_debug_log, "]");
856 PerlIO_printf( Perl_debug_log, "\n" );
860 dump_trie_interim_list(trie,next_alloc)
861 Dumps a fully constructed but uncompressed trie in list form.
862 List tries normally only are used for construction when the number of
863 possible chars (trie->uniquecharcount) is very high.
864 Used for debugging make_trie().
867 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
870 SV *sv=sv_newmortal();
871 int colwidth= trie->widecharmap ? 6 : 4;
872 GET_RE_DEBUG_FLAGS_DECL;
873 /* print out the table precompression. */
874 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
875 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
876 "------:-----+-----------------\n" );
878 for( state=1 ; state < next_alloc ; state ++ ) {
881 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
882 (int)depth * 2 + 2,"", (UV)state );
883 if ( ! trie->states[ state ].wordnum ) {
884 PerlIO_printf( Perl_debug_log, "%5s| ","");
886 PerlIO_printf( Perl_debug_log, "W%4x| ",
887 trie->states[ state ].wordnum
890 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
891 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
893 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
895 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
896 PL_colors[0], PL_colors[1],
897 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
898 PERL_PV_ESCAPE_FIRSTCHAR
900 TRIE_LIST_ITEM(state,charid).forid,
901 (UV)TRIE_LIST_ITEM(state,charid).newstate
905 PerlIO_printf( Perl_debug_log, "\n");
910 dump_trie_interim_table(trie,next_alloc)
911 Dumps a fully constructed but uncompressed trie in table form.
912 This is the normal DFA style state transition table, with a few
913 twists to facilitate compression later.
914 Used for debugging make_trie().
917 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
921 SV *sv=sv_newmortal();
922 int colwidth= trie->widecharmap ? 6 : 4;
923 GET_RE_DEBUG_FLAGS_DECL;
926 print out the table precompression so that we can do a visual check
927 that they are identical.
930 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
932 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
933 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
935 PerlIO_printf( Perl_debug_log, "%*s",
937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
946 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
948 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
949 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
952 PerlIO_printf( Perl_debug_log, "\n" );
954 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
956 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
957 (int)depth * 2 + 2,"",
958 (UV)TRIE_NODENUM( state ) );
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
961 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
963 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
965 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
967 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
968 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
970 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
971 trie->states[ TRIE_NODENUM( state ) ].wordnum );
978 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
979 startbranch: the first branch in the whole branch sequence
980 first : start branch of sequence of branch-exact nodes.
981 May be the same as startbranch
982 last : Thing following the last branch.
983 May be the same as tail.
984 tail : item following the branch sequence
985 count : words in the sequence
986 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
989 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
991 A trie is an N'ary tree where the branches are determined by digital
992 decomposition of the key. IE, at the root node you look up the 1st character and
993 follow that branch repeat until you find the end of the branches. Nodes can be
994 marked as "accepting" meaning they represent a complete word. Eg:
998 would convert into the following structure. Numbers represent states, letters
999 following numbers represent valid transitions on the letter from that state, if
1000 the number is in square brackets it represents an accepting state, otherwise it
1001 will be in parenthesis.
1003 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1007 (1) +-i->(6)-+-s->[7]
1009 +-s->(3)-+-h->(4)-+-e->[5]
1011 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1013 This shows that when matching against the string 'hers' we will begin at state 1
1014 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1015 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1016 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1017 single traverse. We store a mapping from accepting to state to which word was
1018 matched, and then when we have multiple possibilities we try to complete the
1019 rest of the regex in the order in which they occured in the alternation.
1021 The only prior NFA like behaviour that would be changed by the TRIE support is
1022 the silent ignoring of duplicate alternations which are of the form:
1024 / (DUPE|DUPE) X? (?{ ... }) Y /x
1026 Thus EVAL blocks follwing a trie may be called a different number of times with
1027 and without the optimisation. With the optimisations dupes will be silently
1028 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1029 the following demonstrates:
1031 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1033 which prints out 'word' three times, but
1035 'words'=~/(word|word|word)(?{ print $1 })S/
1037 which doesnt print it out at all. This is due to other optimisations kicking in.
1039 Example of what happens on a structural level:
1041 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1043 1: CURLYM[1] {1,32767}(18)
1054 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1055 and should turn into:
1057 1: CURLYM[1] {1,32767}(18)
1059 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1067 Cases where tail != last would be like /(?foo|bar)baz/:
1077 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1078 and would end up looking like:
1081 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1088 d = uvuni_to_utf8_flags(d, uv, 0);
1090 is the recommended Unicode-aware way of saying
1095 #define TRIE_STORE_REVCHAR \
1097 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1098 if (UTF) SvUTF8_on(tmp); \
1099 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1102 #define TRIE_READ_CHAR STMT_START { \
1106 if ( foldlen > 0 ) { \
1107 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1112 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1113 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1114 foldlen -= UNISKIP( uvc ); \
1115 scan = foldbuf + UNISKIP( uvc ); \
1118 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1128 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1129 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1130 TRIE_LIST_LEN( state ) *= 2; \
1131 Renew( trie->states[ state ].trans.list, \
1132 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1134 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1135 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1136 TRIE_LIST_CUR( state )++; \
1139 #define TRIE_LIST_NEW(state) STMT_START { \
1140 Newxz( trie->states[ state ].trans.list, \
1141 4, reg_trie_trans_le ); \
1142 TRIE_LIST_CUR( state ) = 1; \
1143 TRIE_LIST_LEN( state ) = 4; \
1146 #define TRIE_HANDLE_WORD(state) STMT_START { \
1147 U16 dupe= trie->states[ state ].wordnum; \
1148 regnode * const noper_next = regnext( noper ); \
1150 if (trie->wordlen) \
1151 trie->wordlen[ curword ] = wordlen; \
1153 /* store the word for dumping */ \
1155 if (OP(noper) != NOTHING) \
1156 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1158 tmp = newSVpvn( "", 0 ); \
1159 if ( UTF ) SvUTF8_on( tmp ); \
1160 av_push( trie->words, tmp ); \
1165 if ( noper_next < tail ) { \
1167 Newxz( trie->jump, word_count + 1, U16); \
1168 trie->jump[curword] = (U16)(tail - noper_next); \
1170 jumper = noper_next; \
1172 nextbranch= regnext(cur); \
1176 /* So it's a dupe. This means we need to maintain a */\
1177 /* linked-list from the first to the next. */\
1178 /* we only allocate the nextword buffer when there */\
1179 /* a dupe, so first time we have to do the allocation */\
1180 if (!trie->nextword) \
1181 Newxz( trie->nextword, word_count + 1, U16); \
1182 while ( trie->nextword[dupe] ) \
1183 dupe= trie->nextword[dupe]; \
1184 trie->nextword[dupe]= curword; \
1186 /* we haven't inserted this word yet. */ \
1187 trie->states[ state ].wordnum = curword; \
1192 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1193 ( ( base + charid >= ucharcount \
1194 && base + charid < ubound \
1195 && state == trie->trans[ base - ucharcount + charid ].check \
1196 && trie->trans[ base - ucharcount + charid ].next ) \
1197 ? trie->trans[ base - ucharcount + charid ].next \
1198 : ( state==1 ? special : 0 ) \
1202 #define MADE_JUMP_TRIE 2
1203 #define MADE_EXACT_TRIE 4
1206 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1209 /* first pass, loop through and scan words */
1210 reg_trie_data *trie;
1212 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1217 regnode *jumper = NULL;
1218 regnode *nextbranch = NULL;
1219 /* we just use folder as a flag in utf8 */
1220 const U8 * const folder = ( flags == EXACTF
1222 : ( flags == EXACTFL
1228 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1229 SV *re_trie_maxbuff;
1231 /* these are only used during construction but are useful during
1232 * debugging so we store them in the struct when debugging.
1234 STRLEN trie_charcount=0;
1235 AV *trie_revcharmap;
1237 GET_RE_DEBUG_FLAGS_DECL;
1239 PERL_UNUSED_ARG(depth);
1242 Newxz( trie, 1, reg_trie_data );
1244 trie->startstate = 1;
1245 trie->wordcount = word_count;
1246 RExC_rx->data->data[ data_slot ] = (void*)trie;
1247 Newxz( trie->charmap, 256, U16 );
1248 if (!(UTF && folder))
1249 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1251 trie->words = newAV();
1253 TRIE_REVCHARMAP(trie) = newAV();
1255 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1256 if (!SvIOK(re_trie_maxbuff)) {
1257 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1260 PerlIO_printf( Perl_debug_log,
1261 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1262 (int)depth * 2 + 2, "",
1263 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1264 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1267 /* -- First loop and Setup --
1269 We first traverse the branches and scan each word to determine if it
1270 contains widechars, and how many unique chars there are, this is
1271 important as we have to build a table with at least as many columns as we
1274 We use an array of integers to represent the character codes 0..255
1275 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1276 native representation of the character value as the key and IV's for the
1279 *TODO* If we keep track of how many times each character is used we can
1280 remap the columns so that the table compression later on is more
1281 efficient in terms of memory by ensuring most common value is in the
1282 middle and the least common are on the outside. IMO this would be better
1283 than a most to least common mapping as theres a decent chance the most
1284 common letter will share a node with the least common, meaning the node
1285 will not be compressable. With a middle is most common approach the worst
1286 case is when we have the least common nodes twice.
1290 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1291 regnode * const noper = NEXTOPER( cur );
1292 const U8 *uc = (U8*)STRING( noper );
1293 const U8 * const e = uc + STR_LEN( noper );
1295 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1296 const U8 *scan = (U8*)NULL;
1297 U32 wordlen = 0; /* required init */
1300 if (OP(noper) == NOTHING) {
1305 TRIE_BITMAP_SET(trie,*uc);
1306 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1308 for ( ; uc < e ; uc += len ) {
1309 TRIE_CHARCOUNT(trie)++;
1313 if ( !trie->charmap[ uvc ] ) {
1314 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1316 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1321 if ( !trie->widecharmap )
1322 trie->widecharmap = newHV();
1324 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1327 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1329 if ( !SvTRUE( *svpp ) ) {
1330 sv_setiv( *svpp, ++trie->uniquecharcount );
1335 if( cur == first ) {
1338 } else if (chars < trie->minlen) {
1340 } else if (chars > trie->maxlen) {
1344 } /* end first pass */
1345 DEBUG_TRIE_COMPILE_r(
1346 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1347 (int)depth * 2 + 2,"",
1348 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1349 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1350 (int)trie->minlen, (int)trie->maxlen )
1352 Newxz( trie->wordlen, word_count, U32 );
1355 We now know what we are dealing with in terms of unique chars and
1356 string sizes so we can calculate how much memory a naive
1357 representation using a flat table will take. If it's over a reasonable
1358 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1359 conservative but potentially much slower representation using an array
1362 At the end we convert both representations into the same compressed
1363 form that will be used in regexec.c for matching with. The latter
1364 is a form that cannot be used to construct with but has memory
1365 properties similar to the list form and access properties similar
1366 to the table form making it both suitable for fast searches and
1367 small enough that its feasable to store for the duration of a program.
1369 See the comment in the code where the compressed table is produced
1370 inplace from the flat tabe representation for an explanation of how
1371 the compression works.
1376 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1378 Second Pass -- Array Of Lists Representation
1380 Each state will be represented by a list of charid:state records
1381 (reg_trie_trans_le) the first such element holds the CUR and LEN
1382 points of the allocated array. (See defines above).
1384 We build the initial structure using the lists, and then convert
1385 it into the compressed table form which allows faster lookups
1386 (but cant be modified once converted).
1389 STRLEN transcount = 1;
1391 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1395 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1397 regnode * const noper = NEXTOPER( cur );
1398 U8 *uc = (U8*)STRING( noper );
1399 const U8 * const e = uc + STR_LEN( noper );
1400 U32 state = 1; /* required init */
1401 U16 charid = 0; /* sanity init */
1402 U8 *scan = (U8*)NULL; /* sanity init */
1403 STRLEN foldlen = 0; /* required init */
1404 U32 wordlen = 0; /* required init */
1405 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1407 if (OP(noper) != NOTHING) {
1408 for ( ; uc < e ; uc += len ) {
1413 charid = trie->charmap[ uvc ];
1415 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1419 charid=(U16)SvIV( *svpp );
1422 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1429 if ( !trie->states[ state ].trans.list ) {
1430 TRIE_LIST_NEW( state );
1432 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1433 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1434 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1439 newstate = next_alloc++;
1440 TRIE_LIST_PUSH( state, charid, newstate );
1445 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1449 TRIE_HANDLE_WORD(state);
1451 } /* end second pass */
1453 trie->laststate = next_alloc;
1454 Renew( trie->states, next_alloc, reg_trie_state );
1456 /* and now dump it out before we compress it */
1457 DEBUG_TRIE_COMPILE_MORE_r(
1458 dump_trie_interim_list(trie,next_alloc,depth+1)
1461 Newxz( trie->trans, transcount ,reg_trie_trans );
1468 for( state=1 ; state < next_alloc ; state ++ ) {
1472 DEBUG_TRIE_COMPILE_MORE_r(
1473 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1477 if (trie->states[state].trans.list) {
1478 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1482 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1483 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1484 if ( forid < minid ) {
1486 } else if ( forid > maxid ) {
1490 if ( transcount < tp + maxid - minid + 1) {
1492 Renew( trie->trans, transcount, reg_trie_trans );
1493 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1495 base = trie->uniquecharcount + tp - minid;
1496 if ( maxid == minid ) {
1498 for ( ; zp < tp ; zp++ ) {
1499 if ( ! trie->trans[ zp ].next ) {
1500 base = trie->uniquecharcount + zp - minid;
1501 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1502 trie->trans[ zp ].check = state;
1508 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1509 trie->trans[ tp ].check = state;
1514 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1515 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1516 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1517 trie->trans[ tid ].check = state;
1519 tp += ( maxid - minid + 1 );
1521 Safefree(trie->states[ state ].trans.list);
1524 DEBUG_TRIE_COMPILE_MORE_r(
1525 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1528 trie->states[ state ].trans.base=base;
1530 trie->lasttrans = tp + 1;
1534 Second Pass -- Flat Table Representation.
1536 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1537 We know that we will need Charcount+1 trans at most to store the data
1538 (one row per char at worst case) So we preallocate both structures
1539 assuming worst case.
1541 We then construct the trie using only the .next slots of the entry
1544 We use the .check field of the first entry of the node temporarily to
1545 make compression both faster and easier by keeping track of how many non
1546 zero fields are in the node.
1548 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1551 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1552 number representing the first entry of the node, and state as a
1553 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1554 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1555 are 2 entrys per node. eg:
1563 The table is internally in the right hand, idx form. However as we also
1564 have to deal with the states array which is indexed by nodenum we have to
1565 use TRIE_NODENUM() to convert.
1570 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1572 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1573 next_alloc = trie->uniquecharcount + 1;
1576 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1578 regnode * const noper = NEXTOPER( cur );
1579 const U8 *uc = (U8*)STRING( noper );
1580 const U8 * const e = uc + STR_LEN( noper );
1582 U32 state = 1; /* required init */
1584 U16 charid = 0; /* sanity init */
1585 U32 accept_state = 0; /* sanity init */
1586 U8 *scan = (U8*)NULL; /* sanity init */
1588 STRLEN foldlen = 0; /* required init */
1589 U32 wordlen = 0; /* required init */
1590 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1592 if ( OP(noper) != NOTHING ) {
1593 for ( ; uc < e ; uc += len ) {
1598 charid = trie->charmap[ uvc ];
1600 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1601 charid = svpp ? (U16)SvIV(*svpp) : 0;
1605 if ( !trie->trans[ state + charid ].next ) {
1606 trie->trans[ state + charid ].next = next_alloc;
1607 trie->trans[ state ].check++;
1608 next_alloc += trie->uniquecharcount;
1610 state = trie->trans[ state + charid ].next;
1612 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1614 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1617 accept_state = TRIE_NODENUM( state );
1618 TRIE_HANDLE_WORD(accept_state);
1620 } /* end second pass */
1622 /* and now dump it out before we compress it */
1623 DEBUG_TRIE_COMPILE_MORE_r(
1624 dump_trie_interim_table(trie,next_alloc,depth+1)
1629 * Inplace compress the table.*
1631 For sparse data sets the table constructed by the trie algorithm will
1632 be mostly 0/FAIL transitions or to put it another way mostly empty.
1633 (Note that leaf nodes will not contain any transitions.)
1635 This algorithm compresses the tables by eliminating most such
1636 transitions, at the cost of a modest bit of extra work during lookup:
1638 - Each states[] entry contains a .base field which indicates the
1639 index in the state[] array wheres its transition data is stored.
1641 - If .base is 0 there are no valid transitions from that node.
1643 - If .base is nonzero then charid is added to it to find an entry in
1646 -If trans[states[state].base+charid].check!=state then the
1647 transition is taken to be a 0/Fail transition. Thus if there are fail
1648 transitions at the front of the node then the .base offset will point
1649 somewhere inside the previous nodes data (or maybe even into a node
1650 even earlier), but the .check field determines if the transition is
1654 The following process inplace converts the table to the compressed
1655 table: We first do not compress the root node 1,and mark its all its
1656 .check pointers as 1 and set its .base pointer as 1 as well. This
1657 allows to do a DFA construction from the compressed table later, and
1658 ensures that any .base pointers we calculate later are greater than
1661 - We set 'pos' to indicate the first entry of the second node.
1663 - We then iterate over the columns of the node, finding the first and
1664 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1665 and set the .check pointers accordingly, and advance pos
1666 appropriately and repreat for the next node. Note that when we copy
1667 the next pointers we have to convert them from the original
1668 NODEIDX form to NODENUM form as the former is not valid post
1671 - If a node has no transitions used we mark its base as 0 and do not
1672 advance the pos pointer.
1674 - If a node only has one transition we use a second pointer into the
1675 structure to fill in allocated fail transitions from other states.
1676 This pointer is independent of the main pointer and scans forward
1677 looking for null transitions that are allocated to a state. When it
1678 finds one it writes the single transition into the "hole". If the
1679 pointer doesnt find one the single transition is appended as normal.
1681 - Once compressed we can Renew/realloc the structures to release the
1684 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1685 specifically Fig 3.47 and the associated pseudocode.
1689 const U32 laststate = TRIE_NODENUM( next_alloc );
1692 trie->laststate = laststate;
1694 for ( state = 1 ; state < laststate ; state++ ) {
1696 const U32 stateidx = TRIE_NODEIDX( state );
1697 const U32 o_used = trie->trans[ stateidx ].check;
1698 U32 used = trie->trans[ stateidx ].check;
1699 trie->trans[ stateidx ].check = 0;
1701 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1702 if ( flag || trie->trans[ stateidx + charid ].next ) {
1703 if ( trie->trans[ stateidx + charid ].next ) {
1705 for ( ; zp < pos ; zp++ ) {
1706 if ( ! trie->trans[ zp ].next ) {
1710 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1711 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1712 trie->trans[ zp ].check = state;
1713 if ( ++zp > pos ) pos = zp;
1720 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1722 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1723 trie->trans[ pos ].check = state;
1728 trie->lasttrans = pos + 1;
1729 Renew( trie->states, laststate + 1, reg_trie_state);
1730 DEBUG_TRIE_COMPILE_MORE_r(
1731 PerlIO_printf( Perl_debug_log,
1732 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1733 (int)depth * 2 + 2,"",
1734 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1737 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1740 } /* end table compress */
1742 /* resize the trans array to remove unused space */
1743 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1745 /* and now dump out the compressed format */
1746 DEBUG_TRIE_COMPILE_r(
1747 dump_trie(trie,depth+1)
1750 { /* Modify the program and insert the new TRIE node*/
1752 U8 nodetype =(U8)(flags & 0xFF);
1761 This means we convert either the first branch or the first Exact,
1762 depending on whether the thing following (in 'last') is a branch
1763 or not and whther first is the startbranch (ie is it a sub part of
1764 the alternation or is it the whole thing.)
1765 Assuming its a sub part we conver the EXACT otherwise we convert
1766 the whole branch sequence, including the first.
1768 /* Find the node we are going to overwrite */
1769 if ( first == startbranch && OP( last ) != BRANCH ) {
1770 /* whole branch chain */
1773 const regnode *nop = NEXTOPER( convert );
1774 mjd_offset= Node_Offset((nop));
1775 mjd_nodelen= Node_Length((nop));
1778 /* branch sub-chain */
1779 convert = NEXTOPER( first );
1780 NEXT_OFF( first ) = (U16)(last - first);
1782 mjd_offset= Node_Offset((convert));
1783 mjd_nodelen= Node_Length((convert));
1787 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1788 (int)depth * 2 + 2, "",
1789 (UV)mjd_offset, (UV)mjd_nodelen)
1792 /* But first we check to see if there is a common prefix we can
1793 split out as an EXACT and put in front of the TRIE node. */
1794 trie->startstate= 1;
1795 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1798 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1799 (int)depth * 2 + 2, "",
1800 (UV)trie->laststate)
1802 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
1806 const U32 base = trie->states[ state ].trans.base;
1808 if ( trie->states[state].wordnum )
1811 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1812 if ( ( base + ofs >= trie->uniquecharcount ) &&
1813 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1814 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1816 if ( ++count > 1 ) {
1817 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1818 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1819 if ( state == 1 ) break;
1821 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1823 PerlIO_printf(Perl_debug_log,
1824 "%*sNew Start State=%"UVuf" Class: [",
1825 (int)depth * 2 + 2, "",
1828 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1829 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1831 TRIE_BITMAP_SET(trie,*ch);
1833 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1835 PerlIO_printf(Perl_debug_log, (char*)ch)
1839 TRIE_BITMAP_SET(trie,*ch);
1841 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1842 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1848 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1849 const char *ch = SvPV_nolen_const( *tmp );
1851 PerlIO_printf( Perl_debug_log,
1852 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1853 (int)depth * 2 + 2, "",
1854 (UV)state, (UV)idx, ch)
1857 OP( convert ) = nodetype;
1858 str=STRING(convert);
1867 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1873 regnode *n = convert+NODE_SZ_STR(convert);
1874 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1875 trie->startstate = state;
1876 trie->minlen -= (state - 1);
1877 trie->maxlen -= (state - 1);
1879 regnode *fix = convert;
1881 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1882 while( ++fix < n ) {
1883 Set_Node_Offset_Length(fix, 0, 0);
1889 NEXT_OFF(convert) = (U16)(tail - convert);
1893 if ( trie->maxlen ) {
1894 NEXT_OFF( convert ) = (U16)(tail - convert);
1895 ARG_SET( convert, data_slot );
1896 /* Store the offset to the first unabsorbed branch in
1897 jump[0], which is otherwise unused by the jump logic.
1898 We use this when dumping a trie and during optimisation. */
1900 trie->jump[0] = (U16)(tail - nextbranch);
1904 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1905 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1907 OP( convert ) = TRIEC;
1908 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1909 Safefree(trie->bitmap);
1912 OP( convert ) = TRIE;
1914 /* store the type in the flags */
1915 convert->flags = nodetype;
1916 /* XXX We really should free up the resource in trie now, as we wont use them */
1918 /* needed for dumping*/
1920 regnode *optimize = convert
1922 + regarglen[ OP( convert ) ];
1923 regnode *opt = convert;
1924 while (++opt<optimize) {
1925 Set_Node_Offset_Length(opt,0,0);
1928 Try to clean up some of the debris left after the
1931 while( optimize < jumper ) {
1932 mjd_nodelen += Node_Length((optimize));
1933 OP( optimize ) = OPTIMIZED;
1934 Set_Node_Offset_Length(optimize,0,0);
1937 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1939 } /* end node insert */
1941 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1945 : trie->startstate>1
1951 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1953 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1955 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1956 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1959 We find the fail state for each state in the trie, this state is the longest proper
1960 suffix of the current states 'word' that is also a proper prefix of another word in our
1961 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1962 the DFA not to have to restart after its tried and failed a word at a given point, it
1963 simply continues as though it had been matching the other word in the first place.
1965 'abcdgu'=~/abcdefg|cdgu/
1966 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1967 fail, which would bring use to the state representing 'd' in the second word where we would
1968 try 'g' and succeed, prodceding to match 'cdgu'.
1970 /* add a fail transition */
1971 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1973 const U32 ucharcount = trie->uniquecharcount;
1974 const U32 numstates = trie->laststate;
1975 const U32 ubound = trie->lasttrans + ucharcount;
1979 U32 base = trie->states[ 1 ].trans.base;
1982 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1983 GET_RE_DEBUG_FLAGS_DECL;
1985 PERL_UNUSED_ARG(depth);
1989 ARG_SET( stclass, data_slot );
1990 Newxz( aho, 1, reg_ac_data );
1991 RExC_rx->data->data[ data_slot ] = (void*)aho;
1993 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1994 (trie->laststate+1)*sizeof(reg_trie_state));
1995 Newxz( q, numstates, U32);
1996 Newxz( aho->fail, numstates, U32 );
1999 /* initialize fail[0..1] to be 1 so that we always have
2000 a valid final fail state */
2001 fail[ 0 ] = fail[ 1 ] = 1;
2003 for ( charid = 0; charid < ucharcount ; charid++ ) {
2004 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2006 q[ q_write ] = newstate;
2007 /* set to point at the root */
2008 fail[ q[ q_write++ ] ]=1;
2011 while ( q_read < q_write) {
2012 const U32 cur = q[ q_read++ % numstates ];
2013 base = trie->states[ cur ].trans.base;
2015 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2016 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2018 U32 fail_state = cur;
2021 fail_state = fail[ fail_state ];
2022 fail_base = aho->states[ fail_state ].trans.base;
2023 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2025 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2026 fail[ ch_state ] = fail_state;
2027 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2029 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2031 q[ q_write++ % numstates] = ch_state;
2035 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2036 when we fail in state 1, this allows us to use the
2037 charclass scan to find a valid start char. This is based on the principle
2038 that theres a good chance the string being searched contains lots of stuff
2039 that cant be a start char.
2041 fail[ 0 ] = fail[ 1 ] = 0;
2042 DEBUG_TRIE_COMPILE_r({
2043 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2044 for( q_read=1; q_read<numstates; q_read++ ) {
2045 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2047 PerlIO_printf(Perl_debug_log, "\n");
2050 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2055 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2056 * These need to be revisited when a newer toolchain becomes available.
2058 #if defined(__sparc64__) && defined(__GNUC__)
2059 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2060 # undef SPARC64_GCC_WORKAROUND
2061 # define SPARC64_GCC_WORKAROUND 1
2065 #define DEBUG_PEEP(str,scan,depth) \
2066 DEBUG_OPTIMISE_r({ \
2067 SV * const mysv=sv_newmortal(); \
2068 regnode *Next = regnext(scan); \
2069 regprop(RExC_rx, mysv, scan); \
2070 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2071 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2072 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2079 #define JOIN_EXACT(scan,min,flags) \
2080 if (PL_regkind[OP(scan)] == EXACT) \
2081 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2084 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2085 /* Merge several consecutive EXACTish nodes into one. */
2086 regnode *n = regnext(scan);
2088 regnode *next = scan + NODE_SZ_STR(scan);
2092 regnode *stop = scan;
2093 GET_RE_DEBUG_FLAGS_DECL;
2095 PERL_UNUSED_ARG(depth);
2097 #ifndef EXPERIMENTAL_INPLACESCAN
2098 PERL_UNUSED_ARG(flags);
2099 PERL_UNUSED_ARG(val);
2101 DEBUG_PEEP("join",scan,depth);
2103 /* Skip NOTHING, merge EXACT*. */
2105 ( PL_regkind[OP(n)] == NOTHING ||
2106 (stringok && (OP(n) == OP(scan))))
2108 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2110 if (OP(n) == TAIL || n > next)
2112 if (PL_regkind[OP(n)] == NOTHING) {
2113 DEBUG_PEEP("skip:",n,depth);
2114 NEXT_OFF(scan) += NEXT_OFF(n);
2115 next = n + NODE_STEP_REGNODE;
2122 else if (stringok) {
2123 const unsigned int oldl = STR_LEN(scan);
2124 regnode * const nnext = regnext(n);
2126 DEBUG_PEEP("merg",n,depth);
2129 if (oldl + STR_LEN(n) > U8_MAX)
2131 NEXT_OFF(scan) += NEXT_OFF(n);
2132 STR_LEN(scan) += STR_LEN(n);
2133 next = n + NODE_SZ_STR(n);
2134 /* Now we can overwrite *n : */
2135 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2143 #ifdef EXPERIMENTAL_INPLACESCAN
2144 if (flags && !NEXT_OFF(n)) {
2145 DEBUG_PEEP("atch", val, depth);
2146 if (reg_off_by_arg[OP(n)]) {
2147 ARG_SET(n, val - n);
2150 NEXT_OFF(n) = val - n;
2157 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2159 Two problematic code points in Unicode casefolding of EXACT nodes:
2161 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2162 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2168 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2169 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2171 This means that in case-insensitive matching (or "loose matching",
2172 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2173 length of the above casefolded versions) can match a target string
2174 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2175 This would rather mess up the minimum length computation.
2177 What we'll do is to look for the tail four bytes, and then peek
2178 at the preceding two bytes to see whether we need to decrease
2179 the minimum length by four (six minus two).
2181 Thanks to the design of UTF-8, there cannot be false matches:
2182 A sequence of valid UTF-8 bytes cannot be a subsequence of
2183 another valid sequence of UTF-8 bytes.
2186 char * const s0 = STRING(scan), *s, *t;
2187 char * const s1 = s0 + STR_LEN(scan) - 1;
2188 char * const s2 = s1 - 4;
2189 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2190 const char t0[] = "\xaf\x49\xaf\x42";
2192 const char t0[] = "\xcc\x88\xcc\x81";
2194 const char * const t1 = t0 + 3;
2197 s < s2 && (t = ninstr(s, s1, t0, t1));
2200 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2201 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2203 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2204 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2212 n = scan + NODE_SZ_STR(scan);
2214 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2221 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2225 /* REx optimizer. Converts nodes into quickier variants "in place".
2226 Finds fixed substrings. */
2228 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2229 to the position after last scanned or to NULL. */
2234 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2235 I32 *minlenp, I32 *deltap,
2236 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2237 /* scanp: Start here (read-write). */
2238 /* deltap: Write maxlen-minlen here. */
2239 /* last: Stop before this one. */
2242 I32 min = 0, pars = 0, code;
2243 regnode *scan = *scanp, *next;
2245 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2246 int is_inf_internal = 0; /* The studied chunk is infinite */
2247 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2248 scan_data_t data_fake;
2249 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2250 SV *re_trie_maxbuff = NULL;
2251 regnode *first_non_open = scan;
2254 GET_RE_DEBUG_FLAGS_DECL;
2256 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2259 while (first_non_open && OP(first_non_open) == OPEN)
2260 first_non_open=regnext(first_non_open);
2264 while (scan && OP(scan) != END && scan < last) {
2265 /* Peephole optimizer: */
2266 DEBUG_STUDYDATA(data,depth);
2267 DEBUG_PEEP("Peep",scan,depth);
2268 JOIN_EXACT(scan,&min,0);
2270 /* Follow the next-chain of the current node and optimize
2271 away all the NOTHINGs from it. */
2272 if (OP(scan) != CURLYX) {
2273 const int max = (reg_off_by_arg[OP(scan)]
2275 /* I32 may be smaller than U16 on CRAYs! */
2276 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2277 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2281 /* Skip NOTHING and LONGJMP. */
2282 while ((n = regnext(n))
2283 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2284 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2285 && off + noff < max)
2287 if (reg_off_by_arg[OP(scan)])
2290 NEXT_OFF(scan) = off;
2295 /* The principal pseudo-switch. Cannot be a switch, since we
2296 look into several different things. */
2297 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2298 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2299 next = regnext(scan);
2301 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2303 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2304 /* NOTE - There is similar code to this block below for handling
2305 TRIE nodes on a re-study. If you change stuff here check there
2307 I32 max1 = 0, min1 = I32_MAX, num = 0;
2308 struct regnode_charclass_class accum;
2309 regnode * const startbranch=scan;
2311 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2312 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2313 if (flags & SCF_DO_STCLASS)
2314 cl_init_zero(pRExC_state, &accum);
2316 while (OP(scan) == code) {
2317 I32 deltanext, minnext, f = 0, fake;
2318 struct regnode_charclass_class this_class;
2321 data_fake.flags = 0;
2323 data_fake.whilem_c = data->whilem_c;
2324 data_fake.last_closep = data->last_closep;
2327 data_fake.last_closep = &fake;
2328 next = regnext(scan);
2329 scan = NEXTOPER(scan);
2331 scan = NEXTOPER(scan);
2332 if (flags & SCF_DO_STCLASS) {
2333 cl_init(pRExC_state, &this_class);
2334 data_fake.start_class = &this_class;
2335 f = SCF_DO_STCLASS_AND;
2337 if (flags & SCF_WHILEM_VISITED_POS)
2338 f |= SCF_WHILEM_VISITED_POS;
2340 /* we suppose the run is continuous, last=next...*/
2341 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2342 next, &data_fake, f,depth+1);
2345 if (max1 < minnext + deltanext)
2346 max1 = minnext + deltanext;
2347 if (deltanext == I32_MAX)
2348 is_inf = is_inf_internal = 1;
2350 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2353 if (data_fake.flags & SF_HAS_EVAL)
2354 data->flags |= SF_HAS_EVAL;
2355 data->whilem_c = data_fake.whilem_c;
2357 if (flags & SCF_DO_STCLASS)
2358 cl_or(pRExC_state, &accum, &this_class);
2359 if (code == SUSPEND)
2362 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2364 if (flags & SCF_DO_SUBSTR) {
2365 data->pos_min += min1;
2366 data->pos_delta += max1 - min1;
2367 if (max1 != min1 || is_inf)
2368 data->longest = &(data->longest_float);
2371 delta += max1 - min1;
2372 if (flags & SCF_DO_STCLASS_OR) {
2373 cl_or(pRExC_state, data->start_class, &accum);
2375 cl_and(data->start_class, &and_with);
2376 flags &= ~SCF_DO_STCLASS;
2379 else if (flags & SCF_DO_STCLASS_AND) {
2381 cl_and(data->start_class, &accum);
2382 flags &= ~SCF_DO_STCLASS;
2385 /* Switch to OR mode: cache the old value of
2386 * data->start_class */
2387 StructCopy(data->start_class, &and_with,
2388 struct regnode_charclass_class);
2389 flags &= ~SCF_DO_STCLASS_AND;
2390 StructCopy(&accum, data->start_class,
2391 struct regnode_charclass_class);
2392 flags |= SCF_DO_STCLASS_OR;
2393 data->start_class->flags |= ANYOF_EOS;
2397 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2400 Assuming this was/is a branch we are dealing with: 'scan' now
2401 points at the item that follows the branch sequence, whatever
2402 it is. We now start at the beginning of the sequence and look
2409 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2411 If we can find such a subseqence we need to turn the first
2412 element into a trie and then add the subsequent branch exact
2413 strings to the trie.
2417 1. patterns where the whole set of branch can be converted.
2419 2. patterns where only a subset can be converted.
2421 In case 1 we can replace the whole set with a single regop
2422 for the trie. In case 2 we need to keep the start and end
2425 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2426 becomes BRANCH TRIE; BRANCH X;
2428 There is an additional case, that being where there is a
2429 common prefix, which gets split out into an EXACT like node
2430 preceding the TRIE node.
2432 If x(1..n)==tail then we can do a simple trie, if not we make
2433 a "jump" trie, such that when we match the appropriate word
2434 we "jump" to the appopriate tail node. Essentailly we turn
2435 a nested if into a case structure of sorts.
2440 if (!re_trie_maxbuff) {
2441 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2442 if (!SvIOK(re_trie_maxbuff))
2443 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2445 if ( SvIV(re_trie_maxbuff)>=0 ) {
2447 regnode *first = (regnode *)NULL;
2448 regnode *last = (regnode *)NULL;
2449 regnode *tail = scan;
2454 SV * const mysv = sv_newmortal(); /* for dumping */
2456 /* var tail is used because there may be a TAIL
2457 regop in the way. Ie, the exacts will point to the
2458 thing following the TAIL, but the last branch will
2459 point at the TAIL. So we advance tail. If we
2460 have nested (?:) we may have to move through several
2464 while ( OP( tail ) == TAIL ) {
2465 /* this is the TAIL generated by (?:) */
2466 tail = regnext( tail );
2471 regprop(RExC_rx, mysv, tail );
2472 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2473 (int)depth * 2 + 2, "",
2474 "Looking for TRIE'able sequences. Tail node is: ",
2475 SvPV_nolen_const( mysv )
2481 step through the branches, cur represents each
2482 branch, noper is the first thing to be matched
2483 as part of that branch and noper_next is the
2484 regnext() of that node. if noper is an EXACT
2485 and noper_next is the same as scan (our current
2486 position in the regex) then the EXACT branch is
2487 a possible optimization target. Once we have
2488 two or more consequetive such branches we can
2489 create a trie of the EXACT's contents and stich
2490 it in place. If the sequence represents all of
2491 the branches we eliminate the whole thing and
2492 replace it with a single TRIE. If it is a
2493 subsequence then we need to stitch it in. This
2494 means the first branch has to remain, and needs
2495 to be repointed at the item on the branch chain
2496 following the last branch optimized. This could
2497 be either a BRANCH, in which case the
2498 subsequence is internal, or it could be the
2499 item following the branch sequence in which
2500 case the subsequence is at the end.
2504 /* dont use tail as the end marker for this traverse */
2505 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2506 regnode * const noper = NEXTOPER( cur );
2507 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2508 regnode * const noper_next = regnext( noper );
2512 regprop(RExC_rx, mysv, cur);
2513 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2514 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2516 regprop(RExC_rx, mysv, noper);
2517 PerlIO_printf( Perl_debug_log, " -> %s",
2518 SvPV_nolen_const(mysv));
2521 regprop(RExC_rx, mysv, noper_next );
2522 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2523 SvPV_nolen_const(mysv));
2525 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2526 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2528 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2529 : PL_regkind[ OP( noper ) ] == EXACT )
2530 || OP(noper) == NOTHING )
2532 && noper_next == tail
2537 if ( !first || optype == NOTHING ) {
2538 if (!first) first = cur;
2539 optype = OP( noper );
2545 make_trie( pRExC_state,
2546 startbranch, first, cur, tail, count,
2549 if ( PL_regkind[ OP( noper ) ] == EXACT
2551 && noper_next == tail
2556 optype = OP( noper );
2566 regprop(RExC_rx, mysv, cur);
2567 PerlIO_printf( Perl_debug_log,
2568 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2569 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2573 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2574 #ifdef TRIE_STUDY_OPT
2575 if ( ((made == MADE_EXACT_TRIE &&
2576 startbranch == first)
2577 || ( first_non_open == first )) &&
2579 flags |= SCF_TRIE_RESTUDY;
2587 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2588 scan = NEXTOPER(NEXTOPER(scan));
2589 } else /* single branch is optimized. */
2590 scan = NEXTOPER(scan);
2593 else if (OP(scan) == EXACT) {
2594 I32 l = STR_LEN(scan);
2597 const U8 * const s = (U8*)STRING(scan);
2598 l = utf8_length(s, s + l);
2599 uc = utf8_to_uvchr(s, NULL);
2601 uc = *((U8*)STRING(scan));
2604 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2605 /* The code below prefers earlier match for fixed
2606 offset, later match for variable offset. */
2607 if (data->last_end == -1) { /* Update the start info. */
2608 data->last_start_min = data->pos_min;
2609 data->last_start_max = is_inf
2610 ? I32_MAX : data->pos_min + data->pos_delta;
2612 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2614 SvUTF8_on(data->last_found);
2616 SV * const sv = data->last_found;
2617 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2618 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2619 if (mg && mg->mg_len >= 0)
2620 mg->mg_len += utf8_length((U8*)STRING(scan),
2621 (U8*)STRING(scan)+STR_LEN(scan));
2623 data->last_end = data->pos_min + l;
2624 data->pos_min += l; /* As in the first entry. */
2625 data->flags &= ~SF_BEFORE_EOL;
2627 if (flags & SCF_DO_STCLASS_AND) {
2628 /* Check whether it is compatible with what we know already! */
2632 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2633 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2634 && (!(data->start_class->flags & ANYOF_FOLD)
2635 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2638 ANYOF_CLASS_ZERO(data->start_class);
2639 ANYOF_BITMAP_ZERO(data->start_class);
2641 ANYOF_BITMAP_SET(data->start_class, uc);
2642 data->start_class->flags &= ~ANYOF_EOS;
2644 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2646 else if (flags & SCF_DO_STCLASS_OR) {
2647 /* false positive possible if the class is case-folded */
2649 ANYOF_BITMAP_SET(data->start_class, uc);
2651 data->start_class->flags |= ANYOF_UNICODE_ALL;
2652 data->start_class->flags &= ~ANYOF_EOS;
2653 cl_and(data->start_class, &and_with);
2655 flags &= ~SCF_DO_STCLASS;
2657 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2658 I32 l = STR_LEN(scan);
2659 UV uc = *((U8*)STRING(scan));
2661 /* Search for fixed substrings supports EXACT only. */
2662 if (flags & SCF_DO_SUBSTR) {
2664 scan_commit(pRExC_state, data, minlenp);
2667 const U8 * const s = (U8 *)STRING(scan);
2668 l = utf8_length(s, s + l);
2669 uc = utf8_to_uvchr(s, NULL);
2672 if (flags & SCF_DO_SUBSTR)
2674 if (flags & SCF_DO_STCLASS_AND) {
2675 /* Check whether it is compatible with what we know already! */
2679 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2680 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2681 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2683 ANYOF_CLASS_ZERO(data->start_class);
2684 ANYOF_BITMAP_ZERO(data->start_class);
2686 ANYOF_BITMAP_SET(data->start_class, uc);
2687 data->start_class->flags &= ~ANYOF_EOS;
2688 data->start_class->flags |= ANYOF_FOLD;
2689 if (OP(scan) == EXACTFL)
2690 data->start_class->flags |= ANYOF_LOCALE;
2693 else if (flags & SCF_DO_STCLASS_OR) {
2694 if (data->start_class->flags & ANYOF_FOLD) {
2695 /* false positive possible if the class is case-folded.
2696 Assume that the locale settings are the same... */
2698 ANYOF_BITMAP_SET(data->start_class, uc);
2699 data->start_class->flags &= ~ANYOF_EOS;
2701 cl_and(data->start_class, &and_with);
2703 flags &= ~SCF_DO_STCLASS;
2705 else if (strchr((const char*)PL_varies,OP(scan))) {
2706 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2707 I32 f = flags, pos_before = 0;
2708 regnode * const oscan = scan;
2709 struct regnode_charclass_class this_class;
2710 struct regnode_charclass_class *oclass = NULL;
2711 I32 next_is_eval = 0;
2713 switch (PL_regkind[OP(scan)]) {
2714 case WHILEM: /* End of (?:...)* . */
2715 scan = NEXTOPER(scan);
2718 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2719 next = NEXTOPER(scan);
2720 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2722 maxcount = REG_INFTY;
2723 next = regnext(scan);
2724 scan = NEXTOPER(scan);
2728 if (flags & SCF_DO_SUBSTR)
2733 if (flags & SCF_DO_STCLASS) {
2735 maxcount = REG_INFTY;
2736 next = regnext(scan);
2737 scan = NEXTOPER(scan);
2740 is_inf = is_inf_internal = 1;
2741 scan = regnext(scan);
2742 if (flags & SCF_DO_SUBSTR) {
2743 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2744 data->longest = &(data->longest_float);
2746 goto optimize_curly_tail;
2748 mincount = ARG1(scan);
2749 maxcount = ARG2(scan);
2750 next = regnext(scan);
2751 if (OP(scan) == CURLYX) {
2752 I32 lp = (data ? *(data->last_closep) : 0);
2753 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2755 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2756 next_is_eval = (OP(scan) == EVAL);
2758 if (flags & SCF_DO_SUBSTR) {
2759 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2760 pos_before = data->pos_min;
2764 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2766 data->flags |= SF_IS_INF;
2768 if (flags & SCF_DO_STCLASS) {
2769 cl_init(pRExC_state, &this_class);
2770 oclass = data->start_class;
2771 data->start_class = &this_class;
2772 f |= SCF_DO_STCLASS_AND;
2773 f &= ~SCF_DO_STCLASS_OR;
2775 /* These are the cases when once a subexpression
2776 fails at a particular position, it cannot succeed
2777 even after backtracking at the enclosing scope.
2779 XXXX what if minimal match and we are at the
2780 initial run of {n,m}? */
2781 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2782 f &= ~SCF_WHILEM_VISITED_POS;
2784 /* This will finish on WHILEM, setting scan, or on NULL: */
2785 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
2787 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2789 if (flags & SCF_DO_STCLASS)
2790 data->start_class = oclass;
2791 if (mincount == 0 || minnext == 0) {
2792 if (flags & SCF_DO_STCLASS_OR) {
2793 cl_or(pRExC_state, data->start_class, &this_class);
2795 else if (flags & SCF_DO_STCLASS_AND) {
2796 /* Switch to OR mode: cache the old value of
2797 * data->start_class */
2798 StructCopy(data->start_class, &and_with,
2799 struct regnode_charclass_class);
2800 flags &= ~SCF_DO_STCLASS_AND;
2801 StructCopy(&this_class, data->start_class,
2802 struct regnode_charclass_class);
2803 flags |= SCF_DO_STCLASS_OR;
2804 data->start_class->flags |= ANYOF_EOS;
2806 } else { /* Non-zero len */
2807 if (flags & SCF_DO_STCLASS_OR) {
2808 cl_or(pRExC_state, data->start_class, &this_class);
2809 cl_and(data->start_class, &and_with);
2811 else if (flags & SCF_DO_STCLASS_AND)
2812 cl_and(data->start_class, &this_class);
2813 flags &= ~SCF_DO_STCLASS;
2815 if (!scan) /* It was not CURLYX, but CURLY. */
2817 if ( /* ? quantifier ok, except for (?{ ... }) */
2818 (next_is_eval || !(mincount == 0 && maxcount == 1))
2819 && (minnext == 0) && (deltanext == 0)
2820 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2821 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2822 && ckWARN(WARN_REGEXP))
2825 "Quantifier unexpected on zero-length expression");
2828 min += minnext * mincount;
2829 is_inf_internal |= ((maxcount == REG_INFTY
2830 && (minnext + deltanext) > 0)
2831 || deltanext == I32_MAX);
2832 is_inf |= is_inf_internal;
2833 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2835 /* Try powerful optimization CURLYX => CURLYN. */
2836 if ( OP(oscan) == CURLYX && data
2837 && data->flags & SF_IN_PAR
2838 && !(data->flags & SF_HAS_EVAL)
2839 && !deltanext && minnext == 1 ) {
2840 /* Try to optimize to CURLYN. */
2841 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2842 regnode * const nxt1 = nxt;
2849 if (!strchr((const char*)PL_simple,OP(nxt))
2850 && !(PL_regkind[OP(nxt)] == EXACT
2851 && STR_LEN(nxt) == 1))
2857 if (OP(nxt) != CLOSE)
2859 /* Now we know that nxt2 is the only contents: */
2860 oscan->flags = (U8)ARG(nxt);
2862 OP(nxt1) = NOTHING; /* was OPEN. */
2864 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2865 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2866 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2867 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2868 OP(nxt + 1) = OPTIMIZED; /* was count. */
2869 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2874 /* Try optimization CURLYX => CURLYM. */
2875 if ( OP(oscan) == CURLYX && data
2876 && !(data->flags & SF_HAS_PAR)
2877 && !(data->flags & SF_HAS_EVAL)
2878 && !deltanext /* atom is fixed width */
2879 && minnext != 0 /* CURLYM can't handle zero width */
2881 /* XXXX How to optimize if data == 0? */
2882 /* Optimize to a simpler form. */
2883 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2887 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2888 && (OP(nxt2) != WHILEM))
2890 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2891 /* Need to optimize away parenths. */
2892 if (data->flags & SF_IN_PAR) {
2893 /* Set the parenth number. */
2894 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2896 if (OP(nxt) != CLOSE)
2897 FAIL("Panic opt close");
2898 oscan->flags = (U8)ARG(nxt);
2899 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2900 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2902 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2903 OP(nxt + 1) = OPTIMIZED; /* was count. */
2904 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2905 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2908 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2909 regnode *nnxt = regnext(nxt1);
2912 if (reg_off_by_arg[OP(nxt1)])
2913 ARG_SET(nxt1, nxt2 - nxt1);
2914 else if (nxt2 - nxt1 < U16_MAX)
2915 NEXT_OFF(nxt1) = nxt2 - nxt1;
2917 OP(nxt) = NOTHING; /* Cannot beautify */
2922 /* Optimize again: */
2923 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2929 else if ((OP(oscan) == CURLYX)
2930 && (flags & SCF_WHILEM_VISITED_POS)
2931 /* See the comment on a similar expression above.
2932 However, this time it not a subexpression
2933 we care about, but the expression itself. */
2934 && (maxcount == REG_INFTY)
2935 && data && ++data->whilem_c < 16) {
2936 /* This stays as CURLYX, we can put the count/of pair. */
2937 /* Find WHILEM (as in regexec.c) */
2938 regnode *nxt = oscan + NEXT_OFF(oscan);
2940 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2942 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2943 | (RExC_whilem_seen << 4)); /* On WHILEM */
2945 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2947 if (flags & SCF_DO_SUBSTR) {
2948 SV *last_str = NULL;
2949 int counted = mincount != 0;
2951 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2952 #if defined(SPARC64_GCC_WORKAROUND)
2955 const char *s = NULL;
2958 if (pos_before >= data->last_start_min)
2961 b = data->last_start_min;
2964 s = SvPV_const(data->last_found, l);
2965 old = b - data->last_start_min;
2968 I32 b = pos_before >= data->last_start_min
2969 ? pos_before : data->last_start_min;
2971 const char * const s = SvPV_const(data->last_found, l);
2972 I32 old = b - data->last_start_min;
2976 old = utf8_hop((U8*)s, old) - (U8*)s;
2979 /* Get the added string: */
2980 last_str = newSVpvn(s + old, l);
2982 SvUTF8_on(last_str);
2983 if (deltanext == 0 && pos_before == b) {
2984 /* What was added is a constant string */
2986 SvGROW(last_str, (mincount * l) + 1);
2987 repeatcpy(SvPVX(last_str) + l,
2988 SvPVX_const(last_str), l, mincount - 1);
2989 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2990 /* Add additional parts. */
2991 SvCUR_set(data->last_found,
2992 SvCUR(data->last_found) - l);
2993 sv_catsv(data->last_found, last_str);
2995 SV * sv = data->last_found;
2997 SvUTF8(sv) && SvMAGICAL(sv) ?
2998 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2999 if (mg && mg->mg_len >= 0)
3000 mg->mg_len += CHR_SVLEN(last_str);
3002 data->last_end += l * (mincount - 1);
3005 /* start offset must point into the last copy */
3006 data->last_start_min += minnext * (mincount - 1);
3007 data->last_start_max += is_inf ? I32_MAX
3008 : (maxcount - 1) * (minnext + data->pos_delta);
3011 /* It is counted once already... */
3012 data->pos_min += minnext * (mincount - counted);
3013 data->pos_delta += - counted * deltanext +
3014 (minnext + deltanext) * maxcount - minnext * mincount;
3015 if (mincount != maxcount) {
3016 /* Cannot extend fixed substrings found inside
3018 scan_commit(pRExC_state,data,minlenp);
3019 if (mincount && last_str) {
3020 SV * const sv = data->last_found;
3021 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3022 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3026 sv_setsv(sv, last_str);
3027 data->last_end = data->pos_min;
3028 data->last_start_min =
3029 data->pos_min - CHR_SVLEN(last_str);
3030 data->last_start_max = is_inf
3032 : data->pos_min + data->pos_delta
3033 - CHR_SVLEN(last_str);
3035 data->longest = &(data->longest_float);
3037 SvREFCNT_dec(last_str);
3039 if (data && (fl & SF_HAS_EVAL))
3040 data->flags |= SF_HAS_EVAL;
3041 optimize_curly_tail:
3042 if (OP(oscan) != CURLYX) {
3043 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3045 NEXT_OFF(oscan) += NEXT_OFF(next);
3048 default: /* REF and CLUMP only? */
3049 if (flags & SCF_DO_SUBSTR) {
3050 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3051 data->longest = &(data->longest_float);
3053 is_inf = is_inf_internal = 1;
3054 if (flags & SCF_DO_STCLASS_OR)
3055 cl_anything(pRExC_state, data->start_class);
3056 flags &= ~SCF_DO_STCLASS;
3060 else if (strchr((const char*)PL_simple,OP(scan))) {
3063 if (flags & SCF_DO_SUBSTR) {
3064 scan_commit(pRExC_state,data,minlenp);
3068 if (flags & SCF_DO_STCLASS) {
3069 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3071 /* Some of the logic below assumes that switching
3072 locale on will only add false positives. */
3073 switch (PL_regkind[OP(scan)]) {
3077 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3078 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3079 cl_anything(pRExC_state, data->start_class);
3082 if (OP(scan) == SANY)
3084 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3085 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3086 || (data->start_class->flags & ANYOF_CLASS));
3087 cl_anything(pRExC_state, data->start_class);
3089 if (flags & SCF_DO_STCLASS_AND || !value)
3090 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3093 if (flags & SCF_DO_STCLASS_AND)
3094 cl_and(data->start_class,
3095 (struct regnode_charclass_class*)scan);
3097 cl_or(pRExC_state, data->start_class,
3098 (struct regnode_charclass_class*)scan);
3101 if (flags & SCF_DO_STCLASS_AND) {
3102 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3103 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3104 for (value = 0; value < 256; value++)
3105 if (!isALNUM(value))
3106 ANYOF_BITMAP_CLEAR(data->start_class, value);
3110 if (data->start_class->flags & ANYOF_LOCALE)
3111 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3113 for (value = 0; value < 256; value++)
3115 ANYOF_BITMAP_SET(data->start_class, value);
3120 if (flags & SCF_DO_STCLASS_AND) {
3121 if (data->start_class->flags & ANYOF_LOCALE)
3122 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3125 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3126 data->start_class->flags |= ANYOF_LOCALE;
3130 if (flags & SCF_DO_STCLASS_AND) {
3131 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3132 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3133 for (value = 0; value < 256; value++)
3135 ANYOF_BITMAP_CLEAR(data->start_class, value);
3139 if (data->start_class->flags & ANYOF_LOCALE)
3140 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3142 for (value = 0; value < 256; value++)
3143 if (!isALNUM(value))
3144 ANYOF_BITMAP_SET(data->start_class, value);
3149 if (flags & SCF_DO_STCLASS_AND) {
3150 if (data->start_class->flags & ANYOF_LOCALE)
3151 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3154 data->start_class->flags |= ANYOF_LOCALE;
3155 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3159 if (flags & SCF_DO_STCLASS_AND) {
3160 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3161 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3162 for (value = 0; value < 256; value++)
3163 if (!isSPACE(value))
3164 ANYOF_BITMAP_CLEAR(data->start_class, value);
3168 if (data->start_class->flags & ANYOF_LOCALE)
3169 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3171 for (value = 0; value < 256; value++)
3173 ANYOF_BITMAP_SET(data->start_class, value);
3178 if (flags & SCF_DO_STCLASS_AND) {
3179 if (data->start_class->flags & ANYOF_LOCALE)
3180 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3183 data->start_class->flags |= ANYOF_LOCALE;
3184 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3188 if (flags & SCF_DO_STCLASS_AND) {
3189 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3190 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3191 for (value = 0; value < 256; value++)
3193 ANYOF_BITMAP_CLEAR(data->start_class, value);
3197 if (data->start_class->flags & ANYOF_LOCALE)
3198 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3200 for (value = 0; value < 256; value++)
3201 if (!isSPACE(value))
3202 ANYOF_BITMAP_SET(data->start_class, value);
3207 if (flags & SCF_DO_STCLASS_AND) {
3208 if (data->start_class->flags & ANYOF_LOCALE) {
3209 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3210 for (value = 0; value < 256; value++)
3211 if (!isSPACE(value))
3212 ANYOF_BITMAP_CLEAR(data->start_class, value);
3216 data->start_class->flags |= ANYOF_LOCALE;
3217 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3221 if (flags & SCF_DO_STCLASS_AND) {
3222 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3223 for (value = 0; value < 256; value++)
3224 if (!isDIGIT(value))
3225 ANYOF_BITMAP_CLEAR(data->start_class, value);
3228 if (data->start_class->flags & ANYOF_LOCALE)
3229 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3231 for (value = 0; value < 256; value++)
3233 ANYOF_BITMAP_SET(data->start_class, value);
3238 if (flags & SCF_DO_STCLASS_AND) {
3239 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3240 for (value = 0; value < 256; value++)
3242 ANYOF_BITMAP_CLEAR(data->start_class, value);
3245 if (data->start_class->flags & ANYOF_LOCALE)
3246 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3248 for (value = 0; value < 256; value++)
3249 if (!isDIGIT(value))
3250 ANYOF_BITMAP_SET(data->start_class, value);
3255 if (flags & SCF_DO_STCLASS_OR)
3256 cl_and(data->start_class, &and_with);
3257 flags &= ~SCF_DO_STCLASS;
3260 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3261 data->flags |= (OP(scan) == MEOL
3265 else if ( PL_regkind[OP(scan)] == BRANCHJ
3266 /* Lookbehind, or need to calculate parens/evals/stclass: */
3267 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3268 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3269 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3270 || OP(scan) == UNLESSM )
3272 /* Negative Lookahead/lookbehind
3273 In this case we can't do fixed string optimisation.
3276 I32 deltanext, minnext, fake = 0;
3278 struct regnode_charclass_class intrnl;
3281 data_fake.flags = 0;
3283 data_fake.whilem_c = data->whilem_c;
3284 data_fake.last_closep = data->last_closep;
3287 data_fake.last_closep = &fake;
3288 if ( flags & SCF_DO_STCLASS && !scan->flags
3289 && OP(scan) == IFMATCH ) { /* Lookahead */
3290 cl_init(pRExC_state, &intrnl);
3291 data_fake.start_class = &intrnl;
3292 f |= SCF_DO_STCLASS_AND;
3294 if (flags & SCF_WHILEM_VISITED_POS)
3295 f |= SCF_WHILEM_VISITED_POS;
3296 next = regnext(scan);
3297 nscan = NEXTOPER(NEXTOPER(scan));
3298 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3301 vFAIL("Variable length lookbehind not implemented");
3303 else if (minnext > (I32)U8_MAX) {
3304 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3306 scan->flags = (U8)minnext;
3309 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3311 if (data_fake.flags & SF_HAS_EVAL)
3312 data->flags |= SF_HAS_EVAL;
3313 data->whilem_c = data_fake.whilem_c;
3315 if (f & SCF_DO_STCLASS_AND) {
3316 const int was = (data->start_class->flags & ANYOF_EOS);
3318 cl_and(data->start_class, &intrnl);
3320 data->start_class->flags |= ANYOF_EOS;
3323 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3325 /* Positive Lookahead/lookbehind
3326 In this case we can do fixed string optimisation,
3327 but we must be careful about it. Note in the case of
3328 lookbehind the positions will be offset by the minimum
3329 length of the pattern, something we won't know about
3330 until after the recurse.
3332 I32 deltanext, fake = 0;
3334 struct regnode_charclass_class intrnl;
3336 /* We use SAVEFREEPV so that when the full compile
3337 is finished perl will clean up the allocated
3338 minlens when its all done. This was we don't
3339 have to worry about freeing them when we know
3340 they wont be used, which would be a pain.
3343 Newx( minnextp, 1, I32 );
3344 SAVEFREEPV(minnextp);
3347 StructCopy(data, &data_fake, scan_data_t);
3348 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3351 scan_commit(pRExC_state, &data_fake,minlenp);
3352 data_fake.last_found=newSVsv(data->last_found);
3356 data_fake.last_closep = &fake;
3357 data_fake.flags = 0;
3359 data_fake.flags |= SF_IS_INF;
3360 if ( flags & SCF_DO_STCLASS && !scan->flags
3361 && OP(scan) == IFMATCH ) { /* Lookahead */
3362 cl_init(pRExC_state, &intrnl);
3363 data_fake.start_class = &intrnl;
3364 f |= SCF_DO_STCLASS_AND;
3366 if (flags & SCF_WHILEM_VISITED_POS)
3367 f |= SCF_WHILEM_VISITED_POS;
3368 next = regnext(scan);
3369 nscan = NEXTOPER(NEXTOPER(scan));
3371 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3374 vFAIL("Variable length lookbehind not implemented");
3376 else if (*minnextp > (I32)U8_MAX) {
3377 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3379 scan->flags = (U8)*minnextp;
3385 if (f & SCF_DO_STCLASS_AND) {
3386 const int was = (data->start_class->flags & ANYOF_EOS);
3388 cl_and(data->start_class, &intrnl);
3390 data->start_class->flags |= ANYOF_EOS;
3393 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3395 if (data_fake.flags & SF_HAS_EVAL)
3396 data->flags |= SF_HAS_EVAL;
3397 data->whilem_c = data_fake.whilem_c;
3398 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3399 if (RExC_rx->minlen<*minnextp)
3400 RExC_rx->minlen=*minnextp;
3401 scan_commit(pRExC_state, &data_fake, minnextp);
3402 SvREFCNT_dec(data_fake.last_found);
3404 if ( data_fake.minlen_fixed != minlenp )
3406 data->offset_fixed= data_fake.offset_fixed;
3407 data->minlen_fixed= data_fake.minlen_fixed;
3408 data->lookbehind_fixed+= scan->flags;
3410 if ( data_fake.minlen_float != minlenp )
3412 data->minlen_float= data_fake.minlen_float;
3413 data->offset_float_min=data_fake.offset_float_min;
3414 data->offset_float_max=data_fake.offset_float_max;
3415 data->lookbehind_float+= scan->flags;
3424 else if (OP(scan) == OPEN) {
3427 else if (OP(scan) == CLOSE) {
3428 if ((I32)ARG(scan) == is_par) {
3429 next = regnext(scan);
3431 if ( next && (OP(next) != WHILEM) && next < last)
3432 is_par = 0; /* Disable optimization */
3435 *(data->last_closep) = ARG(scan);
3437 else if (OP(scan) == EVAL) {
3439 data->flags |= SF_HAS_EVAL;
3441 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3442 if (flags & SCF_DO_SUBSTR) {
3443 scan_commit(pRExC_state,data,minlenp);
3444 data->longest = &(data->longest_float);
3446 is_inf = is_inf_internal = 1;
3447 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3448 cl_anything(pRExC_state, data->start_class);
3449 flags &= ~SCF_DO_STCLASS;
3451 #ifdef TRIE_STUDY_OPT
3452 #ifdef FULL_TRIE_STUDY
3453 else if (PL_regkind[OP(scan)] == TRIE) {
3454 /* NOTE - There is similar code to this block above for handling
3455 BRANCH nodes on the initial study. If you change stuff here
3457 regnode *tail= regnext(scan);
3458 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3459 I32 max1 = 0, min1 = I32_MAX;
3460 struct regnode_charclass_class accum;
3462 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3463 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3464 if (flags & SCF_DO_STCLASS)
3465 cl_init_zero(pRExC_state, &accum);
3471 const regnode *nextbranch= NULL;
3474 for ( word=1 ; word <= trie->wordcount ; word++)
3476 I32 deltanext=0, minnext=0, f = 0, fake;
3477 struct regnode_charclass_class this_class;
3479 data_fake.flags = 0;
3481 data_fake.whilem_c = data->whilem_c;
3482 data_fake.last_closep = data->last_closep;
3485 data_fake.last_closep = &fake;
3487 if (flags & SCF_DO_STCLASS) {
3488 cl_init(pRExC_state, &this_class);
3489 data_fake.start_class = &this_class;
3490 f = SCF_DO_STCLASS_AND;
3492 if (flags & SCF_WHILEM_VISITED_POS)
3493 f |= SCF_WHILEM_VISITED_POS;
3495 if (trie->jump[word]) {
3497 nextbranch = tail - trie->jump[0];
3498 scan= tail - trie->jump[word];
3499 /* We go from the jump point to the branch that follows
3500 it. Note this means we need the vestigal unused branches
3501 even though they arent otherwise used.
3503 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3504 (regnode *)nextbranch, &data_fake, f,depth+1);
3506 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3507 nextbranch= regnext((regnode*)nextbranch);
3509 if (min1 > (I32)(minnext + trie->minlen))
3510 min1 = minnext + trie->minlen;
3511 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3512 max1 = minnext + deltanext + trie->maxlen;
3513 if (deltanext == I32_MAX)
3514 is_inf = is_inf_internal = 1;
3516 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3520 if (data_fake.flags & SF_HAS_EVAL)
3521 data->flags |= SF_HAS_EVAL;
3522 data->whilem_c = data_fake.whilem_c;
3524 if (flags & SCF_DO_STCLASS)
3525 cl_or(pRExC_state, &accum, &this_class);
3528 if (flags & SCF_DO_SUBSTR) {
3529 data->pos_min += min1;
3530 data->pos_delta += max1 - min1;
3531 if (max1 != min1 || is_inf)
3532 data->longest = &(data->longest_float);
3535 delta += max1 - min1;
3536 if (flags & SCF_DO_STCLASS_OR) {
3537 cl_or(pRExC_state, data->start_class, &accum);
3539 cl_and(data->start_class, &and_with);
3540 flags &= ~SCF_DO_STCLASS;
3543 else if (flags & SCF_DO_STCLASS_AND) {
3545 cl_and(data->start_class, &accum);
3546 flags &= ~SCF_DO_STCLASS;
3549 /* Switch to OR mode: cache the old value of
3550 * data->start_class */
3551 StructCopy(data->start_class, &and_with,
3552 struct regnode_charclass_class);
3553 flags &= ~SCF_DO_STCLASS_AND;
3554 StructCopy(&accum, data->start_class,
3555 struct regnode_charclass_class);
3556 flags |= SCF_DO_STCLASS_OR;
3557 data->start_class->flags |= ANYOF_EOS;
3564 else if (PL_regkind[OP(scan)] == TRIE) {
3565 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3568 min += trie->minlen;
3569 delta += (trie->maxlen - trie->minlen);
3570 flags &= ~SCF_DO_STCLASS; /* xxx */
3571 if (flags & SCF_DO_SUBSTR) {
3572 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3573 data->pos_min += trie->minlen;
3574 data->pos_delta += (trie->maxlen - trie->minlen);
3575 if (trie->maxlen != trie->minlen)
3576 data->longest = &(data->longest_float);
3578 if (trie->jump) /* no more substrings -- for now /grr*/
3579 flags &= ~SCF_DO_SUBSTR;
3581 #endif /* old or new */
3582 #endif /* TRIE_STUDY_OPT */
3583 /* Else: zero-length, ignore. */
3584 scan = regnext(scan);
3589 *deltap = is_inf_internal ? I32_MAX : delta;
3590 if (flags & SCF_DO_SUBSTR && is_inf)
3591 data->pos_delta = I32_MAX - data->pos_min;
3592 if (is_par > (I32)U8_MAX)
3594 if (is_par && pars==1 && data) {
3595 data->flags |= SF_IN_PAR;
3596 data->flags &= ~SF_HAS_PAR;
3598 else if (pars && data) {
3599 data->flags |= SF_HAS_PAR;
3600 data->flags &= ~SF_IN_PAR;
3602 if (flags & SCF_DO_STCLASS_OR)
3603 cl_and(data->start_class, &and_with);
3604 if (flags & SCF_TRIE_RESTUDY)
3605 data->flags |= SCF_TRIE_RESTUDY;
3607 DEBUG_STUDYDATA(data,depth);
3613 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3615 if (RExC_rx->data) {
3616 Renewc(RExC_rx->data,
3617 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3618 char, struct reg_data);
3619 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3620 RExC_rx->data->count += n;
3623 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3624 char, struct reg_data);
3625 Newx(RExC_rx->data->what, n, U8);
3626 RExC_rx->data->count = n;
3628 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3629 return RExC_rx->data->count - n;
3632 #ifndef PERL_IN_XSUB_RE
3634 Perl_reginitcolors(pTHX)
3637 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3639 char *t = savepv(s);
3643 t = strchr(t, '\t');
3649 PL_colors[i] = t = (char *)"";
3654 PL_colors[i++] = (char *)"";
3661 #ifdef TRIE_STUDY_OPT
3662 #define CHECK_RESTUDY_GOTO \
3664 (data.flags & SCF_TRIE_RESTUDY) \
3668 #define CHECK_RESTUDY_GOTO
3671 - pregcomp - compile a regular expression into internal code
3673 * We can't allocate space until we know how big the compiled form will be,
3674 * but we can't compile it (and thus know how big it is) until we've got a
3675 * place to put the code. So we cheat: we compile it twice, once with code
3676 * generation turned off and size counting turned on, and once "for real".
3677 * This also means that we don't allocate space until we are sure that the
3678 * thing really will compile successfully, and we never have to move the
3679 * code and thus invalidate pointers into it. (Note that it has to be in
3680 * one piece because free() must be able to free it all.) [NB: not true in perl]
3682 * Beware that the optimization-preparation code in here knows about some
3683 * of the structure of the compiled regexp. [I'll say.]
3686 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3697 RExC_state_t RExC_state;
3698 RExC_state_t * const pRExC_state = &RExC_state;
3699 #ifdef TRIE_STUDY_OPT
3701 RExC_state_t copyRExC_state;
3704 GET_RE_DEBUG_FLAGS_DECL;
3707 FAIL("NULL regexp argument");
3709 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3712 DEBUG_r(if (!PL_colorset) reginitcolors());
3714 SV *dsv= sv_newmortal();
3715 RE_PV_QUOTED_DECL(s, RExC_utf8,
3716 dsv, RExC_precomp, (xend - exp), 60);
3717 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3718 PL_colors[4],PL_colors[5],s);
3720 RExC_flags = pm->op_pmflags;
3724 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3725 RExC_seen_evals = 0;
3728 /* First pass: determine size, legality. */
3735 RExC_emit = &PL_regdummy;
3736 RExC_whilem_seen = 0;
3737 #if 0 /* REGC() is (currently) a NOP at the first pass.
3738 * Clever compilers notice this and complain. --jhi */
3739 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3741 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3742 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3743 RExC_precomp = NULL;
3746 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3747 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3748 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3751 RExC_lastparse=NULL;
3755 /* Small enough for pointer-storage convention?
3756 If extralen==0, this means that we will not need long jumps. */
3757 if (RExC_size >= 0x10000L && RExC_extralen)
3758 RExC_size += RExC_extralen;
3761 if (RExC_whilem_seen > 15)
3762 RExC_whilem_seen = 15;
3764 /* Allocate space and initialize. */
3765 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3768 FAIL("Regexp out of space");
3771 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3772 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3775 r->prelen = xend - exp;
3776 r->precomp = savepvn(RExC_precomp, r->prelen);
3778 #ifdef PERL_OLD_COPY_ON_WRITE
3779 r->saved_copy = NULL;
3781 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3782 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3783 r->lastparen = 0; /* mg.c reads this. */
3785 r->substrs = 0; /* Useful during FAIL. */
3786 r->startp = 0; /* Useful during FAIL. */
3787 r->endp = 0; /* Useful during FAIL. */
3789 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3791 r->offsets[0] = RExC_size;
3793 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3794 "%s %"UVuf" bytes for offset annotations.\n",
3795 r->offsets ? "Got" : "Couldn't get",
3796 (UV)((2*RExC_size+1) * sizeof(U32))));
3800 /* Second pass: emit code. */
3801 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3806 RExC_emit_start = r->program;
3807 RExC_emit = r->program;
3808 /* Store the count of eval-groups for security checks: */
3809 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
3810 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3812 if (reg(pRExC_state, 0, &flags,1) == NULL)
3814 /* XXXX To minimize changes to RE engine we always allocate
3815 3-units-long substrs field. */
3816 Newx(r->substrs, 1, struct reg_substr_data);
3819 r->minlen = minlen = sawplus = sawopen = 0;
3820 Zero(r->substrs, 1, struct reg_substr_data);
3821 StructCopy(&zero_scan_data, &data, scan_data_t);
3823 #ifdef TRIE_STUDY_OPT
3825 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3826 RExC_state=copyRExC_state;
3827 if (data.last_found) {
3828 SvREFCNT_dec(data.longest_fixed);
3829 SvREFCNT_dec(data.longest_float);
3830 SvREFCNT_dec(data.last_found);
3833 copyRExC_state=RExC_state;
3836 /* Dig out information for optimizations. */
3837 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3838 pm->op_pmflags = RExC_flags;
3840 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3841 r->regstclass = NULL;
3842 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3843 r->reganch |= ROPT_NAUGHTY;
3844 scan = r->program + 1; /* First BRANCH. */
3846 /* testing for BRANCH here tells us whether there is "must appear"
3847 data in the pattern. If there is then we can use it for optimisations */
3848 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3850 STRLEN longest_float_length, longest_fixed_length;
3851 struct regnode_charclass_class ch_class; /* pointed to by data */
3853 I32 last_close = 0; /* pointed to by data */
3856 /* Skip introductions and multiplicators >= 1. */
3857 while ((OP(first) == OPEN && (sawopen = 1)) ||
3858 /* An OR of *one* alternative - should not happen now. */
3859 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3860 /* for now we can't handle lookbehind IFMATCH*/
3861 (OP(first) == IFMATCH && !first->flags) ||
3862 (OP(first) == PLUS) ||
3863 (OP(first) == MINMOD) ||
3864 /* An {n,m} with n>0 */
3865 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3868 if (OP(first) == PLUS)
3871 first += regarglen[OP(first)];
3872 if (OP(first) == IFMATCH) {
3873 first = NEXTOPER(first);
3874 first += EXTRA_STEP_2ARGS;
3875 } else /* XXX possible optimisation for /(?=)/ */
3876 first = NEXTOPER(first);
3879 /* Starting-point info. */
3881 DEBUG_PEEP("first:",first,0);
3882 /* Ignore EXACT as we deal with it later. */
3883 if (PL_regkind[OP(first)] == EXACT) {
3884 if (OP(first) == EXACT)
3885 NOOP; /* Empty, get anchored substr later. */
3886 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3887 r->regstclass = first;
3890 else if (PL_regkind[OP(first)] == TRIE &&
3891 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3894 /* this can happen only on restudy */
3895 if ( OP(first) == TRIE ) {
3896 struct regnode_1 *trieop;
3897 Newxz(trieop,1,struct regnode_1);
3898 StructCopy(first,trieop,struct regnode_1);
3899 trie_op=(regnode *)trieop;
3901 struct regnode_charclass *trieop;
3902 Newxz(trieop,1,struct regnode_charclass);
3903 StructCopy(first,trieop,struct regnode_charclass);
3904 trie_op=(regnode *)trieop;
3907 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3908 r->regstclass = trie_op;
3911 else if (strchr((const char*)PL_simple,OP(first)))
3912 r->regstclass = first;
3913 else if (PL_regkind[OP(first)] == BOUND ||
3914 PL_regkind[OP(first)] == NBOUND)
3915 r->regstclass = first;
3916 else if (PL_regkind[OP(first)] == BOL) {
3917 r->reganch |= (OP(first) == MBOL
3919 : (OP(first) == SBOL
3922 first = NEXTOPER(first);
3925 else if (OP(first) == GPOS) {
3926 r->reganch |= ROPT_ANCH_GPOS;
3927 first = NEXTOPER(first);
3930 else if (!sawopen && (OP(first) == STAR &&
3931 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3932 !(r->reganch & ROPT_ANCH) )
3934 /* turn .* into ^.* with an implied $*=1 */
3936 (OP(NEXTOPER(first)) == REG_ANY)
3939 r->reganch |= type | ROPT_IMPLICIT;
3940 first = NEXTOPER(first);
3943 if (sawplus && (!sawopen || !RExC_sawback)
3944 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3945 /* x+ must match at the 1st pos of run of x's */
3946 r->reganch |= ROPT_SKIP;
3948 /* Scan is after the zeroth branch, first is atomic matcher. */
3949 #ifdef TRIE_STUDY_OPT
3952 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3953 (IV)(first - scan + 1))
3957 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3958 (IV)(first - scan + 1))
3964 * If there's something expensive in the r.e., find the
3965 * longest literal string that must appear and make it the
3966 * regmust. Resolve ties in favor of later strings, since
3967 * the regstart check works with the beginning of the r.e.
3968 * and avoiding duplication strengthens checking. Not a
3969 * strong reason, but sufficient in the absence of others.
3970 * [Now we resolve ties in favor of the earlier string if
3971 * it happens that c_offset_min has been invalidated, since the
3972 * earlier string may buy us something the later one won't.]
3976 data.longest_fixed = newSVpvs("");
3977 data.longest_float = newSVpvs("");
3978 data.last_found = newSVpvs("");
3979 data.longest = &(data.longest_fixed);
3981 if (!r->regstclass) {
3982 cl_init(pRExC_state, &ch_class);
3983 data.start_class = &ch_class;
3984 stclass_flag = SCF_DO_STCLASS_AND;
3985 } else /* XXXX Check for BOUND? */
3987 data.last_closep = &last_close;
3989 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
3990 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3996 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3997 && data.last_start_min == 0 && data.last_end > 0
3998 && !RExC_seen_zerolen
3999 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
4000 r->reganch |= ROPT_CHECK_ALL;
4001 scan_commit(pRExC_state, &data,&minlen);
4002 SvREFCNT_dec(data.last_found);
4004 /* Note that code very similar to this but for anchored string
4005 follows immediately below, changes may need to be made to both.
4008 longest_float_length = CHR_SVLEN(data.longest_float);
4009 if (longest_float_length
4010 || (data.flags & SF_FL_BEFORE_EOL
4011 && (!(data.flags & SF_FL_BEFORE_MEOL)
4012 || (RExC_flags & PMf_MULTILINE))))
4016 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4017 && data.offset_fixed == data.offset_float_min
4018 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4019 goto remove_float; /* As in (a)+. */
4021 /* copy the information about the longest float from the reg_scan_data
4022 over to the program. */
4023 if (SvUTF8(data.longest_float)) {
4024 r->float_utf8 = data.longest_float;
4025 r->float_substr = NULL;
4027 r->float_substr = data.longest_float;
4028 r->float_utf8 = NULL;
4030 /* float_end_shift is how many chars that must be matched that
4031 follow this item. We calculate it ahead of time as once the
4032 lookbehind offset is added in we lose the ability to correctly
4034 ml = data.minlen_float ? *(data.minlen_float)
4035 : (I32)longest_float_length;
4036 r->float_end_shift = ml - data.offset_float_min
4037 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4038 + data.lookbehind_float;
4039 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4040 r->float_max_offset = data.offset_float_max;
4041 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4042 r->float_max_offset -= data.lookbehind_float;
4044 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4045 && (!(data.flags & SF_FL_BEFORE_MEOL)
4046 || (RExC_flags & PMf_MULTILINE)));
4047 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4051 r->float_substr = r->float_utf8 = NULL;
4052 SvREFCNT_dec(data.longest_float);
4053 longest_float_length = 0;
4056 /* Note that code very similar to this but for floating string
4057 is immediately above, changes may need to be made to both.
4060 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4061 if (longest_fixed_length
4062 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4063 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4064 || (RExC_flags & PMf_MULTILINE))))
4068 /* copy the information about the longest fixed
4069 from the reg_scan_data over to the program. */
4070 if (SvUTF8(data.longest_fixed)) {
4071 r->anchored_utf8 = data.longest_fixed;
4072 r->anchored_substr = NULL;
4074 r->anchored_substr = data.longest_fixed;
4075 r->anchored_utf8 = NULL;
4077 /* fixed_end_shift is how many chars that must be matched that
4078 follow this item. We calculate it ahead of time as once the
4079 lookbehind offset is added in we lose the ability to correctly
4081 ml = data.minlen_fixed ? *(data.minlen_fixed)
4082 : (I32)longest_fixed_length;
4083 r->anchored_end_shift = ml - data.offset_fixed
4084 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4085 + data.lookbehind_fixed;
4086 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4088 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4089 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4090 || (RExC_flags & PMf_MULTILINE)));
4091 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4094 r->anchored_substr = r->anchored_utf8 = NULL;
4095 SvREFCNT_dec(data.longest_fixed);
4096 longest_fixed_length = 0;
4099 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4100 r->regstclass = NULL;
4101 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4103 && !(data.start_class->flags & ANYOF_EOS)
4104 && !cl_is_anything(data.start_class))
4106 const I32 n = add_data(pRExC_state, 1, "f");
4108 Newx(RExC_rx->data->data[n], 1,
4109 struct regnode_charclass_class);
4110 StructCopy(data.start_class,
4111 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4112 struct regnode_charclass_class);
4113 r->regstclass = (regnode*)RExC_rx->data->data[n];
4114 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4115 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4116 regprop(r, sv, (regnode*)data.start_class);
4117 PerlIO_printf(Perl_debug_log,
4118 "synthetic stclass \"%s\".\n",
4119 SvPVX_const(sv));});
4122 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4123 if (longest_fixed_length > longest_float_length) {
4124 r->check_end_shift = r->anchored_end_shift;
4125 r->check_substr = r->anchored_substr;
4126 r->check_utf8 = r->anchored_utf8;
4127 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4128 if (r->reganch & ROPT_ANCH_SINGLE)
4129 r->reganch |= ROPT_NOSCAN;
4132 r->check_end_shift = r->float_end_shift;
4133 r->check_substr = r->float_substr;
4134 r->check_utf8 = r->float_utf8;
4135 r->check_offset_min = r->float_min_offset;
4136 r->check_offset_max = r->float_max_offset;
4138 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4139 This should be changed ASAP! */
4140 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4141 r->reganch |= RE_USE_INTUIT;
4142 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4143 r->reganch |= RE_INTUIT_TAIL;
4145 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4146 if ( (STRLEN)minlen < longest_float_length )
4147 minlen= longest_float_length;
4148 if ( (STRLEN)minlen < longest_fixed_length )
4149 minlen= longest_fixed_length;
4153 /* Several toplevels. Best we can is to set minlen. */
4155 struct regnode_charclass_class ch_class;
4158 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
4160 scan = r->program + 1;
4161 cl_init(pRExC_state, &ch_class);
4162 data.start_class = &ch_class;
4163 data.last_closep = &last_close;
4165 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4166 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4170 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4171 = r->float_substr = r->float_utf8 = NULL;
4172 if (!(data.start_class->flags & ANYOF_EOS)
4173 && !cl_is_anything(data.start_class))
4175 const I32 n = add_data(pRExC_state, 1, "f");
4177 Newx(RExC_rx->data->data[n], 1,
4178 struct regnode_charclass_class);
4179 StructCopy(data.start_class,
4180 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4181 struct regnode_charclass_class);
4182 r->regstclass = (regnode*)RExC_rx->data->data[n];
4183 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4184 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4185 regprop(r, sv, (regnode*)data.start_class);
4186 PerlIO_printf(Perl_debug_log,
4187 "synthetic stclass \"%s\".\n",
4188 SvPVX_const(sv));});
4192 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4193 the "real" pattern. */
4194 if (r->minlen < minlen)
4197 if (RExC_seen & REG_SEEN_GPOS)
4198 r->reganch |= ROPT_GPOS_SEEN;
4199 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4200 r->reganch |= ROPT_LOOKBEHIND_SEEN;
4201 if (RExC_seen & REG_SEEN_EVAL)
4202 r->reganch |= ROPT_EVAL_SEEN;
4203 if (RExC_seen & REG_SEEN_CANY)
4204 r->reganch |= ROPT_CANY_SEEN;
4205 Newxz(r->startp, RExC_npar, I32);
4206 Newxz(r->endp, RExC_npar, I32);
4208 DEBUG_r( RX_DEBUG_on(r) );
4210 PerlIO_printf(Perl_debug_log,"Final program:\n");
4213 DEBUG_OFFSETS_r(if (r->offsets) {
4214 const U32 len = r->offsets[0];
4216 GET_RE_DEBUG_FLAGS_DECL;
4217 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4218 for (i = 1; i <= len; i++) {
4219 if (r->offsets[i*2-1] || r->offsets[i*2])
4220 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4221 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4223 PerlIO_printf(Perl_debug_log, "\n");
4229 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4230 int rem=(int)(RExC_end - RExC_parse); \
4239 if (RExC_lastparse!=RExC_parse) \
4240 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4243 iscut ? "..." : "<" \
4246 PerlIO_printf(Perl_debug_log,"%16s",""); \
4251 num=REG_NODE_NUM(RExC_emit); \
4252 if (RExC_lastnum!=num) \
4253 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4255 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4256 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4257 (int)((depth*2)), "", \
4261 RExC_lastparse=RExC_parse; \
4266 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4267 DEBUG_PARSE_MSG((funcname)); \
4268 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4271 - reg - regular expression, i.e. main body or parenthesized thing
4273 * Caller must absorb opening parenthesis.
4275 * Combining parenthesis handling with the base level of regular expression
4276 * is a trifle forced, but the need to tie the tails of the branches to what
4277 * follows makes it hard to avoid.
4279 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4281 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4283 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4287 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4288 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4291 register regnode *ret; /* Will be the head of the group. */
4292 register regnode *br;
4293 register regnode *lastbr;
4294 register regnode *ender = NULL;
4295 register I32 parno = 0;
4297 const I32 oregflags = RExC_flags;
4298 bool have_branch = 0;
4301 /* for (?g), (?gc), and (?o) warnings; warning
4302 about (?c) will warn about (?g) -- japhy */
4304 #define WASTED_O 0x01
4305 #define WASTED_G 0x02
4306 #define WASTED_C 0x04
4307 #define WASTED_GC (0x02|0x04)
4308 I32 wastedflags = 0x00;
4310 char * parse_start = RExC_parse; /* MJD */
4311 char * const oregcomp_parse = RExC_parse;
4313 GET_RE_DEBUG_FLAGS_DECL;
4314 DEBUG_PARSE("reg ");
4317 *flagp = 0; /* Tentatively. */
4320 /* Make an OPEN node, if parenthesized. */
4322 if (*RExC_parse == '?') { /* (?...) */
4323 U32 posflags = 0, negflags = 0;
4324 U32 *flagsp = &posflags;
4325 bool is_logical = 0;
4326 const char * const seqstart = RExC_parse;
4329 paren = *RExC_parse++;
4330 ret = NULL; /* For look-ahead/behind. */
4332 case '<': /* (?<...) */
4333 RExC_seen |= REG_SEEN_LOOKBEHIND;
4334 if (*RExC_parse == '!')
4336 if (*RExC_parse != '=' && *RExC_parse != '!')
4339 case '=': /* (?=...) */
4340 case '!': /* (?!...) */
4341 RExC_seen_zerolen++;
4342 case ':': /* (?:...) */
4343 case '>': /* (?>...) */
4345 case '$': /* (?$...) */
4346 case '@': /* (?@...) */
4347 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4349 case '#': /* (?#...) */
4350 while (*RExC_parse && *RExC_parse != ')')
4352 if (*RExC_parse != ')')
4353 FAIL("Sequence (?#... not terminated");
4354 nextchar(pRExC_state);
4357 case 'p': /* (?p...) */
4358 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
4359 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
4361 case '?': /* (??...) */
4363 if (*RExC_parse != '{')
4365 paren = *RExC_parse++;
4367 case '{': /* (?{...}) */
4369 I32 count = 1, n = 0;
4371 char *s = RExC_parse;
4373 RExC_seen_zerolen++;
4374 RExC_seen |= REG_SEEN_EVAL;
4375 while (count && (c = *RExC_parse)) {
4386 if (*RExC_parse != ')') {
4388 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4392 OP_4tree *sop, *rop;
4393 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
4396 Perl_save_re_context(aTHX);
4397 rop = sv_compile_2op(sv, &sop, "re", &pad);
4398 sop->op_private |= OPpREFCOUNTED;
4399 /* re_dup will OpREFCNT_inc */
4400 OpREFCNT_set(sop, 1);
4403 n = add_data(pRExC_state, 3, "nop");
4404 RExC_rx->data->data[n] = (void*)rop;
4405 RExC_rx->data->data[n+1] = (void*)sop;
4406 RExC_rx->data->data[n+2] = (void*)pad;
4409 else { /* First pass */
4410 if (PL_reginterp_cnt < ++RExC_seen_evals
4412 /* No compiled RE interpolated, has runtime
4413 components ===> unsafe. */
4414 FAIL("Eval-group not allowed at runtime, use re 'eval'");
4415 if (PL_tainting && PL_tainted)
4416 FAIL("Eval-group in insecure regular expression");
4417 #if PERL_VERSION > 8
4418 if (IN_PERL_COMPILETIME)
4423 nextchar(pRExC_state);
4425 ret = reg_node(pRExC_state, LOGICAL);
4428 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
4429 /* deal with the length of this later - MJD */
4432 ret = reganode(pRExC_state, EVAL, n);
4433 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4434 Set_Node_Offset(ret, parse_start);
4437 case '(': /* (?(?{...})...) and (?(?=...)...) */
4439 if (RExC_parse[0] == '?') { /* (?(?...)) */
4440 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4441 || RExC_parse[1] == '<'
4442 || RExC_parse[1] == '{') { /* Lookahead or eval. */
4445 ret = reg_node(pRExC_state, LOGICAL);
4448 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
4452 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
4455 parno = atoi(RExC_parse++);
4457 while (isDIGIT(*RExC_parse))
4459 ret = reganode(pRExC_state, GROUPP, parno);
4461 if ((c = *nextchar(pRExC_state)) != ')')
4462 vFAIL("Switch condition not recognized");
4464 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4465 br = regbranch(pRExC_state, &flags, 1,depth+1);
4467 br = reganode(pRExC_state, LONGJMP, 0);
4469 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
4470 c = *nextchar(pRExC_state);
4474 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
4475 regbranch(pRExC_state, &flags, 1,depth+1);
4476 REGTAIL(pRExC_state, ret, lastbr);
4479 c = *nextchar(pRExC_state);
4484 vFAIL("Switch (?(condition)... contains too many branches");
4485 ender = reg_node(pRExC_state, TAIL);
4486 REGTAIL(pRExC_state, br, ender);
4488 REGTAIL(pRExC_state, lastbr, ender);
4489 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
4492 REGTAIL(pRExC_state, ret, ender);
4496 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
4500 RExC_parse--; /* for vFAIL to print correctly */
4501 vFAIL("Sequence (? incomplete");
4505 parse_flags: /* (?i) */
4506 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
4507 /* (?g), (?gc) and (?o) are useless here
4508 and must be globally applied -- japhy */
4510 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4511 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4512 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
4513 if (! (wastedflags & wflagbit) ) {
4514 wastedflags |= wflagbit;
4517 "Useless (%s%c) - %suse /%c modifier",
4518 flagsp == &negflags ? "?-" : "?",
4520 flagsp == &negflags ? "don't " : "",
4526 else if (*RExC_parse == 'c') {
4527 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4528 if (! (wastedflags & WASTED_C) ) {
4529 wastedflags |= WASTED_GC;
4532 "Useless (%sc) - %suse /gc modifier",
4533 flagsp == &negflags ? "?-" : "?",
4534 flagsp == &negflags ? "don't " : ""
4539 else { pmflag(flagsp, *RExC_parse); }
4543 if (*RExC_parse == '-') {
4545 wastedflags = 0; /* reset so (?g-c) warns twice */
4549 RExC_flags |= posflags;
4550 RExC_flags &= ~negflags;
4551 if (*RExC_parse == ':') {
4557 if (*RExC_parse != ')') {
4559 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4561 nextchar(pRExC_state);
4569 ret = reganode(pRExC_state, OPEN, parno);
4570 Set_Node_Length(ret, 1); /* MJD */
4571 Set_Node_Offset(ret, RExC_parse); /* MJD */
4578 /* Pick up the branches, linking them together. */
4579 parse_start = RExC_parse; /* MJD */
4580 br = regbranch(pRExC_state, &flags, 1,depth+1);
4581 /* branch_len = (paren != 0); */
4585 if (*RExC_parse == '|') {
4586 if (!SIZE_ONLY && RExC_extralen) {
4587 reginsert(pRExC_state, BRANCHJ, br);
4590 reginsert(pRExC_state, BRANCH, br);
4591 Set_Node_Length(br, paren != 0);
4592 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4596 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4598 else if (paren == ':') {
4599 *flagp |= flags&SIMPLE;
4601 if (is_open) { /* Starts with OPEN. */
4602 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4604 else if (paren != '?') /* Not Conditional */
4606 *flagp |= flags & (SPSTART | HASWIDTH);
4608 while (*RExC_parse == '|') {
4609 if (!SIZE_ONLY && RExC_extralen) {
4610 ender = reganode(pRExC_state, LONGJMP,0);
4611 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4614 RExC_extralen += 2; /* Account for LONGJMP. */
4615 nextchar(pRExC_state);
4616 br = regbranch(pRExC_state, &flags, 0, depth+1);
4620 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4624 *flagp |= flags&SPSTART;
4627 if (have_branch || paren != ':') {
4628 /* Make a closing node, and hook it on the end. */
4631 ender = reg_node(pRExC_state, TAIL);
4634 ender = reganode(pRExC_state, CLOSE, parno);
4635 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4636 Set_Node_Length(ender,1); /* MJD */
4642 *flagp &= ~HASWIDTH;
4645 ender = reg_node(pRExC_state, SUCCEED);
4648 ender = reg_node(pRExC_state, END);
4651 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4653 if (have_branch && !SIZE_ONLY) {
4654 /* Hook the tails of the branches to the closing node. */
4655 for (br = ret; br; br = regnext(br)) {
4656 const U8 op = PL_regkind[OP(br)];
4658 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4660 else if (op == BRANCHJ) {
4661 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4669 static const char parens[] = "=!<,>";
4671 if (paren && (p = strchr(parens, paren))) {
4672 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4673 int flag = (p - parens) > 1;
4676 node = SUSPEND, flag = 0;
4677 reginsert(pRExC_state, node,ret);
4678 Set_Node_Cur_Length(ret);
4679 Set_Node_Offset(ret, parse_start + 1);
4681 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4685 /* Check for proper termination. */
4687 RExC_flags = oregflags;
4688 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4689 RExC_parse = oregcomp_parse;
4690 vFAIL("Unmatched (");
4693 else if (!paren && RExC_parse < RExC_end) {
4694 if (*RExC_parse == ')') {
4696 vFAIL("Unmatched )");
4699 FAIL("Junk on end of regexp"); /* "Can't happen". */
4707 - regbranch - one alternative of an | operator
4709 * Implements the concatenation operator.
4712 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4715 register regnode *ret;
4716 register regnode *chain = NULL;
4717 register regnode *latest;
4718 I32 flags = 0, c = 0;
4719 GET_RE_DEBUG_FLAGS_DECL;
4720 DEBUG_PARSE("brnc");
4724 if (!SIZE_ONLY && RExC_extralen)
4725 ret = reganode(pRExC_state, BRANCHJ,0);
4727 ret = reg_node(pRExC_state, BRANCH);
4728 Set_Node_Length(ret, 1);
4732 if (!first && SIZE_ONLY)
4733 RExC_extralen += 1; /* BRANCHJ */
4735 *flagp = WORST; /* Tentatively. */
4738 nextchar(pRExC_state);
4739 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4741 latest = regpiece(pRExC_state, &flags,depth+1);
4742 if (latest == NULL) {
4743 if (flags & TRYAGAIN)
4747 else if (ret == NULL)
4749 *flagp |= flags&HASWIDTH;
4750 if (chain == NULL) /* First piece. */
4751 *flagp |= flags&SPSTART;
4754 REGTAIL(pRExC_state, chain, latest);
4759 if (chain == NULL) { /* Loop ran zero times. */
4760 chain = reg_node(pRExC_state, NOTHING);
4765 *flagp |= flags&SIMPLE;
4772 - regpiece - something followed by possible [*+?]
4774 * Note that the branching code sequences used for ? and the general cases
4775 * of * and + are somewhat optimized: they use the same NOTHING node as
4776 * both the endmarker for their branch list and the body of the last branch.
4777 * It might seem that this node could be dispensed with entirely, but the
4778 * endmarker role is not redundant.
4781 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4784 register regnode *ret;
4786 register char *next;
4788 const char * const origparse = RExC_parse;
4790 I32 max = REG_INFTY;
4792 const char *maxpos = NULL;
4793 GET_RE_DEBUG_FLAGS_DECL;
4794 DEBUG_PARSE("piec");
4796 ret = regatom(pRExC_state, &flags,depth+1);
4798 if (flags & TRYAGAIN)
4805 if (op == '{' && regcurly(RExC_parse)) {
4807 parse_start = RExC_parse; /* MJD */
4808 next = RExC_parse + 1;
4809 while (isDIGIT(*next) || *next == ',') {
4818 if (*next == '}') { /* got one */
4822 min = atoi(RExC_parse);
4826 maxpos = RExC_parse;
4828 if (!max && *maxpos != '0')
4829 max = REG_INFTY; /* meaning "infinity" */
4830 else if (max >= REG_INFTY)
4831 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4833 nextchar(pRExC_state);
4836 if ((flags&SIMPLE)) {
4837 RExC_naughty += 2 + RExC_naughty / 2;
4838 reginsert(pRExC_state, CURLY, ret);
4839 Set_Node_Offset(ret, parse_start+1); /* MJD */
4840 Set_Node_Cur_Length(ret);
4843 regnode * const w = reg_node(pRExC_state, WHILEM);
4846 REGTAIL(pRExC_state, ret, w);
4847 if (!SIZE_ONLY && RExC_extralen) {
4848 reginsert(pRExC_state, LONGJMP,ret);
4849 reginsert(pRExC_state, NOTHING,ret);
4850 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4852 reginsert(pRExC_state, CURLYX,ret);
4854 Set_Node_Offset(ret, parse_start+1);
4855 Set_Node_Length(ret,
4856 op == '{' ? (RExC_parse - parse_start) : 1);
4858 if (!SIZE_ONLY && RExC_extralen)
4859 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4860 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4862 RExC_whilem_seen++, RExC_extralen += 3;
4863 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4871 if (max && max < min)
4872 vFAIL("Can't do {n,m} with n > m");
4874 ARG1_SET(ret, (U16)min);
4875 ARG2_SET(ret, (U16)max);
4887 #if 0 /* Now runtime fix should be reliable. */
4889 /* if this is reinstated, don't forget to put this back into perldiag:
4891 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4893 (F) The part of the regexp subject to either the * or + quantifier
4894 could match an empty string. The {#} shows in the regular
4895 expression about where the problem was discovered.
4899 if (!(flags&HASWIDTH) && op != '?')
4900 vFAIL("Regexp *+ operand could be empty");
4903 parse_start = RExC_parse;
4904 nextchar(pRExC_state);
4906 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4908 if (op == '*' && (flags&SIMPLE)) {
4909 reginsert(pRExC_state, STAR, ret);
4913 else if (op == '*') {
4917 else if (op == '+' && (flags&SIMPLE)) {
4918 reginsert(pRExC_state, PLUS, ret);
4922 else if (op == '+') {
4926 else if (op == '?') {
4931 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4933 "%.*s matches null string many times",
4934 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4938 if (*RExC_parse == '?') {
4939 nextchar(pRExC_state);
4940 reginsert(pRExC_state, MINMOD, ret);
4941 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4943 if (ISMULT2(RExC_parse)) {
4945 vFAIL("Nested quantifiers");
4952 - regatom - the lowest level
4954 * Optimization: gobbles an entire sequence of ordinary characters so that
4955 * it can turn them into a single node, which is smaller to store and
4956 * faster to run. Backslashed characters are exceptions, each becoming a
4957 * separate node; the code is simpler that way and it's not worth fixing.
4959 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4960 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4963 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4966 register regnode *ret = NULL;
4968 char *parse_start = RExC_parse;
4969 GET_RE_DEBUG_FLAGS_DECL;
4970 DEBUG_PARSE("atom");
4971 *flagp = WORST; /* Tentatively. */
4974 switch (*RExC_parse) {
4976 RExC_seen_zerolen++;
4977 nextchar(pRExC_state);
4978 if (RExC_flags & PMf_MULTILINE)
4979 ret = reg_node(pRExC_state, MBOL);
4980 else if (RExC_flags & PMf_SINGLELINE)
4981 ret = reg_node(pRExC_state, SBOL);
4983 ret = reg_node(pRExC_state, BOL);
4984 Set_Node_Length(ret, 1); /* MJD */
4987 nextchar(pRExC_state);
4989 RExC_seen_zerolen++;
4990 if (RExC_flags & PMf_MULTILINE)
4991 ret = reg_node(pRExC_state, MEOL);
4992 else if (RExC_flags & PMf_SINGLELINE)
4993 ret = reg_node(pRExC_state, SEOL);
4995 ret = reg_node(pRExC_state, EOL);
4996 Set_Node_Length(ret, 1); /* MJD */
4999 nextchar(pRExC_state);
5000 if (RExC_flags & PMf_SINGLELINE)
5001 ret = reg_node(pRExC_state, SANY);
5003 ret = reg_node(pRExC_state, REG_ANY);
5004 *flagp |= HASWIDTH|SIMPLE;
5006 Set_Node_Length(ret, 1); /* MJD */
5010 char * const oregcomp_parse = ++RExC_parse;
5011 ret = regclass(pRExC_state,depth+1);
5012 if (*RExC_parse != ']') {
5013 RExC_parse = oregcomp_parse;
5014 vFAIL("Unmatched [");
5016 nextchar(pRExC_state);
5017 *flagp |= HASWIDTH|SIMPLE;
5018 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
5022 nextchar(pRExC_state);
5023 ret = reg(pRExC_state, 1, &flags,depth+1);
5025 if (flags & TRYAGAIN) {
5026 if (RExC_parse == RExC_end) {
5027 /* Make parent create an empty node if needed. */
5035 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
5039 if (flags & TRYAGAIN) {
5043 vFAIL("Internal urp");
5044 /* Supposed to be caught earlier. */
5047 if (!regcurly(RExC_parse)) {
5056 vFAIL("Quantifier follows nothing");
5059 switch (*++RExC_parse) {
5061 RExC_seen_zerolen++;
5062 ret = reg_node(pRExC_state, SBOL);
5064 nextchar(pRExC_state);
5065 Set_Node_Length(ret, 2); /* MJD */
5068 ret = reg_node(pRExC_state, GPOS);
5069 RExC_seen |= REG_SEEN_GPOS;
5071 nextchar(pRExC_state);
5072 Set_Node_Length(ret, 2); /* MJD */
5075 ret = reg_node(pRExC_state, SEOL);
5077 RExC_seen_zerolen++; /* Do not optimize RE away */
5078 nextchar(pRExC_state);
5081 ret = reg_node(pRExC_state, EOS);
5083 RExC_seen_zerolen++; /* Do not optimize RE away */
5084 nextchar(pRExC_state);
5085 Set_Node_Length(ret, 2); /* MJD */
5088 ret = reg_node(pRExC_state, CANY);
5089 RExC_seen |= REG_SEEN_CANY;
5090 *flagp |= HASWIDTH|SIMPLE;
5091 nextchar(pRExC_state);
5092 Set_Node_Length(ret, 2); /* MJD */
5095 ret = reg_node(pRExC_state, CLUMP);
5097 nextchar(pRExC_state);
5098 Set_Node_Length(ret, 2); /* MJD */
5101 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
5102 *flagp |= HASWIDTH|SIMPLE;
5103 nextchar(pRExC_state);
5104 Set_Node_Length(ret, 2); /* MJD */
5107 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
5108 *flagp |= HASWIDTH|SIMPLE;
5109 nextchar(pRExC_state);
5110 Set_Node_Length(ret, 2); /* MJD */
5113 RExC_seen_zerolen++;
5114 RExC_seen |= REG_SEEN_LOOKBEHIND;
5115 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
5117 nextchar(pRExC_state);
5118 Set_Node_Length(ret, 2); /* MJD */
5121 RExC_seen_zerolen++;
5122 RExC_seen |= REG_SEEN_LOOKBEHIND;
5123 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
5125 nextchar(pRExC_state);
5126 Set_Node_Length(ret, 2); /* MJD */
5129 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
5130 *flagp |= HASWIDTH|SIMPLE;
5131 nextchar(pRExC_state);
5132 Set_Node_Length(ret, 2); /* MJD */
5135 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
5136 *flagp |= HASWIDTH|SIMPLE;
5137 nextchar(pRExC_state);
5138 Set_Node_Length(ret, 2); /* MJD */
5141 ret = reg_node(pRExC_state, DIGIT);
5142 *flagp |= HASWIDTH|SIMPLE;
5143 nextchar(pRExC_state);
5144 Set_Node_Length(ret, 2); /* MJD */
5147 ret = reg_node(pRExC_state, NDIGIT);
5148 *flagp |= HASWIDTH|SIMPLE;
5149 nextchar(pRExC_state);
5150 Set_Node_Length(ret, 2); /* MJD */
5155 char* const oldregxend = RExC_end;
5156 char* parse_start = RExC_parse - 2;
5158 if (RExC_parse[1] == '{') {
5159 /* a lovely hack--pretend we saw [\pX] instead */
5160 RExC_end = strchr(RExC_parse, '}');
5162 const U8 c = (U8)*RExC_parse;
5164 RExC_end = oldregxend;
5165 vFAIL2("Missing right brace on \\%c{}", c);
5170 RExC_end = RExC_parse + 2;
5171 if (RExC_end > oldregxend)
5172 RExC_end = oldregxend;
5176 ret = regclass(pRExC_state,depth+1);
5178 RExC_end = oldregxend;
5181 Set_Node_Offset(ret, parse_start + 2);
5182 Set_Node_Cur_Length(ret);
5183 nextchar(pRExC_state);
5184 *flagp |= HASWIDTH|SIMPLE;
5197 case '1': case '2': case '3': case '4':
5198 case '5': case '6': case '7': case '8': case '9':
5200 const I32 num = atoi(RExC_parse);
5202 if (num > 9 && num >= RExC_npar)
5205 char * const parse_start = RExC_parse - 1; /* MJD */
5206 while (isDIGIT(*RExC_parse))
5209 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
5210 vFAIL("Reference to nonexistent group");
5212 ret = reganode(pRExC_state,
5213 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
5217 /* override incorrect value set in reganode MJD */
5218 Set_Node_Offset(ret, parse_start+1);
5219 Set_Node_Cur_Length(ret); /* MJD */
5221 nextchar(pRExC_state);
5226 if (RExC_parse >= RExC_end)
5227 FAIL("Trailing \\");
5230 /* Do not generate "unrecognized" warnings here, we fall
5231 back into the quick-grab loop below */
5238 if (RExC_flags & PMf_EXTENDED) {
5239 while (RExC_parse < RExC_end && *RExC_parse != '\n')
5241 if (RExC_parse < RExC_end)
5247 register STRLEN len;
5252 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5254 parse_start = RExC_parse - 1;
5260 ret = reg_node(pRExC_state,
5261 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5263 for (len = 0, p = RExC_parse - 1;
5264 len < 127 && p < RExC_end;
5267 char * const oldp = p;
5269 if (RExC_flags & PMf_EXTENDED)
5270 p = regwhite(p, RExC_end);
5317 ender = ASCII_TO_NATIVE('\033');
5321 ender = ASCII_TO_NATIVE('\007');
5326 char* const e = strchr(p, '}');
5330 vFAIL("Missing right brace on \\x{}");
5333 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5334 | PERL_SCAN_DISALLOW_PREFIX;
5335 STRLEN numlen = e - p - 1;
5336 ender = grok_hex(p + 1, &numlen, &flags, NULL);
5343 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5345 ender = grok_hex(p, &numlen, &flags, NULL);
5351 ender = UCHARAT(p++);
5352 ender = toCTRL(ender);
5354 case '0': case '1': case '2': case '3':case '4':
5355 case '5': case '6': case '7': case '8':case '9':
5357 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
5360 ender = grok_oct(p, &numlen, &flags, NULL);
5370 FAIL("Trailing \\");
5373 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
5374 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
5375 goto normal_default;
5380 if (UTF8_IS_START(*p) && UTF) {
5382 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
5383 &numlen, UTF8_ALLOW_DEFAULT);
5390 if (RExC_flags & PMf_EXTENDED)
5391 p = regwhite(p, RExC_end);
5393 /* Prime the casefolded buffer. */
5394 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
5396 if (ISMULT2(p)) { /* Back off on ?+*. */
5401 /* Emit all the Unicode characters. */
5403 for (foldbuf = tmpbuf;
5405 foldlen -= numlen) {
5406 ender = utf8_to_uvchr(foldbuf, &numlen);
5408 const STRLEN unilen = reguni(pRExC_state, ender, s);
5411 /* In EBCDIC the numlen
5412 * and unilen can differ. */
5414 if (numlen >= foldlen)
5418 break; /* "Can't happen." */
5422 const STRLEN unilen = reguni(pRExC_state, ender, s);
5431 REGC((char)ender, s++);
5437 /* Emit all the Unicode characters. */
5439 for (foldbuf = tmpbuf;
5441 foldlen -= numlen) {
5442 ender = utf8_to_uvchr(foldbuf, &numlen);
5444 const STRLEN unilen = reguni(pRExC_state, ender, s);
5447 /* In EBCDIC the numlen
5448 * and unilen can differ. */
5450 if (numlen >= foldlen)
5458 const STRLEN unilen = reguni(pRExC_state, ender, s);
5467 REGC((char)ender, s++);
5471 Set_Node_Cur_Length(ret); /* MJD */
5472 nextchar(pRExC_state);
5474 /* len is STRLEN which is unsigned, need to copy to signed */
5477 vFAIL("Internal disaster");
5481 if (len == 1 && UNI_IS_INVARIANT(ender))
5485 RExC_size += STR_SZ(len);
5488 RExC_emit += STR_SZ(len);
5494 /* If the encoding pragma is in effect recode the text of
5495 * any EXACT-kind nodes. */
5496 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
5497 const STRLEN oldlen = STR_LEN(ret);
5498 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
5502 if (sv_utf8_downgrade(sv, TRUE)) {
5503 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5504 const STRLEN newlen = SvCUR(sv);
5509 GET_RE_DEBUG_FLAGS_DECL;
5510 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
5511 (int)oldlen, STRING(ret),
5513 Copy(s, STRING(ret), newlen, char);
5514 STR_LEN(ret) += newlen - oldlen;
5515 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5517 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5525 S_regwhite(char *p, const char *e)
5530 else if (*p == '#') {
5533 } while (p < e && *p != '\n');
5541 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5542 Character classes ([:foo:]) can also be negated ([:^foo:]).
5543 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5544 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
5545 but trigger failures because they are currently unimplemented. */
5547 #define POSIXCC_DONE(c) ((c) == ':')
5548 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5549 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5552 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5555 I32 namedclass = OOB_NAMEDCLASS;
5557 if (value == '[' && RExC_parse + 1 < RExC_end &&
5558 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5559 POSIXCC(UCHARAT(RExC_parse))) {
5560 const char c = UCHARAT(RExC_parse);
5561 char* const s = RExC_parse++;
5563 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5565 if (RExC_parse == RExC_end)
5566 /* Grandfather lone [:, [=, [. */
5569 const char* const t = RExC_parse++; /* skip over the c */
5572 if (UCHARAT(RExC_parse) == ']') {
5573 const char *posixcc = s + 1;
5574 RExC_parse++; /* skip over the ending ] */
5577 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5578 const I32 skip = t - posixcc;
5580 /* Initially switch on the length of the name. */
5583 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5584 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5587 /* Names all of length 5. */
5588 /* alnum alpha ascii blank cntrl digit graph lower
5589 print punct space upper */
5590 /* Offset 4 gives the best switch position. */
5591 switch (posixcc[4]) {
5593 if (memEQ(posixcc, "alph", 4)) /* alpha */
5594 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5597 if (memEQ(posixcc, "spac", 4)) /* space */
5598 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5601 if (memEQ(posixcc, "grap", 4)) /* graph */
5602 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5605 if (memEQ(posixcc, "asci", 4)) /* ascii */
5606 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5609 if (memEQ(posixcc, "blan", 4)) /* blank */
5610 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5613 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5614 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5617 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5618 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5621 if (memEQ(posixcc, "lowe", 4)) /* lower */
5622 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5623 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5624 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5627 if (memEQ(posixcc, "digi", 4)) /* digit */
5628 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5629 else if (memEQ(posixcc, "prin", 4)) /* print */
5630 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5631 else if (memEQ(posixcc, "punc", 4)) /* punct */
5632 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5637 if (memEQ(posixcc, "xdigit", 6))
5638 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5642 if (namedclass == OOB_NAMEDCLASS)
5643 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5645 assert (posixcc[skip] == ':');
5646 assert (posixcc[skip+1] == ']');
5647 } else if (!SIZE_ONLY) {
5648 /* [[=foo=]] and [[.foo.]] are still future. */
5650 /* adjust RExC_parse so the warning shows after
5652 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5654 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5657 /* Maternal grandfather:
5658 * "[:" ending in ":" but not in ":]" */
5668 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5671 if (POSIXCC(UCHARAT(RExC_parse))) {
5672 const char *s = RExC_parse;
5673 const char c = *s++;
5677 if (*s && c == *s && s[1] == ']') {
5678 if (ckWARN(WARN_REGEXP))
5680 "POSIX syntax [%c %c] belongs inside character classes",
5683 /* [[=foo=]] and [[.foo.]] are still future. */
5684 if (POSIXCC_NOTYET(c)) {
5685 /* adjust RExC_parse so the error shows after
5687 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5689 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5697 parse a class specification and produce either an ANYOF node that
5698 matches the pattern. If the pattern matches a single char only and
5699 that char is < 256 then we produce an EXACT node instead.
5702 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5705 register UV value = 0;
5706 register UV nextvalue;
5707 register IV prevvalue = OOB_UNICODE;
5708 register IV range = 0;
5709 register regnode *ret;
5712 char *rangebegin = NULL;
5713 bool need_class = 0;
5716 bool optimize_invert = TRUE;
5717 AV* unicode_alternate = NULL;
5719 UV literal_endpoint = 0;
5721 UV stored = 0; /* number of chars stored in the class */
5723 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5724 case we need to change the emitted regop to an EXACT. */
5725 const char * orig_parse = RExC_parse;
5726 GET_RE_DEBUG_FLAGS_DECL;
5728 PERL_UNUSED_ARG(depth);
5731 DEBUG_PARSE("clas");
5733 /* Assume we are going to generate an ANYOF node. */
5734 ret = reganode(pRExC_state, ANYOF, 0);
5737 ANYOF_FLAGS(ret) = 0;
5739 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5743 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5747 RExC_size += ANYOF_SKIP;
5748 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5751 RExC_emit += ANYOF_SKIP;
5753 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5755 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5756 ANYOF_BITMAP_ZERO(ret);
5757 listsv = newSVpvs("# comment\n");
5760 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5762 if (!SIZE_ONLY && POSIXCC(nextvalue))
5763 checkposixcc(pRExC_state);
5765 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5766 if (UCHARAT(RExC_parse) == ']')
5769 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5773 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5776 rangebegin = RExC_parse;
5778 value = utf8n_to_uvchr((U8*)RExC_parse,
5779 RExC_end - RExC_parse,
5780 &numlen, UTF8_ALLOW_DEFAULT);
5781 RExC_parse += numlen;
5784 value = UCHARAT(RExC_parse++);
5786 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5787 if (value == '[' && POSIXCC(nextvalue))
5788 namedclass = regpposixcc(pRExC_state, value);
5789 else if (value == '\\') {
5791 value = utf8n_to_uvchr((U8*)RExC_parse,
5792 RExC_end - RExC_parse,
5793 &numlen, UTF8_ALLOW_DEFAULT);
5794 RExC_parse += numlen;
5797 value = UCHARAT(RExC_parse++);
5798 /* Some compilers cannot handle switching on 64-bit integer
5799 * values, therefore value cannot be an UV. Yes, this will
5800 * be a problem later if we want switch on Unicode.
5801 * A similar issue a little bit later when switching on
5802 * namedclass. --jhi */
5803 switch ((I32)value) {
5804 case 'w': namedclass = ANYOF_ALNUM; break;
5805 case 'W': namedclass = ANYOF_NALNUM; break;
5806 case 's': namedclass = ANYOF_SPACE; break;
5807 case 'S': namedclass = ANYOF_NSPACE; break;
5808 case 'd': namedclass = ANYOF_DIGIT; break;
5809 case 'D': namedclass = ANYOF_NDIGIT; break;
5814 if (RExC_parse >= RExC_end)
5815 vFAIL2("Empty \\%c{}", (U8)value);
5816 if (*RExC_parse == '{') {
5817 const U8 c = (U8)value;
5818 e = strchr(RExC_parse++, '}');
5820 vFAIL2("Missing right brace on \\%c{}", c);
5821 while (isSPACE(UCHARAT(RExC_parse)))
5823 if (e == RExC_parse)
5824 vFAIL2("Empty \\%c{}", c);
5826 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5834 if (UCHARAT(RExC_parse) == '^') {
5837 value = value == 'p' ? 'P' : 'p'; /* toggle */
5838 while (isSPACE(UCHARAT(RExC_parse))) {
5843 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5844 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5847 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5848 namedclass = ANYOF_MAX; /* no official name, but it's named */
5851 case 'n': value = '\n'; break;
5852 case 'r': value = '\r'; break;
5853 case 't': value = '\t'; break;
5854 case 'f': value = '\f'; break;
5855 case 'b': value = '\b'; break;
5856 case 'e': value = ASCII_TO_NATIVE('\033');break;
5857 case 'a': value = ASCII_TO_NATIVE('\007');break;
5859 if (*RExC_parse == '{') {
5860 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5861 | PERL_SCAN_DISALLOW_PREFIX;
5862 char * const e = strchr(RExC_parse++, '}');
5864 vFAIL("Missing right brace on \\x{}");
5866 numlen = e - RExC_parse;
5867 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5871 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5873 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5874 RExC_parse += numlen;
5878 value = UCHARAT(RExC_parse++);
5879 value = toCTRL(value);
5881 case '0': case '1': case '2': case '3': case '4':
5882 case '5': case '6': case '7': case '8': case '9':
5886 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5887 RExC_parse += numlen;
5891 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5893 "Unrecognized escape \\%c in character class passed through",
5897 } /* end of \blah */
5903 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5905 if (!SIZE_ONLY && !need_class)
5906 ANYOF_CLASS_ZERO(ret);
5910 /* a bad range like a-\d, a-[:digit:] ? */
5913 if (ckWARN(WARN_REGEXP)) {
5915 RExC_parse >= rangebegin ?
5916 RExC_parse - rangebegin : 0;
5918 "False [] range \"%*.*s\"",
5921 if (prevvalue < 256) {
5922 ANYOF_BITMAP_SET(ret, prevvalue);
5923 ANYOF_BITMAP_SET(ret, '-');
5926 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5927 Perl_sv_catpvf(aTHX_ listsv,
5928 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5932 range = 0; /* this was not a true range */
5936 const char *what = NULL;
5939 if (namedclass > OOB_NAMEDCLASS)
5940 optimize_invert = FALSE;
5941 /* Possible truncation here but in some 64-bit environments
5942 * the compiler gets heartburn about switch on 64-bit values.
5943 * A similar issue a little earlier when switching on value.
5945 switch ((I32)namedclass) {
5948 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5950 for (value = 0; value < 256; value++)
5952 ANYOF_BITMAP_SET(ret, value);
5959 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5961 for (value = 0; value < 256; value++)
5962 if (!isALNUM(value))
5963 ANYOF_BITMAP_SET(ret, value);
5970 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5972 for (value = 0; value < 256; value++)
5973 if (isALNUMC(value))
5974 ANYOF_BITMAP_SET(ret, value);
5981 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5983 for (value = 0; value < 256; value++)
5984 if (!isALNUMC(value))
5985 ANYOF_BITMAP_SET(ret, value);
5992 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5994 for (value = 0; value < 256; value++)
5996 ANYOF_BITMAP_SET(ret, value);
6003 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
6005 for (value = 0; value < 256; value++)
6006 if (!isALPHA(value))
6007 ANYOF_BITMAP_SET(ret, value);
6014 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
6017 for (value = 0; value < 128; value++)
6018 ANYOF_BITMAP_SET(ret, value);
6020 for (value = 0; value < 256; value++) {
6022 ANYOF_BITMAP_SET(ret, value);
6031 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
6034 for (value = 128; value < 256; value++)
6035 ANYOF_BITMAP_SET(ret, value);
6037 for (value = 0; value < 256; value++) {
6038 if (!isASCII(value))
6039 ANYOF_BITMAP_SET(ret, value);
6048 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6050 for (value = 0; value < 256; value++)
6052 ANYOF_BITMAP_SET(ret, value);
6059 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6061 for (value = 0; value < 256; value++)
6062 if (!isBLANK(value))
6063 ANYOF_BITMAP_SET(ret, value);
6070 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
6072 for (value = 0; value < 256; value++)
6074 ANYOF_BITMAP_SET(ret, value);
6081 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
6083 for (value = 0; value < 256; value++)
6084 if (!isCNTRL(value))
6085 ANYOF_BITMAP_SET(ret, value);
6092 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6094 /* consecutive digits assumed */
6095 for (value = '0'; value <= '9'; value++)
6096 ANYOF_BITMAP_SET(ret, value);
6103 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6105 /* consecutive digits assumed */
6106 for (value = 0; value < '0'; value++)
6107 ANYOF_BITMAP_SET(ret, value);
6108 for (value = '9' + 1; value < 256; value++)
6109 ANYOF_BITMAP_SET(ret, value);
6116 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
6118 for (value = 0; value < 256; value++)
6120 ANYOF_BITMAP_SET(ret, value);
6127 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
6129 for (value = 0; value < 256; value++)
6130 if (!isGRAPH(value))
6131 ANYOF_BITMAP_SET(ret, value);
6138 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
6140 for (value = 0; value < 256; value++)
6142 ANYOF_BITMAP_SET(ret, value);
6149 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
6151 for (value = 0; value < 256; value++)
6152 if (!isLOWER(value))
6153 ANYOF_BITMAP_SET(ret, value);
6160 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
6162 for (value = 0; value < 256; value++)
6164 ANYOF_BITMAP_SET(ret, value);
6171 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
6173 for (value = 0; value < 256; value++)
6174 if (!isPRINT(value))
6175 ANYOF_BITMAP_SET(ret, value);
6182 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
6184 for (value = 0; value < 256; value++)
6185 if (isPSXSPC(value))
6186 ANYOF_BITMAP_SET(ret, value);
6193 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
6195 for (value = 0; value < 256; value++)
6196 if (!isPSXSPC(value))
6197 ANYOF_BITMAP_SET(ret, value);
6204 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
6206 for (value = 0; value < 256; value++)
6208 ANYOF_BITMAP_SET(ret, value);
6215 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
6217 for (value = 0; value < 256; value++)
6218 if (!isPUNCT(value))
6219 ANYOF_BITMAP_SET(ret, value);
6226 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
6228 for (value = 0; value < 256; value++)
6230 ANYOF_BITMAP_SET(ret, value);
6237 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
6239 for (value = 0; value < 256; value++)
6240 if (!isSPACE(value))
6241 ANYOF_BITMAP_SET(ret, value);
6248 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
6250 for (value = 0; value < 256; value++)
6252 ANYOF_BITMAP_SET(ret, value);
6259 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
6261 for (value = 0; value < 256; value++)
6262 if (!isUPPER(value))
6263 ANYOF_BITMAP_SET(ret, value);
6270 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
6272 for (value = 0; value < 256; value++)
6273 if (isXDIGIT(value))
6274 ANYOF_BITMAP_SET(ret, value);
6281 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
6283 for (value = 0; value < 256; value++)
6284 if (!isXDIGIT(value))
6285 ANYOF_BITMAP_SET(ret, value);
6291 /* this is to handle \p and \P */
6294 vFAIL("Invalid [::] class");
6298 /* Strings such as "+utf8::isWord\n" */
6299 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6302 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
6305 } /* end of namedclass \blah */
6308 if (prevvalue > (IV)value) /* b-a */ {
6309 const int w = RExC_parse - rangebegin;
6310 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
6311 range = 0; /* not a valid range */
6315 prevvalue = value; /* save the beginning of the range */
6316 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6317 RExC_parse[1] != ']') {
6320 /* a bad range like \w-, [:word:]- ? */
6321 if (namedclass > OOB_NAMEDCLASS) {
6322 if (ckWARN(WARN_REGEXP)) {
6324 RExC_parse >= rangebegin ?
6325 RExC_parse - rangebegin : 0;
6327 "False [] range \"%*.*s\"",
6331 ANYOF_BITMAP_SET(ret, '-');
6333 range = 1; /* yeah, it's a range! */
6334 continue; /* but do it the next time */
6338 /* now is the next time */
6339 /*stored += (value - prevvalue + 1);*/
6341 if (prevvalue < 256) {
6342 const IV ceilvalue = value < 256 ? value : 255;
6345 /* In EBCDIC [\x89-\x91] should include
6346 * the \x8e but [i-j] should not. */
6347 if (literal_endpoint == 2 &&
6348 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6349 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
6351 if (isLOWER(prevvalue)) {
6352 for (i = prevvalue; i <= ceilvalue; i++)
6354 ANYOF_BITMAP_SET(ret, i);
6356 for (i = prevvalue; i <= ceilvalue; i++)
6358 ANYOF_BITMAP_SET(ret, i);
6363 for (i = prevvalue; i <= ceilvalue; i++) {
6364 if (!ANYOF_BITMAP_TEST(ret,i)) {
6366 ANYOF_BITMAP_SET(ret, i);
6370 if (value > 255 || UTF) {
6371 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6372 const UV natvalue = NATIVE_TO_UNI(value);
6373 stored+=2; /* can't optimize this class */
6374 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6375 if (prevnatvalue < natvalue) { /* what about > ? */
6376 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
6377 prevnatvalue, natvalue);
6379 else if (prevnatvalue == natvalue) {
6380 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
6382 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
6384 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
6386 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6387 if (RExC_precomp[0] == ':' &&
6388 RExC_precomp[1] == '[' &&
6389 (f == 0xDF || f == 0x92)) {
6390 f = NATIVE_TO_UNI(f);
6393 /* If folding and foldable and a single
6394 * character, insert also the folded version
6395 * to the charclass. */
6397 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6398 if ((RExC_precomp[0] == ':' &&
6399 RExC_precomp[1] == '[' &&
6401 (value == 0xFB05 || value == 0xFB06))) ?
6402 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6403 foldlen == (STRLEN)UNISKIP(f) )
6405 if (foldlen == (STRLEN)UNISKIP(f))
6407 Perl_sv_catpvf(aTHX_ listsv,
6410 /* Any multicharacter foldings
6411 * require the following transform:
6412 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6413 * where E folds into "pq" and F folds
6414 * into "rst", all other characters
6415 * fold to single characters. We save
6416 * away these multicharacter foldings,
6417 * to be later saved as part of the
6418 * additional "s" data. */
6421 if (!unicode_alternate)
6422 unicode_alternate = newAV();
6423 sv = newSVpvn((char*)foldbuf, foldlen);
6425 av_push(unicode_alternate, sv);
6429 /* If folding and the value is one of the Greek
6430 * sigmas insert a few more sigmas to make the
6431 * folding rules of the sigmas to work right.
6432 * Note that not all the possible combinations
6433 * are handled here: some of them are handled
6434 * by the standard folding rules, and some of
6435 * them (literal or EXACTF cases) are handled
6436 * during runtime in regexec.c:S_find_byclass(). */
6437 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6438 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6439 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
6440 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6441 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
6443 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6444 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6445 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
6450 literal_endpoint = 0;
6454 range = 0; /* this range (if it was one) is done now */
6458 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
6460 RExC_size += ANYOF_CLASS_ADD_SKIP;
6462 RExC_emit += ANYOF_CLASS_ADD_SKIP;
6468 /****** !SIZE_ONLY AFTER HERE *********/
6470 if( stored == 1 && value < 256
6471 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6473 /* optimize single char class to an EXACT node
6474 but *only* when its not a UTF/high char */
6475 const char * cur_parse= RExC_parse;
6476 RExC_emit = (regnode *)orig_emit;
6477 RExC_parse = (char *)orig_parse;
6478 ret = reg_node(pRExC_state,
6479 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
6480 RExC_parse = (char *)cur_parse;
6481 *STRING(ret)= (char)value;
6483 RExC_emit += STR_SZ(1);
6486 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
6487 if ( /* If the only flag is folding (plus possibly inversion). */
6488 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6490 for (value = 0; value < 256; ++value) {
6491 if (ANYOF_BITMAP_TEST(ret, value)) {
6492 UV fold = PL_fold[value];
6495 ANYOF_BITMAP_SET(ret, fold);
6498 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
6501 /* optimize inverted simple patterns (e.g. [^a-z]) */
6502 if (optimize_invert &&
6503 /* If the only flag is inversion. */
6504 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
6505 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
6506 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
6507 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
6510 AV * const av = newAV();
6512 /* The 0th element stores the character class description
6513 * in its textual form: used later (regexec.c:Perl_regclass_swash())
6514 * to initialize the appropriate swash (which gets stored in
6515 * the 1st element), and also useful for dumping the regnode.
6516 * The 2nd element stores the multicharacter foldings,
6517 * used later (regexec.c:S_reginclass()). */
6518 av_store(av, 0, listsv);
6519 av_store(av, 1, NULL);
6520 av_store(av, 2, (SV*)unicode_alternate);
6521 rv = newRV_noinc((SV*)av);
6522 n = add_data(pRExC_state, 1, "s");
6523 RExC_rx->data->data[n] = (void*)rv;
6530 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
6532 char* const retval = RExC_parse++;
6535 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6536 RExC_parse[2] == '#') {
6537 while (*RExC_parse != ')') {
6538 if (RExC_parse == RExC_end)
6539 FAIL("Sequence (?#... not terminated");
6545 if (RExC_flags & PMf_EXTENDED) {
6546 if (isSPACE(*RExC_parse)) {
6550 else if (*RExC_parse == '#') {
6551 while (RExC_parse < RExC_end)
6552 if (*RExC_parse++ == '\n') break;
6561 - reg_node - emit a node
6563 STATIC regnode * /* Location. */
6564 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
6567 register regnode *ptr;
6568 regnode * const ret = RExC_emit;
6569 GET_RE_DEBUG_FLAGS_DECL;
6572 SIZE_ALIGN(RExC_size);
6576 NODE_ALIGN_FILL(ret);
6578 FILL_ADVANCE_NODE(ptr, op);
6579 if (RExC_offsets) { /* MJD */
6580 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6581 "reg_node", __LINE__,
6583 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6584 ? "Overwriting end of array!\n" : "OK",
6585 (UV)(RExC_emit - RExC_emit_start),
6586 (UV)(RExC_parse - RExC_start),
6587 (UV)RExC_offsets[0]));
6588 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6597 - reganode - emit a node with an argument
6599 STATIC regnode * /* Location. */
6600 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6603 register regnode *ptr;
6604 regnode * const ret = RExC_emit;
6605 GET_RE_DEBUG_FLAGS_DECL;
6608 SIZE_ALIGN(RExC_size);
6613 NODE_ALIGN_FILL(ret);
6615 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6616 if (RExC_offsets) { /* MJD */
6617 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6621 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6622 "Overwriting end of array!\n" : "OK",
6623 (UV)(RExC_emit - RExC_emit_start),
6624 (UV)(RExC_parse - RExC_start),
6625 (UV)RExC_offsets[0]));
6626 Set_Cur_Node_Offset;
6635 - reguni - emit (if appropriate) a Unicode character
6638 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6641 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6645 - reginsert - insert an operator in front of already-emitted operand
6647 * Means relocating the operand.
6650 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6653 register regnode *src;
6654 register regnode *dst;
6655 register regnode *place;
6656 const int offset = regarglen[(U8)op];
6657 GET_RE_DEBUG_FLAGS_DECL;
6658 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6661 RExC_size += NODE_STEP_REGNODE + offset;
6666 RExC_emit += NODE_STEP_REGNODE + offset;
6668 while (src > opnd) {
6669 StructCopy(--src, --dst, regnode);
6670 if (RExC_offsets) { /* MJD 20010112 */
6671 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6675 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6676 ? "Overwriting end of array!\n" : "OK",
6677 (UV)(src - RExC_emit_start),
6678 (UV)(dst - RExC_emit_start),
6679 (UV)RExC_offsets[0]));
6680 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6681 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6686 place = opnd; /* Op node, where operand used to be. */
6687 if (RExC_offsets) { /* MJD */
6688 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6692 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6693 ? "Overwriting end of array!\n" : "OK",
6694 (UV)(place - RExC_emit_start),
6695 (UV)(RExC_parse - RExC_start),
6696 (UV)RExC_offsets[0]));
6697 Set_Node_Offset(place, RExC_parse);
6698 Set_Node_Length(place, 1);
6700 src = NEXTOPER(place);
6701 FILL_ADVANCE_NODE(place, op);
6702 Zero(src, offset, regnode);
6706 - regtail - set the next-pointer at the end of a node chain of p to val.
6707 - SEE ALSO: regtail_study
6709 /* TODO: All three parms should be const */
6711 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6714 register regnode *scan;
6715 GET_RE_DEBUG_FLAGS_DECL;
6717 PERL_UNUSED_ARG(depth);
6723 /* Find last node. */
6726 regnode * const temp = regnext(scan);
6728 SV * const mysv=sv_newmortal();
6729 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6730 regprop(RExC_rx, mysv, scan);
6731 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6732 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6739 if (reg_off_by_arg[OP(scan)]) {
6740 ARG_SET(scan, val - scan);
6743 NEXT_OFF(scan) = val - scan;
6749 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6750 - Look for optimizable sequences at the same time.
6751 - currently only looks for EXACT chains.
6753 This is expermental code. The idea is to use this routine to perform
6754 in place optimizations on branches and groups as they are constructed,
6755 with the long term intention of removing optimization from study_chunk so
6756 that it is purely analytical.
6758 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6759 to control which is which.
6762 /* TODO: All four parms should be const */
6765 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6768 register regnode *scan;
6770 #ifdef EXPERIMENTAL_INPLACESCAN
6774 GET_RE_DEBUG_FLAGS_DECL;
6780 /* Find last node. */
6784 regnode * const temp = regnext(scan);
6785 #ifdef EXPERIMENTAL_INPLACESCAN
6786 if (PL_regkind[OP(scan)] == EXACT)
6787 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6795 if( exact == PSEUDO )
6797 else if ( exact != OP(scan) )
6806 SV * const mysv=sv_newmortal();
6807 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6808 regprop(RExC_rx, mysv, scan);
6809 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6810 SvPV_nolen_const(mysv),
6812 REG_NODE_NUM(scan));
6819 SV * const mysv_val=sv_newmortal();
6820 DEBUG_PARSE_MSG("");
6821 regprop(RExC_rx, mysv_val, val);
6822 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6823 SvPV_nolen_const(mysv_val),
6828 if (reg_off_by_arg[OP(scan)]) {
6829 ARG_SET(scan, val - scan);
6832 NEXT_OFF(scan) = val - scan;
6840 - regcurly - a little FSA that accepts {\d+,?\d*}
6843 S_regcurly(register const char *s)
6862 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6865 Perl_regdump(pTHX_ const regexp *r)
6869 SV * const sv = sv_newmortal();
6870 SV *dsv= sv_newmortal();
6872 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
6874 /* Header fields of interest. */
6875 if (r->anchored_substr) {
6876 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6877 RE_SV_DUMPLEN(r->anchored_substr), 30);
6878 PerlIO_printf(Perl_debug_log,
6879 "anchored %s%s at %"IVdf" ",
6880 s, RE_SV_TAIL(r->anchored_substr),
6881 (IV)r->anchored_offset);
6882 } else if (r->anchored_utf8) {
6883 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6884 RE_SV_DUMPLEN(r->anchored_utf8), 30);
6885 PerlIO_printf(Perl_debug_log,
6886 "anchored utf8 %s%s at %"IVdf" ",
6887 s, RE_SV_TAIL(r->anchored_utf8),
6888 (IV)r->anchored_offset);
6890 if (r->float_substr) {
6891 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6892 RE_SV_DUMPLEN(r->float_substr), 30);
6893 PerlIO_printf(Perl_debug_log,
6894 "floating %s%s at %"IVdf"..%"UVuf" ",
6895 s, RE_SV_TAIL(r->float_substr),
6896 (IV)r->float_min_offset, (UV)r->float_max_offset);
6897 } else if (r->float_utf8) {
6898 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6899 RE_SV_DUMPLEN(r->float_utf8), 30);
6900 PerlIO_printf(Perl_debug_log,
6901 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6902 s, RE_SV_TAIL(r->float_utf8),
6903 (IV)r->float_min_offset, (UV)r->float_max_offset);
6905 if (r->check_substr || r->check_utf8)
6906 PerlIO_printf(Perl_debug_log,
6908 (r->check_substr == r->float_substr
6909 && r->check_utf8 == r->float_utf8
6910 ? "(checking floating" : "(checking anchored"));
6911 if (r->reganch & ROPT_NOSCAN)
6912 PerlIO_printf(Perl_debug_log, " noscan");
6913 if (r->reganch & ROPT_CHECK_ALL)
6914 PerlIO_printf(Perl_debug_log, " isall");
6915 if (r->check_substr || r->check_utf8)
6916 PerlIO_printf(Perl_debug_log, ") ");
6918 if (r->regstclass) {
6919 regprop(r, sv, r->regstclass);
6920 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
6922 if (r->reganch & ROPT_ANCH) {
6923 PerlIO_printf(Perl_debug_log, "anchored");
6924 if (r->reganch & ROPT_ANCH_BOL)
6925 PerlIO_printf(Perl_debug_log, "(BOL)");
6926 if (r->reganch & ROPT_ANCH_MBOL)
6927 PerlIO_printf(Perl_debug_log, "(MBOL)");
6928 if (r->reganch & ROPT_ANCH_SBOL)
6929 PerlIO_printf(Perl_debug_log, "(SBOL)");
6930 if (r->reganch & ROPT_ANCH_GPOS)
6931 PerlIO_printf(Perl_debug_log, "(GPOS)");
6932 PerlIO_putc(Perl_debug_log, ' ');
6934 if (r->reganch & ROPT_GPOS_SEEN)
6935 PerlIO_printf(Perl_debug_log, "GPOS ");
6936 if (r->reganch & ROPT_SKIP)
6937 PerlIO_printf(Perl_debug_log, "plus ");
6938 if (r->reganch & ROPT_IMPLICIT)
6939 PerlIO_printf(Perl_debug_log, "implicit ");
6940 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6941 if (r->reganch & ROPT_EVAL_SEEN)
6942 PerlIO_printf(Perl_debug_log, "with eval ");
6943 PerlIO_printf(Perl_debug_log, "\n");
6945 PERL_UNUSED_CONTEXT;
6947 #endif /* DEBUGGING */
6951 - regprop - printable representation of opcode
6954 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6959 GET_RE_DEBUG_FLAGS_DECL;
6961 sv_setpvn(sv, "", 0);
6962 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6963 /* It would be nice to FAIL() here, but this may be called from
6964 regexec.c, and it would be hard to supply pRExC_state. */
6965 Perl_croak(aTHX_ "Corrupted regexp opcode");
6966 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6968 k = PL_regkind[OP(o)];
6971 SV * const dsv = sv_2mortal(newSVpvs(""));
6972 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6973 * is a crude hack but it may be the best for now since
6974 * we have no flag "this EXACTish node was UTF-8"
6976 const char * const s =
6977 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
6978 PL_colors[0], PL_colors[1],
6979 PERL_PV_ESCAPE_UNI_DETECT |
6980 PERL_PV_PRETTY_ELIPSES |
6983 Perl_sv_catpvf(aTHX_ sv, " %s", s );
6984 } else if (k == TRIE) {
6985 /* print the details of the trie in dumpuntil instead, as
6986 * prog->data isn't available here */
6987 const char op = OP(o);
6988 const I32 n = ARG(o);
6989 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
6990 (reg_ac_data *)prog->data->data[n] :
6992 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
6993 (reg_trie_data*)prog->data->data[n] :
6996 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6997 DEBUG_TRIE_COMPILE_r(
6998 Perl_sv_catpvf(aTHX_ sv,
6999 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7000 (UV)trie->startstate,
7001 (IV)trie->laststate-1,
7002 (UV)trie->wordcount,
7005 (UV)TRIE_CHARCOUNT(trie),
7006 (UV)trie->uniquecharcount
7009 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7011 int rangestart = -1;
7012 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
7013 Perl_sv_catpvf(aTHX_ sv, "[");
7014 for (i = 0; i <= 256; i++) {
7015 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7016 if (rangestart == -1)
7018 } else if (rangestart != -1) {
7019 if (i <= rangestart + 3)
7020 for (; rangestart < i; rangestart++)
7021 put_byte(sv, rangestart);
7023 put_byte(sv, rangestart);
7025 put_byte(sv, i - 1);
7030 Perl_sv_catpvf(aTHX_ sv, "]");
7033 } else if (k == CURLY) {
7034 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
7035 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7036 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
7038 else if (k == WHILEM && o->flags) /* Ordinal/of */
7039 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
7040 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
7041 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
7042 else if (k == LOGICAL)
7043 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
7044 else if (k == ANYOF) {
7045 int i, rangestart = -1;
7046 const U8 flags = ANYOF_FLAGS(o);
7048 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7049 static const char * const anyofs[] = {
7082 if (flags & ANYOF_LOCALE)
7083 sv_catpvs(sv, "{loc}");
7084 if (flags & ANYOF_FOLD)
7085 sv_catpvs(sv, "{i}");
7086 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
7087 if (flags & ANYOF_INVERT)
7089 for (i = 0; i <= 256; i++) {
7090 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
7091 if (rangestart == -1)
7093 } else if (rangestart != -1) {
7094 if (i <= rangestart + 3)
7095 for (; rangestart < i; rangestart++)
7096 put_byte(sv, rangestart);
7098 put_byte(sv, rangestart);
7100 put_byte(sv, i - 1);
7106 if (o->flags & ANYOF_CLASS)
7107 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
7108 if (ANYOF_CLASS_TEST(o,i))
7109 sv_catpv(sv, anyofs[i]);
7111 if (flags & ANYOF_UNICODE)
7112 sv_catpvs(sv, "{unicode}");
7113 else if (flags & ANYOF_UNICODE_ALL)
7114 sv_catpvs(sv, "{unicode_all}");
7118 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
7122 U8 s[UTF8_MAXBYTES_CASE+1];
7124 for (i = 0; i <= 256; i++) { /* just the first 256 */
7125 uvchr_to_utf8(s, i);
7127 if (i < 256 && swash_fetch(sw, s, TRUE)) {
7128 if (rangestart == -1)
7130 } else if (rangestart != -1) {
7131 if (i <= rangestart + 3)
7132 for (; rangestart < i; rangestart++) {
7133 const U8 * const e = uvchr_to_utf8(s,rangestart);
7135 for(p = s; p < e; p++)
7139 const U8 *e = uvchr_to_utf8(s,rangestart);
7141 for (p = s; p < e; p++)
7144 e = uvchr_to_utf8(s, i-1);
7145 for (p = s; p < e; p++)
7152 sv_catpvs(sv, "..."); /* et cetera */
7156 char *s = savesvpv(lv);
7157 char * const origs = s;
7159 while (*s && *s != '\n')
7163 const char * const t = ++s;
7181 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
7183 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
7184 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
7186 PERL_UNUSED_CONTEXT;
7187 PERL_UNUSED_ARG(sv);
7189 PERL_UNUSED_ARG(prog);
7190 #endif /* DEBUGGING */
7194 Perl_re_intuit_string(pTHX_ regexp *prog)
7195 { /* Assume that RE_INTUIT is set */
7197 GET_RE_DEBUG_FLAGS_DECL;
7198 PERL_UNUSED_CONTEXT;
7202 const char * const s = SvPV_nolen_const(prog->check_substr
7203 ? prog->check_substr : prog->check_utf8);
7205 if (!PL_colorset) reginitcolors();
7206 PerlIO_printf(Perl_debug_log,
7207 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
7209 prog->check_substr ? "" : "utf8 ",
7210 PL_colors[5],PL_colors[0],
7213 (strlen(s) > 60 ? "..." : ""));
7216 return prog->check_substr ? prog->check_substr : prog->check_utf8;
7220 Perl_pregfree(pTHX_ struct regexp *r)
7224 GET_RE_DEBUG_FLAGS_DECL;
7226 if (!r || (--r->refcnt > 0))
7232 SV *dsv= sv_newmortal();
7233 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
7234 dsv, r->precomp, r->prelen, 60);
7235 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
7236 PL_colors[4],PL_colors[5],s);
7240 /* gcov results gave these as non-null 100% of the time, so there's no
7241 optimisation in checking them before calling Safefree */
7242 Safefree(r->precomp);
7243 Safefree(r->offsets); /* 20010421 MJD */
7244 RX_MATCH_COPY_FREE(r);
7245 #ifdef PERL_OLD_COPY_ON_WRITE
7247 SvREFCNT_dec(r->saved_copy);
7250 if (r->anchored_substr)
7251 SvREFCNT_dec(r->anchored_substr);
7252 if (r->anchored_utf8)
7253 SvREFCNT_dec(r->anchored_utf8);
7254 if (r->float_substr)
7255 SvREFCNT_dec(r->float_substr);
7257 SvREFCNT_dec(r->float_utf8);
7258 Safefree(r->substrs);
7261 int n = r->data->count;
7262 PAD* new_comppad = NULL;
7267 /* If you add a ->what type here, update the comment in regcomp.h */
7268 switch (r->data->what[n]) {
7270 SvREFCNT_dec((SV*)r->data->data[n]);
7273 Safefree(r->data->data[n]);
7276 new_comppad = (AV*)r->data->data[n];
7279 if (new_comppad == NULL)
7280 Perl_croak(aTHX_ "panic: pregfree comppad");
7281 PAD_SAVE_LOCAL(old_comppad,
7282 /* Watch out for global destruction's random ordering. */
7283 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
7286 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
7289 op_free((OP_4tree*)r->data->data[n]);
7291 PAD_RESTORE_LOCAL(old_comppad);
7292 SvREFCNT_dec((SV*)new_comppad);
7298 { /* Aho Corasick add-on structure for a trie node.
7299 Used in stclass optimization only */
7301 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
7303 refcount = --aho->refcount;
7306 Safefree(aho->states);
7307 Safefree(aho->fail);
7308 aho->trie=NULL; /* not necessary to free this as it is
7309 handled by the 't' case */
7310 Safefree(r->data->data[n]); /* do this last!!!! */
7311 Safefree(r->regstclass);
7317 /* trie structure. */
7319 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
7321 refcount = --trie->refcount;
7324 Safefree(trie->charmap);
7325 if (trie->widecharmap)
7326 SvREFCNT_dec((SV*)trie->widecharmap);
7327 Safefree(trie->states);
7328 Safefree(trie->trans);
7330 Safefree(trie->bitmap);
7332 Safefree(trie->wordlen);
7334 Safefree(trie->jump);
7336 Safefree(trie->nextword);
7340 SvREFCNT_dec((SV*)trie->words);
7341 if (trie->revcharmap)
7342 SvREFCNT_dec((SV*)trie->revcharmap);
7345 Safefree(r->data->data[n]); /* do this last!!!! */
7350 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
7353 Safefree(r->data->what);
7356 Safefree(r->startp);
7361 #ifndef PERL_IN_XSUB_RE
7363 - regnext - dig the "next" pointer out of a node
7366 Perl_regnext(pTHX_ register regnode *p)
7369 register I32 offset;
7371 if (p == &PL_regdummy)
7374 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7383 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
7386 STRLEN l1 = strlen(pat1);
7387 STRLEN l2 = strlen(pat2);
7390 const char *message;
7396 Copy(pat1, buf, l1 , char);
7397 Copy(pat2, buf + l1, l2 , char);
7398 buf[l1 + l2] = '\n';
7399 buf[l1 + l2 + 1] = '\0';
7401 /* ANSI variant takes additional second argument */
7402 va_start(args, pat2);
7406 msv = vmess(buf, &args);
7408 message = SvPV_const(msv,l1);
7411 Copy(message, buf, l1 , char);
7412 buf[l1-1] = '\0'; /* Overwrite \n */
7413 Perl_croak(aTHX_ "%s", buf);
7416 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7418 #ifndef PERL_IN_XSUB_RE
7420 Perl_save_re_context(pTHX)
7424 struct re_save_state *state;
7426 SAVEVPTR(PL_curcop);
7427 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
7429 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
7430 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
7431 SSPUSHINT(SAVEt_RE_STATE);
7433 Copy(&PL_reg_state, state, 1, struct re_save_state);
7435 PL_reg_start_tmp = 0;
7436 PL_reg_start_tmpl = 0;
7437 PL_reg_oldsaved = NULL;
7438 PL_reg_oldsavedlen = 0;
7440 PL_reg_leftiter = 0;
7441 PL_reg_poscache = NULL;
7442 PL_reg_poscache_size = 0;
7443 #ifdef PERL_OLD_COPY_ON_WRITE
7447 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
7449 const REGEXP * const rx = PM_GETRE(PL_curpm);
7452 for (i = 1; i <= rx->nparens; i++) {
7453 char digits[TYPE_CHARS(long)];
7454 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
7455 GV *const *const gvp
7456 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
7459 GV * const gv = *gvp;
7460 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
7470 clear_re(pTHX_ void *r)
7473 ReREFCNT_dec((regexp *)r);
7479 S_put_byte(pTHX_ SV *sv, int c)
7481 if (isCNTRL(c) || c == 255 || !isPRINT(c))
7482 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
7483 else if (c == '-' || c == ']' || c == '\\' || c == '^')
7484 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
7486 Perl_sv_catpvf(aTHX_ sv, "%c", c);
7490 #define CLEAR_OPTSTART \
7491 if (optstart) STMT_START { \
7492 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
7496 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
7498 STATIC const regnode *
7499 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
7500 const regnode *last, const regnode *plast,
7501 SV* sv, I32 indent, U32 depth)
7504 register U8 op = PSEUDO; /* Arbitrary non-END op. */
7505 register const regnode *next;
7506 const regnode *optstart= NULL;
7507 GET_RE_DEBUG_FLAGS_DECL;
7509 #ifdef DEBUG_DUMPUNTIL
7510 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
7511 last ? last-start : 0,plast ? plast-start : 0);
7514 if (plast && plast < last)
7517 while (PL_regkind[op] != END && (!last || node < last)) {
7518 /* While that wasn't END last time... */
7524 next = regnext((regnode *)node);
7527 if (OP(node) == OPTIMIZED) {
7528 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
7535 regprop(r, sv, node);
7536 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
7537 (int)(2*indent + 1), "", SvPVX_const(sv));
7539 if (OP(node) != OPTIMIZED) {
7540 if (next == NULL) /* Next ptr. */
7541 PerlIO_printf(Perl_debug_log, "(0)");
7542 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
7543 PerlIO_printf(Perl_debug_log, "(FAIL)");
7545 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
7547 /*if (PL_regkind[(U8)op] != TRIE)*/
7548 (void)PerlIO_putc(Perl_debug_log, '\n');
7552 if (PL_regkind[(U8)op] == BRANCHJ) {
7555 register const regnode *nnode = (OP(next) == LONGJMP
7556 ? regnext((regnode *)next)
7558 if (last && nnode > last)
7560 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
7563 else if (PL_regkind[(U8)op] == BRANCH) {
7565 DUMPUNTIL(NEXTOPER(node), next);
7567 else if ( PL_regkind[(U8)op] == TRIE ) {
7568 const char op = OP(node);
7569 const I32 n = ARG(node);
7570 const reg_ac_data * const ac = op>=AHOCORASICK ?
7571 (reg_ac_data *)r->data->data[n] :
7573 const reg_trie_data * const trie = op<AHOCORASICK ?
7574 (reg_trie_data*)r->data->data[n] :
7576 const regnode *nextbranch= NULL;
7578 sv_setpvn(sv, "", 0);
7579 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
7580 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
7582 PerlIO_printf(Perl_debug_log, "%*s%s ",
7583 (int)(2*(indent+3)), "",
7584 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
7585 PL_colors[0], PL_colors[1],
7586 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7587 PERL_PV_PRETTY_ELIPSES |
7593 U16 dist= trie->jump[word_idx+1];
7594 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
7597 nextbranch= next - trie->jump[0];
7598 DUMPUNTIL(next - dist, nextbranch);
7600 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
7601 nextbranch= regnext((regnode *)nextbranch);
7603 PerlIO_printf(Perl_debug_log, "\n");
7606 if (last && next > last)
7611 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
7612 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
7613 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
7615 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7617 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
7619 else if ( op == PLUS || op == STAR) {
7620 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
7622 else if (op == ANYOF) {
7623 /* arglen 1 + class block */
7624 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7625 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7626 node = NEXTOPER(node);
7628 else if (PL_regkind[(U8)op] == EXACT) {
7629 /* Literal string, where present. */
7630 node += NODE_SZ_STR(node) - 1;
7631 node = NEXTOPER(node);
7634 node = NEXTOPER(node);
7635 node += regarglen[(U8)op];
7637 if (op == CURLYX || op == OPEN)
7639 else if (op == WHILEM)
7643 #ifdef DEBUG_DUMPUNTIL
7644 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
7649 #endif /* DEBUGGING */
7653 * c-indentation-style: bsd
7655 * indent-tabs-mode: t
7658 * ex: set ts=8 sts=4 sw=4 noet: