5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
106 char *start; /* Start of input for compile */
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
110 regnode *emit_start; /* Start of emitted-code area */
111 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
122 char *starttry; /* -Dr: where regtry was called. */
123 #define RExC_starttry (pRExC_state->starttry)
126 const char *lastparse;
128 #define RExC_lastparse (pRExC_state->lastparse)
129 #define RExC_lastnum (pRExC_state->lastnum)
133 #define RExC_flags (pRExC_state->flags)
134 #define RExC_precomp (pRExC_state->precomp)
135 #define RExC_rx (pRExC_state->rx)
136 #define RExC_start (pRExC_state->start)
137 #define RExC_end (pRExC_state->end)
138 #define RExC_parse (pRExC_state->parse)
139 #define RExC_whilem_seen (pRExC_state->whilem_seen)
140 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
141 #define RExC_emit (pRExC_state->emit)
142 #define RExC_emit_start (pRExC_state->emit_start)
143 #define RExC_naughty (pRExC_state->naughty)
144 #define RExC_sawback (pRExC_state->sawback)
145 #define RExC_seen (pRExC_state->seen)
146 #define RExC_size (pRExC_state->size)
147 #define RExC_npar (pRExC_state->npar)
148 #define RExC_extralen (pRExC_state->extralen)
149 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151 #define RExC_utf8 (pRExC_state->utf8)
153 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
158 #undef SPSTART /* dratted cpp namespace... */
161 * Flags to be passed up and down.
163 #define WORST 0 /* Worst case. */
164 #define HASWIDTH 0x1 /* Known to match non-null strings. */
165 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 0x4 /* Starts with * or +. */
167 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
169 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
171 /* whether trie related optimizations are enabled */
172 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173 #define TRIE_STUDY_OPT
174 #define FULL_TRIE_STUDY
179 /* About scan_data_t.
181 During optimisation we recurse through the regexp program performing
182 various inplace (keyhole style) optimisations. In addition study_chunk
183 and scan_commit populate this data structure with information about
184 what strings MUST appear in the pattern. We look for the longest
185 string that must appear for at a fixed location, and we look for the
186 longest string that may appear at a floating location. So for instance
191 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
192 strings (because they follow a .* construct). study_chunk will identify
193 both FOO and BAR as being the longest fixed and floating strings respectively.
195 The strings can be composites, for instance
199 will result in a composite fixed substring 'foo'.
201 For each string some basic information is maintained:
203 - offset or min_offset
204 This is the position the string must appear at, or not before.
205 It also implicitly (when combined with minlenp) tells us how many
206 character must match before the string we are searching.
207 Likewise when combined with minlenp and the length of the string
208 tells us how many characters must appear after the string we have
212 Only used for floating strings. This is the rightmost point that
213 the string can appear at. Ifset to I32 max it indicates that the
214 string can occur infinitely far to the right.
217 A pointer to the minimum length of the pattern that the string
218 was found inside. This is important as in the case of positive
219 lookahead or positive lookbehind we can have multiple patterns
224 The minimum length of the pattern overall is 3, the minimum length
225 of the lookahead part is 3, but the minimum length of the part that
226 will actually match is 1. So 'FOO's minimum length is 3, but the
227 minimum length for the F is 1. This is important as the minimum length
228 is used to determine offsets in front of and behind the string being
229 looked for. Since strings can be composites this is the length of the
230 pattern at the time it was commited with a scan_commit. Note that
231 the length is calculated by study_chunk, so that the minimum lengths
232 are not known until the full pattern has been compiled, thus the
233 pointer to the value.
237 In the case of lookbehind the string being searched for can be
238 offset past the start point of the final matching string.
239 If this value was just blithely removed from the min_offset it would
240 invalidate some of the calculations for how many chars must match
241 before or after (as they are derived from min_offset and minlen and
242 the length of the string being searched for).
243 When the final pattern is compiled and the data is moved from the
244 scan_data_t structure into the regexp structure the information
245 about lookbehind is factored in, with the information that would
246 have been lost precalculated in the end_shift field for the
249 The fields pos_min and pos_delta are used to store the minimum offset
250 and the delta to the maximum offset at the current point in the pattern.
254 typedef struct scan_data_t {
255 /*I32 len_min; unused */
256 /*I32 len_delta; unused */
260 I32 last_end; /* min value, <0 unless valid. */
263 SV **longest; /* Either &l_fixed, or &l_float. */
264 SV *longest_fixed; /* longest fixed string found in pattern */
265 I32 offset_fixed; /* offset where it starts */
266 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
267 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
268 SV *longest_float; /* longest floating string found in pattern */
269 I32 offset_float_min; /* earliest point in string it can appear */
270 I32 offset_float_max; /* latest point in string it can appear */
271 I32 *minlen_float; /* pointer to the minlen relevent to the string */
272 I32 lookbehind_float; /* is the position of the string modified by LB */
276 struct regnode_charclass_class *start_class;
280 * Forward declarations for pregcomp()'s friends.
283 static const scan_data_t zero_scan_data =
284 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
286 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
287 #define SF_BEFORE_SEOL 0x0001
288 #define SF_BEFORE_MEOL 0x0002
289 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
290 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
293 # define SF_FIX_SHIFT_EOL (0+2)
294 # define SF_FL_SHIFT_EOL (0+4)
296 # define SF_FIX_SHIFT_EOL (+2)
297 # define SF_FL_SHIFT_EOL (+4)
300 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
301 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
303 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
304 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
305 #define SF_IS_INF 0x0040
306 #define SF_HAS_PAR 0x0080
307 #define SF_IN_PAR 0x0100
308 #define SF_HAS_EVAL 0x0200
309 #define SCF_DO_SUBSTR 0x0400
310 #define SCF_DO_STCLASS_AND 0x0800
311 #define SCF_DO_STCLASS_OR 0x1000
312 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
313 #define SCF_WHILEM_VISITED_POS 0x2000
315 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
318 #define UTF (RExC_utf8 != 0)
319 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
320 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
322 #define OOB_UNICODE 12345678
323 #define OOB_NAMEDCLASS -1
325 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
326 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
329 /* length of regex to show in messages that don't mark a position within */
330 #define RegexLengthToShowInErrorMessages 127
333 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
334 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
335 * op/pragma/warn/regcomp.
337 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
338 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
340 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
343 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
344 * arg. Show regex, up to a maximum length. If it's too long, chop and add
347 #define FAIL(msg) STMT_START { \
348 const char *ellipses = ""; \
349 IV len = RExC_end - RExC_precomp; \
352 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
353 if (len > RegexLengthToShowInErrorMessages) { \
354 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
355 len = RegexLengthToShowInErrorMessages - 10; \
358 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
359 msg, (int)len, RExC_precomp, ellipses); \
363 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
365 #define Simple_vFAIL(m) STMT_START { \
366 const IV offset = RExC_parse - RExC_precomp; \
367 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
368 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
372 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
374 #define vFAIL(m) STMT_START { \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 * Like Simple_vFAIL(), but accepts two arguments.
383 #define Simple_vFAIL2(m,a1) STMT_START { \
384 const IV offset = RExC_parse - RExC_precomp; \
385 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
386 (int)offset, RExC_precomp, RExC_precomp + offset); \
390 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
392 #define vFAIL2(m,a1) STMT_START { \
394 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
395 Simple_vFAIL2(m, a1); \
400 * Like Simple_vFAIL(), but accepts three arguments.
402 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
403 const IV offset = RExC_parse - RExC_precomp; \
404 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
405 (int)offset, RExC_precomp, RExC_precomp + offset); \
409 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
411 #define vFAIL3(m,a1,a2) STMT_START { \
413 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
414 Simple_vFAIL3(m, a1, a2); \
418 * Like Simple_vFAIL(), but accepts four arguments.
420 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
421 const IV offset = RExC_parse - RExC_precomp; \
422 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
423 (int)offset, RExC_precomp, RExC_precomp + offset); \
426 #define vWARN(loc,m) STMT_START { \
427 const IV offset = loc - RExC_precomp; \
428 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
432 #define vWARNdep(loc,m) STMT_START { \
433 const IV offset = loc - RExC_precomp; \
434 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
435 "%s" REPORT_LOCATION, \
436 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
440 #define vWARN2(loc, m, a1) STMT_START { \
441 const IV offset = loc - RExC_precomp; \
442 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
443 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
446 #define vWARN3(loc, m, a1, a2) STMT_START { \
447 const IV offset = loc - RExC_precomp; \
448 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
449 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
452 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
453 const IV offset = loc - RExC_precomp; \
454 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
455 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
458 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
459 const IV offset = loc - RExC_precomp; \
460 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
461 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
465 /* Allow for side effects in s */
466 #define REGC(c,s) STMT_START { \
467 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
470 /* Macros for recording node offsets. 20001227 mjd@plover.com
471 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
472 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
473 * Element 0 holds the number n.
474 * Position is 1 indexed.
477 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
479 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
480 __LINE__, (node), (int)(byte))); \
482 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
484 RExC_offsets[2*(node)-1] = (byte); \
489 #define Set_Node_Offset(node,byte) \
490 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
491 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
493 #define Set_Node_Length_To_R(node,len) STMT_START { \
495 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
496 __LINE__, (int)(node), (int)(len))); \
498 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
500 RExC_offsets[2*(node)] = (len); \
505 #define Set_Node_Length(node,len) \
506 Set_Node_Length_To_R((node)-RExC_emit_start, len)
507 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
508 #define Set_Node_Cur_Length(node) \
509 Set_Node_Length(node, RExC_parse - parse_start)
511 /* Get offsets and lengths */
512 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
513 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
515 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
516 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
517 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
521 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
522 #define EXPERIMENTAL_INPLACESCAN
525 #define DEBUG_STUDYDATA(data,depth) \
526 DEBUG_OPTIMISE_r(if(data){ \
527 PerlIO_printf(Perl_debug_log, \
528 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
529 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
530 (int)(depth)*2, "", \
531 (IV)((data)->pos_min), \
532 (IV)((data)->pos_delta), \
533 (IV)((data)->flags), \
534 (IV)((data)->whilem_c), \
535 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
537 if ((data)->last_found) \
538 PerlIO_printf(Perl_debug_log, \
539 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
540 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
541 SvPVX_const((data)->last_found), \
542 (IV)((data)->last_end), \
543 (IV)((data)->last_start_min), \
544 (IV)((data)->last_start_max), \
545 ((data)->longest && \
546 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
547 SvPVX_const((data)->longest_fixed), \
548 (IV)((data)->offset_fixed), \
549 ((data)->longest && \
550 (data)->longest==&((data)->longest_float)) ? "*" : "", \
551 SvPVX_const((data)->longest_float), \
552 (IV)((data)->offset_float_min), \
553 (IV)((data)->offset_float_max) \
555 PerlIO_printf(Perl_debug_log,"\n"); \
558 static void clear_re(pTHX_ void *r);
560 /* Mark that we cannot extend a found fixed substring at this point.
561 Update the longest found anchored substring and the longest found
562 floating substrings if needed. */
565 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
567 const STRLEN l = CHR_SVLEN(data->last_found);
568 const STRLEN old_l = CHR_SVLEN(*data->longest);
569 GET_RE_DEBUG_FLAGS_DECL;
571 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
572 SvSetMagicSV(*data->longest, data->last_found);
573 if (*data->longest == data->longest_fixed) {
574 data->offset_fixed = l ? data->last_start_min : data->pos_min;
575 if (data->flags & SF_BEFORE_EOL)
577 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
579 data->flags &= ~SF_FIX_BEFORE_EOL;
580 data->minlen_fixed=minlenp;
581 data->lookbehind_fixed=0;
584 data->offset_float_min = l ? data->last_start_min : data->pos_min;
585 data->offset_float_max = (l
586 ? data->last_start_max
587 : data->pos_min + data->pos_delta);
588 if ((U32)data->offset_float_max > (U32)I32_MAX)
589 data->offset_float_max = I32_MAX;
590 if (data->flags & SF_BEFORE_EOL)
592 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
594 data->flags &= ~SF_FL_BEFORE_EOL;
595 data->minlen_float=minlenp;
596 data->lookbehind_float=0;
599 SvCUR_set(data->last_found, 0);
601 SV * const sv = data->last_found;
602 if (SvUTF8(sv) && SvMAGICAL(sv)) {
603 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
609 data->flags &= ~SF_BEFORE_EOL;
610 DEBUG_STUDYDATA(data,0);
613 /* Can match anything (initialization) */
615 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
617 ANYOF_CLASS_ZERO(cl);
618 ANYOF_BITMAP_SETALL(cl);
619 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
621 cl->flags |= ANYOF_LOCALE;
624 /* Can match anything (initialization) */
626 S_cl_is_anything(const struct regnode_charclass_class *cl)
630 for (value = 0; value <= ANYOF_MAX; value += 2)
631 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
633 if (!(cl->flags & ANYOF_UNICODE_ALL))
635 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
640 /* Can match anything (initialization) */
642 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
644 Zero(cl, 1, struct regnode_charclass_class);
646 cl_anything(pRExC_state, cl);
650 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
652 Zero(cl, 1, struct regnode_charclass_class);
654 cl_anything(pRExC_state, cl);
656 cl->flags |= ANYOF_LOCALE;
659 /* 'And' a given class with another one. Can create false positives */
660 /* We assume that cl is not inverted */
662 S_cl_and(struct regnode_charclass_class *cl,
663 const struct regnode_charclass_class *and_with)
665 if (!(and_with->flags & ANYOF_CLASS)
666 && !(cl->flags & ANYOF_CLASS)
667 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
668 && !(and_with->flags & ANYOF_FOLD)
669 && !(cl->flags & ANYOF_FOLD)) {
672 if (and_with->flags & ANYOF_INVERT)
673 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
674 cl->bitmap[i] &= ~and_with->bitmap[i];
676 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
677 cl->bitmap[i] &= and_with->bitmap[i];
678 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
679 if (!(and_with->flags & ANYOF_EOS))
680 cl->flags &= ~ANYOF_EOS;
682 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
683 !(and_with->flags & ANYOF_INVERT)) {
684 cl->flags &= ~ANYOF_UNICODE_ALL;
685 cl->flags |= ANYOF_UNICODE;
686 ARG_SET(cl, ARG(and_with));
688 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
689 !(and_with->flags & ANYOF_INVERT))
690 cl->flags &= ~ANYOF_UNICODE_ALL;
691 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
692 !(and_with->flags & ANYOF_INVERT))
693 cl->flags &= ~ANYOF_UNICODE;
696 /* 'OR' a given class with another one. Can create false positives */
697 /* We assume that cl is not inverted */
699 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
701 if (or_with->flags & ANYOF_INVERT) {
703 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
704 * <= (B1 | !B2) | (CL1 | !CL2)
705 * which is wasteful if CL2 is small, but we ignore CL2:
706 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
707 * XXXX Can we handle case-fold? Unclear:
708 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
709 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
711 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
712 && !(or_with->flags & ANYOF_FOLD)
713 && !(cl->flags & ANYOF_FOLD) ) {
716 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
717 cl->bitmap[i] |= ~or_with->bitmap[i];
718 } /* XXXX: logic is complicated otherwise */
720 cl_anything(pRExC_state, cl);
723 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
724 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
725 && (!(or_with->flags & ANYOF_FOLD)
726 || (cl->flags & ANYOF_FOLD)) ) {
729 /* OR char bitmap and class bitmap separately */
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] |= or_with->bitmap[i];
732 if (or_with->flags & ANYOF_CLASS) {
733 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
734 cl->classflags[i] |= or_with->classflags[i];
735 cl->flags |= ANYOF_CLASS;
738 else { /* XXXX: logic is complicated, leave it along for a moment. */
739 cl_anything(pRExC_state, cl);
742 if (or_with->flags & ANYOF_EOS)
743 cl->flags |= ANYOF_EOS;
745 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
746 ARG(cl) != ARG(or_with)) {
747 cl->flags |= ANYOF_UNICODE_ALL;
748 cl->flags &= ~ANYOF_UNICODE;
750 if (or_with->flags & ANYOF_UNICODE_ALL) {
751 cl->flags |= ANYOF_UNICODE_ALL;
752 cl->flags &= ~ANYOF_UNICODE;
756 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
757 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
758 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
759 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
765 dump_trie_interim_list(trie,next_alloc)
766 dump_trie_interim_table(trie,next_alloc)
768 These routines dump out a trie in a somewhat readable format.
769 The _interim_ variants are used for debugging the interim
770 tables that are used to generate the final compressed
771 representation which is what dump_trie expects.
773 Part of the reason for their existance is to provide a form
774 of documentation as to how the different representations function.
780 Dumps the final compressed table form of the trie to Perl_debug_log.
781 Used for debugging make_trie().
785 S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
788 SV *sv=sv_newmortal();
789 int colwidth= trie->widecharmap ? 6 : 4;
790 GET_RE_DEBUG_FLAGS_DECL;
793 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
794 (int)depth * 2 + 2,"",
795 "Match","Base","Ofs" );
797 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
798 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
800 PerlIO_printf( Perl_debug_log, "%*s",
802 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
803 PL_colors[0], PL_colors[1],
804 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
805 PERL_PV_ESCAPE_FIRSTCHAR
810 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
811 (int)depth * 2 + 2,"");
813 for( state = 0 ; state < trie->uniquecharcount ; state++ )
814 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
815 PerlIO_printf( Perl_debug_log, "\n");
817 for( state = 1 ; state < trie->laststate ; state++ ) {
818 const U32 base = trie->states[ state ].trans.base;
820 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
822 if ( trie->states[ state ].wordnum ) {
823 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
825 PerlIO_printf( Perl_debug_log, "%6s", "" );
828 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
833 while( ( base + ofs < trie->uniquecharcount ) ||
834 ( base + ofs - trie->uniquecharcount < trie->lasttrans
835 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
838 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
840 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
841 if ( ( base + ofs >= trie->uniquecharcount ) &&
842 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
843 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
845 PerlIO_printf( Perl_debug_log, "%*"UVXf,
847 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
849 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
853 PerlIO_printf( Perl_debug_log, "]");
856 PerlIO_printf( Perl_debug_log, "\n" );
860 dump_trie_interim_list(trie,next_alloc)
861 Dumps a fully constructed but uncompressed trie in list form.
862 List tries normally only are used for construction when the number of
863 possible chars (trie->uniquecharcount) is very high.
864 Used for debugging make_trie().
867 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
870 SV *sv=sv_newmortal();
871 int colwidth= trie->widecharmap ? 6 : 4;
872 GET_RE_DEBUG_FLAGS_DECL;
873 /* print out the table precompression. */
874 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
875 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
876 "------:-----+-----------------\n" );
878 for( state=1 ; state < next_alloc ; state ++ ) {
881 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
882 (int)depth * 2 + 2,"", (UV)state );
883 if ( ! trie->states[ state ].wordnum ) {
884 PerlIO_printf( Perl_debug_log, "%5s| ","");
886 PerlIO_printf( Perl_debug_log, "W%4x| ",
887 trie->states[ state ].wordnum
890 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
891 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
893 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
895 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
896 PL_colors[0], PL_colors[1],
897 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
898 PERL_PV_ESCAPE_FIRSTCHAR
900 TRIE_LIST_ITEM(state,charid).forid,
901 (UV)TRIE_LIST_ITEM(state,charid).newstate
905 PerlIO_printf( Perl_debug_log, "\n");
910 dump_trie_interim_table(trie,next_alloc)
911 Dumps a fully constructed but uncompressed trie in table form.
912 This is the normal DFA style state transition table, with a few
913 twists to facilitate compression later.
914 Used for debugging make_trie().
917 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
921 SV *sv=sv_newmortal();
922 int colwidth= trie->widecharmap ? 6 : 4;
923 GET_RE_DEBUG_FLAGS_DECL;
926 print out the table precompression so that we can do a visual check
927 that they are identical.
930 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
932 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
933 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
935 PerlIO_printf( Perl_debug_log, "%*s",
937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
946 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
948 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
949 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
952 PerlIO_printf( Perl_debug_log, "\n" );
954 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
956 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
957 (int)depth * 2 + 2,"",
958 (UV)TRIE_NODENUM( state ) );
960 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
961 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
963 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
965 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
967 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
968 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
970 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
971 trie->states[ TRIE_NODENUM( state ) ].wordnum );
978 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
979 startbranch: the first branch in the whole branch sequence
980 first : start branch of sequence of branch-exact nodes.
981 May be the same as startbranch
982 last : Thing following the last branch.
983 May be the same as tail.
984 tail : item following the branch sequence
985 count : words in the sequence
986 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
989 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
991 A trie is an N'ary tree where the branches are determined by digital
992 decomposition of the key. IE, at the root node you look up the 1st character and
993 follow that branch repeat until you find the end of the branches. Nodes can be
994 marked as "accepting" meaning they represent a complete word. Eg:
998 would convert into the following structure. Numbers represent states, letters
999 following numbers represent valid transitions on the letter from that state, if
1000 the number is in square brackets it represents an accepting state, otherwise it
1001 will be in parenthesis.
1003 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1007 (1) +-i->(6)-+-s->[7]
1009 +-s->(3)-+-h->(4)-+-e->[5]
1011 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1013 This shows that when matching against the string 'hers' we will begin at state 1
1014 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1015 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1016 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1017 single traverse. We store a mapping from accepting to state to which word was
1018 matched, and then when we have multiple possibilities we try to complete the
1019 rest of the regex in the order in which they occured in the alternation.
1021 The only prior NFA like behaviour that would be changed by the TRIE support is
1022 the silent ignoring of duplicate alternations which are of the form:
1024 / (DUPE|DUPE) X? (?{ ... }) Y /x
1026 Thus EVAL blocks follwing a trie may be called a different number of times with
1027 and without the optimisation. With the optimisations dupes will be silently
1028 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1029 the following demonstrates:
1031 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1033 which prints out 'word' three times, but
1035 'words'=~/(word|word|word)(?{ print $1 })S/
1037 which doesnt print it out at all. This is due to other optimisations kicking in.
1039 Example of what happens on a structural level:
1041 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1043 1: CURLYM[1] {1,32767}(18)
1054 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1055 and should turn into:
1057 1: CURLYM[1] {1,32767}(18)
1059 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1067 Cases where tail != last would be like /(?foo|bar)baz/:
1077 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1078 and would end up looking like:
1081 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1088 d = uvuni_to_utf8_flags(d, uv, 0);
1090 is the recommended Unicode-aware way of saying
1095 #define TRIE_STORE_REVCHAR \
1097 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1098 if (UTF) SvUTF8_on(tmp); \
1099 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1102 #define TRIE_READ_CHAR STMT_START { \
1106 if ( foldlen > 0 ) { \
1107 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1112 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1113 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1114 foldlen -= UNISKIP( uvc ); \
1115 scan = foldbuf + UNISKIP( uvc ); \
1118 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1128 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1129 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1130 TRIE_LIST_LEN( state ) *= 2; \
1131 Renew( trie->states[ state ].trans.list, \
1132 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1134 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1135 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1136 TRIE_LIST_CUR( state )++; \
1139 #define TRIE_LIST_NEW(state) STMT_START { \
1140 Newxz( trie->states[ state ].trans.list, \
1141 4, reg_trie_trans_le ); \
1142 TRIE_LIST_CUR( state ) = 1; \
1143 TRIE_LIST_LEN( state ) = 4; \
1146 #define TRIE_HANDLE_WORD(state) STMT_START { \
1147 U16 dupe= trie->states[ state ].wordnum; \
1148 regnode * const noper_next = regnext( noper ); \
1150 if (trie->wordlen) \
1151 trie->wordlen[ curword ] = wordlen; \
1153 /* store the word for dumping */ \
1155 if (OP(noper) != NOTHING) \
1156 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1158 tmp = newSVpvn( "", 0 ); \
1159 if ( UTF ) SvUTF8_on( tmp ); \
1160 av_push( trie->words, tmp ); \
1165 if ( noper_next < tail ) { \
1167 Newxz( trie->jump, word_count + 1, U16); \
1168 trie->jump[curword] = (U16)(tail - noper_next); \
1170 jumper = noper_next; \
1172 nextbranch= regnext(cur); \
1176 /* So it's a dupe. This means we need to maintain a */\
1177 /* linked-list from the first to the next. */\
1178 /* we only allocate the nextword buffer when there */\
1179 /* a dupe, so first time we have to do the allocation */\
1180 if (!trie->nextword) \
1181 Newxz( trie->nextword, word_count + 1, U16); \
1182 while ( trie->nextword[dupe] ) \
1183 dupe= trie->nextword[dupe]; \
1184 trie->nextword[dupe]= curword; \
1186 /* we haven't inserted this word yet. */ \
1187 trie->states[ state ].wordnum = curword; \
1192 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1193 ( ( base + charid >= ucharcount \
1194 && base + charid < ubound \
1195 && state == trie->trans[ base - ucharcount + charid ].check \
1196 && trie->trans[ base - ucharcount + charid ].next ) \
1197 ? trie->trans[ base - ucharcount + charid ].next \
1198 : ( state==1 ? special : 0 ) \
1202 #define MADE_JUMP_TRIE 2
1203 #define MADE_EXACT_TRIE 4
1206 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1209 /* first pass, loop through and scan words */
1210 reg_trie_data *trie;
1212 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1217 regnode *jumper = NULL;
1218 regnode *nextbranch = NULL;
1219 /* we just use folder as a flag in utf8 */
1220 const U8 * const folder = ( flags == EXACTF
1222 : ( flags == EXACTFL
1228 const U32 data_slot = add_data( pRExC_state, 1, "t" );
1229 SV *re_trie_maxbuff;
1231 /* these are only used during construction but are useful during
1232 * debugging so we store them in the struct when debugging.
1234 STRLEN trie_charcount=0;
1235 AV *trie_revcharmap;
1237 GET_RE_DEBUG_FLAGS_DECL;
1239 PERL_UNUSED_ARG(depth);
1242 Newxz( trie, 1, reg_trie_data );
1244 trie->startstate = 1;
1245 trie->wordcount = word_count;
1246 RExC_rx->data->data[ data_slot ] = (void*)trie;
1247 Newxz( trie->charmap, 256, U16 );
1248 if (!(UTF && folder))
1249 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
1251 trie->words = newAV();
1253 TRIE_REVCHARMAP(trie) = newAV();
1255 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1256 if (!SvIOK(re_trie_maxbuff)) {
1257 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1260 PerlIO_printf( Perl_debug_log,
1261 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1262 (int)depth * 2 + 2, "",
1263 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1264 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1267 /* -- First loop and Setup --
1269 We first traverse the branches and scan each word to determine if it
1270 contains widechars, and how many unique chars there are, this is
1271 important as we have to build a table with at least as many columns as we
1274 We use an array of integers to represent the character codes 0..255
1275 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1276 native representation of the character value as the key and IV's for the
1279 *TODO* If we keep track of how many times each character is used we can
1280 remap the columns so that the table compression later on is more
1281 efficient in terms of memory by ensuring most common value is in the
1282 middle and the least common are on the outside. IMO this would be better
1283 than a most to least common mapping as theres a decent chance the most
1284 common letter will share a node with the least common, meaning the node
1285 will not be compressable. With a middle is most common approach the worst
1286 case is when we have the least common nodes twice.
1290 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1291 regnode * const noper = NEXTOPER( cur );
1292 const U8 *uc = (U8*)STRING( noper );
1293 const U8 * const e = uc + STR_LEN( noper );
1295 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1296 const U8 *scan = (U8*)NULL;
1297 U32 wordlen = 0; /* required init */
1300 if (OP(noper) == NOTHING) {
1305 TRIE_BITMAP_SET(trie,*uc);
1306 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1308 for ( ; uc < e ; uc += len ) {
1309 TRIE_CHARCOUNT(trie)++;
1313 if ( !trie->charmap[ uvc ] ) {
1314 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1316 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1321 if ( !trie->widecharmap )
1322 trie->widecharmap = newHV();
1324 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1327 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1329 if ( !SvTRUE( *svpp ) ) {
1330 sv_setiv( *svpp, ++trie->uniquecharcount );
1335 if( cur == first ) {
1338 } else if (chars < trie->minlen) {
1340 } else if (chars > trie->maxlen) {
1344 } /* end first pass */
1345 DEBUG_TRIE_COMPILE_r(
1346 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1347 (int)depth * 2 + 2,"",
1348 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1349 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1350 (int)trie->minlen, (int)trie->maxlen )
1352 Newxz( trie->wordlen, word_count, U32 );
1355 We now know what we are dealing with in terms of unique chars and
1356 string sizes so we can calculate how much memory a naive
1357 representation using a flat table will take. If it's over a reasonable
1358 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1359 conservative but potentially much slower representation using an array
1362 At the end we convert both representations into the same compressed
1363 form that will be used in regexec.c for matching with. The latter
1364 is a form that cannot be used to construct with but has memory
1365 properties similar to the list form and access properties similar
1366 to the table form making it both suitable for fast searches and
1367 small enough that its feasable to store for the duration of a program.
1369 See the comment in the code where the compressed table is produced
1370 inplace from the flat tabe representation for an explanation of how
1371 the compression works.
1376 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1378 Second Pass -- Array Of Lists Representation
1380 Each state will be represented by a list of charid:state records
1381 (reg_trie_trans_le) the first such element holds the CUR and LEN
1382 points of the allocated array. (See defines above).
1384 We build the initial structure using the lists, and then convert
1385 it into the compressed table form which allows faster lookups
1386 (but cant be modified once converted).
1389 STRLEN transcount = 1;
1391 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1395 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1397 regnode * const noper = NEXTOPER( cur );
1398 U8 *uc = (U8*)STRING( noper );
1399 const U8 * const e = uc + STR_LEN( noper );
1400 U32 state = 1; /* required init */
1401 U16 charid = 0; /* sanity init */
1402 U8 *scan = (U8*)NULL; /* sanity init */
1403 STRLEN foldlen = 0; /* required init */
1404 U32 wordlen = 0; /* required init */
1405 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1407 if (OP(noper) != NOTHING) {
1408 for ( ; uc < e ; uc += len ) {
1413 charid = trie->charmap[ uvc ];
1415 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1419 charid=(U16)SvIV( *svpp );
1422 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1429 if ( !trie->states[ state ].trans.list ) {
1430 TRIE_LIST_NEW( state );
1432 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1433 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1434 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1439 newstate = next_alloc++;
1440 TRIE_LIST_PUSH( state, charid, newstate );
1445 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1449 TRIE_HANDLE_WORD(state);
1451 } /* end second pass */
1453 trie->laststate = next_alloc;
1454 Renew( trie->states, next_alloc, reg_trie_state );
1456 /* and now dump it out before we compress it */
1457 DEBUG_TRIE_COMPILE_MORE_r(
1458 dump_trie_interim_list(trie,next_alloc,depth+1)
1461 Newxz( trie->trans, transcount ,reg_trie_trans );
1468 for( state=1 ; state < next_alloc ; state ++ ) {
1472 DEBUG_TRIE_COMPILE_MORE_r(
1473 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1477 if (trie->states[state].trans.list) {
1478 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1482 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1483 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1484 if ( forid < minid ) {
1486 } else if ( forid > maxid ) {
1490 if ( transcount < tp + maxid - minid + 1) {
1492 Renew( trie->trans, transcount, reg_trie_trans );
1493 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1495 base = trie->uniquecharcount + tp - minid;
1496 if ( maxid == minid ) {
1498 for ( ; zp < tp ; zp++ ) {
1499 if ( ! trie->trans[ zp ].next ) {
1500 base = trie->uniquecharcount + zp - minid;
1501 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1502 trie->trans[ zp ].check = state;
1508 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1509 trie->trans[ tp ].check = state;
1514 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1515 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1516 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1517 trie->trans[ tid ].check = state;
1519 tp += ( maxid - minid + 1 );
1521 Safefree(trie->states[ state ].trans.list);
1524 DEBUG_TRIE_COMPILE_MORE_r(
1525 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1528 trie->states[ state ].trans.base=base;
1530 trie->lasttrans = tp + 1;
1534 Second Pass -- Flat Table Representation.
1536 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1537 We know that we will need Charcount+1 trans at most to store the data
1538 (one row per char at worst case) So we preallocate both structures
1539 assuming worst case.
1541 We then construct the trie using only the .next slots of the entry
1544 We use the .check field of the first entry of the node temporarily to
1545 make compression both faster and easier by keeping track of how many non
1546 zero fields are in the node.
1548 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1551 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1552 number representing the first entry of the node, and state as a
1553 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1554 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1555 are 2 entrys per node. eg:
1563 The table is internally in the right hand, idx form. However as we also
1564 have to deal with the states array which is indexed by nodenum we have to
1565 use TRIE_NODENUM() to convert.
1570 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
1572 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
1573 next_alloc = trie->uniquecharcount + 1;
1576 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1578 regnode * const noper = NEXTOPER( cur );
1579 const U8 *uc = (U8*)STRING( noper );
1580 const U8 * const e = uc + STR_LEN( noper );
1582 U32 state = 1; /* required init */
1584 U16 charid = 0; /* sanity init */
1585 U32 accept_state = 0; /* sanity init */
1586 U8 *scan = (U8*)NULL; /* sanity init */
1588 STRLEN foldlen = 0; /* required init */
1589 U32 wordlen = 0; /* required init */
1590 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1592 if ( OP(noper) != NOTHING ) {
1593 for ( ; uc < e ; uc += len ) {
1598 charid = trie->charmap[ uvc ];
1600 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1601 charid = svpp ? (U16)SvIV(*svpp) : 0;
1605 if ( !trie->trans[ state + charid ].next ) {
1606 trie->trans[ state + charid ].next = next_alloc;
1607 trie->trans[ state ].check++;
1608 next_alloc += trie->uniquecharcount;
1610 state = trie->trans[ state + charid ].next;
1612 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1614 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1617 accept_state = TRIE_NODENUM( state );
1618 TRIE_HANDLE_WORD(accept_state);
1620 } /* end second pass */
1622 /* and now dump it out before we compress it */
1623 DEBUG_TRIE_COMPILE_MORE_r(
1624 dump_trie_interim_table(trie,next_alloc,depth+1)
1629 * Inplace compress the table.*
1631 For sparse data sets the table constructed by the trie algorithm will
1632 be mostly 0/FAIL transitions or to put it another way mostly empty.
1633 (Note that leaf nodes will not contain any transitions.)
1635 This algorithm compresses the tables by eliminating most such
1636 transitions, at the cost of a modest bit of extra work during lookup:
1638 - Each states[] entry contains a .base field which indicates the
1639 index in the state[] array wheres its transition data is stored.
1641 - If .base is 0 there are no valid transitions from that node.
1643 - If .base is nonzero then charid is added to it to find an entry in
1646 -If trans[states[state].base+charid].check!=state then the
1647 transition is taken to be a 0/Fail transition. Thus if there are fail
1648 transitions at the front of the node then the .base offset will point
1649 somewhere inside the previous nodes data (or maybe even into a node
1650 even earlier), but the .check field determines if the transition is
1654 The following process inplace converts the table to the compressed
1655 table: We first do not compress the root node 1,and mark its all its
1656 .check pointers as 1 and set its .base pointer as 1 as well. This
1657 allows to do a DFA construction from the compressed table later, and
1658 ensures that any .base pointers we calculate later are greater than
1661 - We set 'pos' to indicate the first entry of the second node.
1663 - We then iterate over the columns of the node, finding the first and
1664 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1665 and set the .check pointers accordingly, and advance pos
1666 appropriately and repreat for the next node. Note that when we copy
1667 the next pointers we have to convert them from the original
1668 NODEIDX form to NODENUM form as the former is not valid post
1671 - If a node has no transitions used we mark its base as 0 and do not
1672 advance the pos pointer.
1674 - If a node only has one transition we use a second pointer into the
1675 structure to fill in allocated fail transitions from other states.
1676 This pointer is independent of the main pointer and scans forward
1677 looking for null transitions that are allocated to a state. When it
1678 finds one it writes the single transition into the "hole". If the
1679 pointer doesnt find one the single transition is appended as normal.
1681 - Once compressed we can Renew/realloc the structures to release the
1684 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1685 specifically Fig 3.47 and the associated pseudocode.
1689 const U32 laststate = TRIE_NODENUM( next_alloc );
1692 trie->laststate = laststate;
1694 for ( state = 1 ; state < laststate ; state++ ) {
1696 const U32 stateidx = TRIE_NODEIDX( state );
1697 const U32 o_used = trie->trans[ stateidx ].check;
1698 U32 used = trie->trans[ stateidx ].check;
1699 trie->trans[ stateidx ].check = 0;
1701 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1702 if ( flag || trie->trans[ stateidx + charid ].next ) {
1703 if ( trie->trans[ stateidx + charid ].next ) {
1705 for ( ; zp < pos ; zp++ ) {
1706 if ( ! trie->trans[ zp ].next ) {
1710 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1711 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1712 trie->trans[ zp ].check = state;
1713 if ( ++zp > pos ) pos = zp;
1720 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1722 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1723 trie->trans[ pos ].check = state;
1728 trie->lasttrans = pos + 1;
1729 Renew( trie->states, laststate + 1, reg_trie_state);
1730 DEBUG_TRIE_COMPILE_MORE_r(
1731 PerlIO_printf( Perl_debug_log,
1732 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1733 (int)depth * 2 + 2,"",
1734 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1737 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1740 } /* end table compress */
1742 /* resize the trans array to remove unused space */
1743 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
1745 /* and now dump out the compressed format */
1746 DEBUG_TRIE_COMPILE_r(
1747 dump_trie(trie,depth+1)
1750 { /* Modify the program and insert the new TRIE node*/
1752 U8 nodetype =(U8)(flags & 0xFF);
1761 This means we convert either the first branch or the first Exact,
1762 depending on whether the thing following (in 'last') is a branch
1763 or not and whther first is the startbranch (ie is it a sub part of
1764 the alternation or is it the whole thing.)
1765 Assuming its a sub part we conver the EXACT otherwise we convert
1766 the whole branch sequence, including the first.
1768 /* Find the node we are going to overwrite */
1769 if ( first == startbranch && OP( last ) != BRANCH ) {
1770 /* whole branch chain */
1773 const regnode *nop = NEXTOPER( convert );
1774 mjd_offset= Node_Offset((nop));
1775 mjd_nodelen= Node_Length((nop));
1778 /* branch sub-chain */
1779 convert = NEXTOPER( first );
1780 NEXT_OFF( first ) = (U16)(last - first);
1782 mjd_offset= Node_Offset((convert));
1783 mjd_nodelen= Node_Length((convert));
1787 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1788 (int)depth * 2 + 2, "",
1789 (UV)mjd_offset, (UV)mjd_nodelen)
1792 /* But first we check to see if there is a common prefix we can
1793 split out as an EXACT and put in front of the TRIE node. */
1794 trie->startstate= 1;
1795 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
1798 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1799 (int)depth * 2 + 2, "",
1800 (UV)trie->laststate)
1802 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
1806 const U32 base = trie->states[ state ].trans.base;
1808 if ( trie->states[state].wordnum )
1811 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1812 if ( ( base + ofs >= trie->uniquecharcount ) &&
1813 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1814 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1816 if ( ++count > 1 ) {
1817 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
1818 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1819 if ( state == 1 ) break;
1821 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1823 PerlIO_printf(Perl_debug_log,
1824 "%*sNew Start State=%"UVuf" Class: [",
1825 (int)depth * 2 + 2, "",
1828 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1829 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1831 TRIE_BITMAP_SET(trie,*ch);
1833 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1835 PerlIO_printf(Perl_debug_log, (char*)ch)
1839 TRIE_BITMAP_SET(trie,*ch);
1841 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1842 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1848 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1849 const char *ch = SvPV_nolen_const( *tmp );
1851 PerlIO_printf( Perl_debug_log,
1852 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1853 (int)depth * 2 + 2, "",
1854 (UV)state, (UV)idx, ch)
1857 OP( convert ) = nodetype;
1858 str=STRING(convert);
1867 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1873 regnode *n = convert+NODE_SZ_STR(convert);
1874 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1875 trie->startstate = state;
1876 trie->minlen -= (state - 1);
1877 trie->maxlen -= (state - 1);
1879 regnode *fix = convert;
1881 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1882 while( ++fix < n ) {
1883 Set_Node_Offset_Length(fix, 0, 0);
1889 NEXT_OFF(convert) = (U16)(tail - convert);
1893 if ( trie->maxlen ) {
1894 NEXT_OFF( convert ) = (U16)(tail - convert);
1895 ARG_SET( convert, data_slot );
1896 /* Store the offset to the first unabsorbed branch in
1897 jump[0], which is otherwise unused by the jump logic.
1898 We use this when dumping a trie and during optimisation. */
1900 trie->jump[0] = (U16)(tail - nextbranch);
1904 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1905 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1907 OP( convert ) = TRIEC;
1908 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1909 Safefree(trie->bitmap);
1912 OP( convert ) = TRIE;
1914 /* store the type in the flags */
1915 convert->flags = nodetype;
1916 /* XXX We really should free up the resource in trie now, as we wont use them */
1918 /* needed for dumping*/
1920 regnode *optimize = convert
1922 + regarglen[ OP( convert ) ];
1923 regnode *opt = convert;
1924 while (++opt<optimize) {
1925 Set_Node_Offset_Length(opt,0,0);
1928 Try to clean up some of the debris left after the
1931 while( optimize < jumper ) {
1932 mjd_nodelen += Node_Length((optimize));
1933 OP( optimize ) = OPTIMIZED;
1934 Set_Node_Offset_Length(optimize,0,0);
1937 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
1939 } /* end node insert */
1941 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
1945 : trie->startstate>1
1951 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1953 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
1955 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1956 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1959 We find the fail state for each state in the trie, this state is the longest proper
1960 suffix of the current states 'word' that is also a proper prefix of another word in our
1961 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1962 the DFA not to have to restart after its tried and failed a word at a given point, it
1963 simply continues as though it had been matching the other word in the first place.
1965 'abcdgu'=~/abcdefg|cdgu/
1966 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1967 fail, which would bring use to the state representing 'd' in the second word where we would
1968 try 'g' and succeed, prodceding to match 'cdgu'.
1970 /* add a fail transition */
1971 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1973 const U32 ucharcount = trie->uniquecharcount;
1974 const U32 numstates = trie->laststate;
1975 const U32 ubound = trie->lasttrans + ucharcount;
1979 U32 base = trie->states[ 1 ].trans.base;
1982 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1983 GET_RE_DEBUG_FLAGS_DECL;
1985 PERL_UNUSED_ARG(depth);
1989 ARG_SET( stclass, data_slot );
1990 Newxz( aho, 1, reg_ac_data );
1991 RExC_rx->data->data[ data_slot ] = (void*)aho;
1993 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1994 (trie->laststate+1)*sizeof(reg_trie_state));
1995 Newxz( q, numstates, U32);
1996 Newxz( aho->fail, numstates, U32 );
1999 /* initialize fail[0..1] to be 1 so that we always have
2000 a valid final fail state */
2001 fail[ 0 ] = fail[ 1 ] = 1;
2003 for ( charid = 0; charid < ucharcount ; charid++ ) {
2004 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2006 q[ q_write ] = newstate;
2007 /* set to point at the root */
2008 fail[ q[ q_write++ ] ]=1;
2011 while ( q_read < q_write) {
2012 const U32 cur = q[ q_read++ % numstates ];
2013 base = trie->states[ cur ].trans.base;
2015 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2016 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2018 U32 fail_state = cur;
2021 fail_state = fail[ fail_state ];
2022 fail_base = aho->states[ fail_state ].trans.base;
2023 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2025 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2026 fail[ ch_state ] = fail_state;
2027 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2029 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2031 q[ q_write++ % numstates] = ch_state;
2035 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2036 when we fail in state 1, this allows us to use the
2037 charclass scan to find a valid start char. This is based on the principle
2038 that theres a good chance the string being searched contains lots of stuff
2039 that cant be a start char.
2041 fail[ 0 ] = fail[ 1 ] = 0;
2042 DEBUG_TRIE_COMPILE_r({
2043 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2044 for( q_read=1; q_read<numstates; q_read++ ) {
2045 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2047 PerlIO_printf(Perl_debug_log, "\n");
2050 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2055 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2056 * These need to be revisited when a newer toolchain becomes available.
2058 #if defined(__sparc64__) && defined(__GNUC__)
2059 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2060 # undef SPARC64_GCC_WORKAROUND
2061 # define SPARC64_GCC_WORKAROUND 1
2065 #define DEBUG_PEEP(str,scan,depth) \
2066 DEBUG_OPTIMISE_r({ \
2067 SV * const mysv=sv_newmortal(); \
2068 regnode *Next = regnext(scan); \
2069 regprop(RExC_rx, mysv, scan); \
2070 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2071 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2072 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2079 #define JOIN_EXACT(scan,min,flags) \
2080 if (PL_regkind[OP(scan)] == EXACT) \
2081 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2084 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2085 /* Merge several consecutive EXACTish nodes into one. */
2086 regnode *n = regnext(scan);
2088 regnode *next = scan + NODE_SZ_STR(scan);
2092 regnode *stop = scan;
2093 GET_RE_DEBUG_FLAGS_DECL;
2095 PERL_UNUSED_ARG(depth);
2097 #ifndef EXPERIMENTAL_INPLACESCAN
2098 PERL_UNUSED_ARG(flags);
2099 PERL_UNUSED_ARG(val);
2101 DEBUG_PEEP("join",scan,depth);
2103 /* Skip NOTHING, merge EXACT*. */
2105 ( PL_regkind[OP(n)] == NOTHING ||
2106 (stringok && (OP(n) == OP(scan))))
2108 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2110 if (OP(n) == TAIL || n > next)
2112 if (PL_regkind[OP(n)] == NOTHING) {
2113 DEBUG_PEEP("skip:",n,depth);
2114 NEXT_OFF(scan) += NEXT_OFF(n);
2115 next = n + NODE_STEP_REGNODE;
2122 else if (stringok) {
2123 const unsigned int oldl = STR_LEN(scan);
2124 regnode * const nnext = regnext(n);
2126 DEBUG_PEEP("merg",n,depth);
2129 if (oldl + STR_LEN(n) > U8_MAX)
2131 NEXT_OFF(scan) += NEXT_OFF(n);
2132 STR_LEN(scan) += STR_LEN(n);
2133 next = n + NODE_SZ_STR(n);
2134 /* Now we can overwrite *n : */
2135 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2143 #ifdef EXPERIMENTAL_INPLACESCAN
2144 if (flags && !NEXT_OFF(n)) {
2145 DEBUG_PEEP("atch", val, depth);
2146 if (reg_off_by_arg[OP(n)]) {
2147 ARG_SET(n, val - n);
2150 NEXT_OFF(n) = val - n;
2157 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2159 Two problematic code points in Unicode casefolding of EXACT nodes:
2161 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2162 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2168 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2169 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2171 This means that in case-insensitive matching (or "loose matching",
2172 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2173 length of the above casefolded versions) can match a target string
2174 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2175 This would rather mess up the minimum length computation.
2177 What we'll do is to look for the tail four bytes, and then peek
2178 at the preceding two bytes to see whether we need to decrease
2179 the minimum length by four (six minus two).
2181 Thanks to the design of UTF-8, there cannot be false matches:
2182 A sequence of valid UTF-8 bytes cannot be a subsequence of
2183 another valid sequence of UTF-8 bytes.
2186 char * const s0 = STRING(scan), *s, *t;
2187 char * const s1 = s0 + STR_LEN(scan) - 1;
2188 char * const s2 = s1 - 4;
2189 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2190 const char t0[] = "\xaf\x49\xaf\x42";
2192 const char t0[] = "\xcc\x88\xcc\x81";
2194 const char * const t1 = t0 + 3;
2197 s < s2 && (t = ninstr(s, s1, t0, t1));
2200 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2201 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2203 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2204 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2212 n = scan + NODE_SZ_STR(scan);
2214 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2221 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2225 /* REx optimizer. Converts nodes into quickier variants "in place".
2226 Finds fixed substrings. */
2228 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2229 to the position after last scanned or to NULL. */
2234 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2235 I32 *minlenp, I32 *deltap,
2236 regnode *last, scan_data_t *data, U32 flags, U32 depth)
2237 /* scanp: Start here (read-write). */
2238 /* deltap: Write maxlen-minlen here. */
2239 /* last: Stop before this one. */
2242 I32 min = 0, pars = 0, code;
2243 regnode *scan = *scanp, *next;
2245 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2246 int is_inf_internal = 0; /* The studied chunk is infinite */
2247 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2248 scan_data_t data_fake;
2249 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
2250 SV *re_trie_maxbuff = NULL;
2251 regnode *first_non_open = scan;
2254 GET_RE_DEBUG_FLAGS_DECL;
2256 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2259 while (first_non_open && OP(first_non_open) == OPEN)
2260 first_non_open=regnext(first_non_open);
2264 while (scan && OP(scan) != END && scan < last) {
2265 /* Peephole optimizer: */
2266 DEBUG_STUDYDATA(data,depth);
2267 DEBUG_PEEP("Peep",scan,depth);
2268 JOIN_EXACT(scan,&min,0);
2270 /* Follow the next-chain of the current node and optimize
2271 away all the NOTHINGs from it. */
2272 if (OP(scan) != CURLYX) {
2273 const int max = (reg_off_by_arg[OP(scan)]
2275 /* I32 may be smaller than U16 on CRAYs! */
2276 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2277 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2281 /* Skip NOTHING and LONGJMP. */
2282 while ((n = regnext(n))
2283 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2284 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2285 && off + noff < max)
2287 if (reg_off_by_arg[OP(scan)])
2290 NEXT_OFF(scan) = off;
2295 /* The principal pseudo-switch. Cannot be a switch, since we
2296 look into several different things. */
2297 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2298 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2299 next = regnext(scan);
2301 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2303 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
2304 /* NOTE - There is similar code to this block below for handling
2305 TRIE nodes on a re-study. If you change stuff here check there
2307 I32 max1 = 0, min1 = I32_MAX, num = 0;
2308 struct regnode_charclass_class accum;
2309 regnode * const startbranch=scan;
2311 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
2312 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2313 if (flags & SCF_DO_STCLASS)
2314 cl_init_zero(pRExC_state, &accum);
2316 while (OP(scan) == code) {
2317 I32 deltanext, minnext, f = 0, fake;
2318 struct regnode_charclass_class this_class;
2321 data_fake.flags = 0;
2323 data_fake.whilem_c = data->whilem_c;
2324 data_fake.last_closep = data->last_closep;
2327 data_fake.last_closep = &fake;
2328 next = regnext(scan);
2329 scan = NEXTOPER(scan);
2331 scan = NEXTOPER(scan);
2332 if (flags & SCF_DO_STCLASS) {
2333 cl_init(pRExC_state, &this_class);
2334 data_fake.start_class = &this_class;
2335 f = SCF_DO_STCLASS_AND;
2337 if (flags & SCF_WHILEM_VISITED_POS)
2338 f |= SCF_WHILEM_VISITED_POS;
2340 /* we suppose the run is continuous, last=next...*/
2341 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2342 next, &data_fake, f,depth+1);
2345 if (max1 < minnext + deltanext)
2346 max1 = minnext + deltanext;
2347 if (deltanext == I32_MAX)
2348 is_inf = is_inf_internal = 1;
2350 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2353 if (data_fake.flags & SF_HAS_EVAL)
2354 data->flags |= SF_HAS_EVAL;
2355 data->whilem_c = data_fake.whilem_c;
2357 if (flags & SCF_DO_STCLASS)
2358 cl_or(pRExC_state, &accum, &this_class);
2359 if (code == SUSPEND)
2362 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2364 if (flags & SCF_DO_SUBSTR) {
2365 data->pos_min += min1;
2366 data->pos_delta += max1 - min1;
2367 if (max1 != min1 || is_inf)
2368 data->longest = &(data->longest_float);
2371 delta += max1 - min1;
2372 if (flags & SCF_DO_STCLASS_OR) {
2373 cl_or(pRExC_state, data->start_class, &accum);
2375 cl_and(data->start_class, &and_with);
2376 flags &= ~SCF_DO_STCLASS;
2379 else if (flags & SCF_DO_STCLASS_AND) {
2381 cl_and(data->start_class, &accum);
2382 flags &= ~SCF_DO_STCLASS;
2385 /* Switch to OR mode: cache the old value of
2386 * data->start_class */
2387 StructCopy(data->start_class, &and_with,
2388 struct regnode_charclass_class);
2389 flags &= ~SCF_DO_STCLASS_AND;
2390 StructCopy(&accum, data->start_class,
2391 struct regnode_charclass_class);
2392 flags |= SCF_DO_STCLASS_OR;
2393 data->start_class->flags |= ANYOF_EOS;
2397 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2400 Assuming this was/is a branch we are dealing with: 'scan' now
2401 points at the item that follows the branch sequence, whatever
2402 it is. We now start at the beginning of the sequence and look
2409 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2411 If we can find such a subseqence we need to turn the first
2412 element into a trie and then add the subsequent branch exact
2413 strings to the trie.
2417 1. patterns where the whole set of branch can be converted.
2419 2. patterns where only a subset can be converted.
2421 In case 1 we can replace the whole set with a single regop
2422 for the trie. In case 2 we need to keep the start and end
2425 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2426 becomes BRANCH TRIE; BRANCH X;
2428 There is an additional case, that being where there is a
2429 common prefix, which gets split out into an EXACT like node
2430 preceding the TRIE node.
2432 If x(1..n)==tail then we can do a simple trie, if not we make
2433 a "jump" trie, such that when we match the appropriate word
2434 we "jump" to the appopriate tail node. Essentailly we turn
2435 a nested if into a case structure of sorts.
2440 if (!re_trie_maxbuff) {
2441 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2442 if (!SvIOK(re_trie_maxbuff))
2443 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2445 if ( SvIV(re_trie_maxbuff)>=0 ) {
2447 regnode *first = (regnode *)NULL;
2448 regnode *last = (regnode *)NULL;
2449 regnode *tail = scan;
2454 SV * const mysv = sv_newmortal(); /* for dumping */
2456 /* var tail is used because there may be a TAIL
2457 regop in the way. Ie, the exacts will point to the
2458 thing following the TAIL, but the last branch will
2459 point at the TAIL. So we advance tail. If we
2460 have nested (?:) we may have to move through several
2464 while ( OP( tail ) == TAIL ) {
2465 /* this is the TAIL generated by (?:) */
2466 tail = regnext( tail );
2471 regprop(RExC_rx, mysv, tail );
2472 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2473 (int)depth * 2 + 2, "",
2474 "Looking for TRIE'able sequences. Tail node is: ",
2475 SvPV_nolen_const( mysv )
2481 step through the branches, cur represents each
2482 branch, noper is the first thing to be matched
2483 as part of that branch and noper_next is the
2484 regnext() of that node. if noper is an EXACT
2485 and noper_next is the same as scan (our current
2486 position in the regex) then the EXACT branch is
2487 a possible optimization target. Once we have
2488 two or more consequetive such branches we can
2489 create a trie of the EXACT's contents and stich
2490 it in place. If the sequence represents all of
2491 the branches we eliminate the whole thing and
2492 replace it with a single TRIE. If it is a
2493 subsequence then we need to stitch it in. This
2494 means the first branch has to remain, and needs
2495 to be repointed at the item on the branch chain
2496 following the last branch optimized. This could
2497 be either a BRANCH, in which case the
2498 subsequence is internal, or it could be the
2499 item following the branch sequence in which
2500 case the subsequence is at the end.
2504 /* dont use tail as the end marker for this traverse */
2505 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2506 regnode * const noper = NEXTOPER( cur );
2507 regnode * const noper_next = regnext( noper );
2510 regprop(RExC_rx, mysv, cur);
2511 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2512 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2514 regprop(RExC_rx, mysv, noper);
2515 PerlIO_printf( Perl_debug_log, " -> %s",
2516 SvPV_nolen_const(mysv));
2519 regprop(RExC_rx, mysv, noper_next );
2520 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2521 SvPV_nolen_const(mysv));
2523 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2524 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2526 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2527 : PL_regkind[ OP( noper ) ] == EXACT )
2528 || OP(noper) == NOTHING )
2530 && noper_next == tail
2535 if ( !first || optype == NOTHING ) {
2536 if (!first) first = cur;
2537 optype = OP( noper );
2543 make_trie( pRExC_state,
2544 startbranch, first, cur, tail, count,
2547 if ( PL_regkind[ OP( noper ) ] == EXACT
2549 && noper_next == tail
2554 optype = OP( noper );
2564 regprop(RExC_rx, mysv, cur);
2565 PerlIO_printf( Perl_debug_log,
2566 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2567 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2571 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2572 #ifdef TRIE_STUDY_OPT
2573 if ( ((made == MADE_EXACT_TRIE &&
2574 startbranch == first)
2575 || ( first_non_open == first )) &&
2577 flags |= SCF_TRIE_RESTUDY;
2585 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2586 scan = NEXTOPER(NEXTOPER(scan));
2587 } else /* single branch is optimized. */
2588 scan = NEXTOPER(scan);
2591 else if (OP(scan) == EXACT) {
2592 I32 l = STR_LEN(scan);
2595 const U8 * const s = (U8*)STRING(scan);
2596 l = utf8_length(s, s + l);
2597 uc = utf8_to_uvchr(s, NULL);
2599 uc = *((U8*)STRING(scan));
2602 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2603 /* The code below prefers earlier match for fixed
2604 offset, later match for variable offset. */
2605 if (data->last_end == -1) { /* Update the start info. */
2606 data->last_start_min = data->pos_min;
2607 data->last_start_max = is_inf
2608 ? I32_MAX : data->pos_min + data->pos_delta;
2610 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2612 SvUTF8_on(data->last_found);
2614 SV * const sv = data->last_found;
2615 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2616 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2617 if (mg && mg->mg_len >= 0)
2618 mg->mg_len += utf8_length((U8*)STRING(scan),
2619 (U8*)STRING(scan)+STR_LEN(scan));
2621 data->last_end = data->pos_min + l;
2622 data->pos_min += l; /* As in the first entry. */
2623 data->flags &= ~SF_BEFORE_EOL;
2625 if (flags & SCF_DO_STCLASS_AND) {
2626 /* Check whether it is compatible with what we know already! */
2630 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2631 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2632 && (!(data->start_class->flags & ANYOF_FOLD)
2633 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2636 ANYOF_CLASS_ZERO(data->start_class);
2637 ANYOF_BITMAP_ZERO(data->start_class);
2639 ANYOF_BITMAP_SET(data->start_class, uc);
2640 data->start_class->flags &= ~ANYOF_EOS;
2642 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2644 else if (flags & SCF_DO_STCLASS_OR) {
2645 /* false positive possible if the class is case-folded */
2647 ANYOF_BITMAP_SET(data->start_class, uc);
2649 data->start_class->flags |= ANYOF_UNICODE_ALL;
2650 data->start_class->flags &= ~ANYOF_EOS;
2651 cl_and(data->start_class, &and_with);
2653 flags &= ~SCF_DO_STCLASS;
2655 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2656 I32 l = STR_LEN(scan);
2657 UV uc = *((U8*)STRING(scan));
2659 /* Search for fixed substrings supports EXACT only. */
2660 if (flags & SCF_DO_SUBSTR) {
2662 scan_commit(pRExC_state, data, minlenp);
2665 const U8 * const s = (U8 *)STRING(scan);
2666 l = utf8_length(s, s + l);
2667 uc = utf8_to_uvchr(s, NULL);
2670 if (flags & SCF_DO_SUBSTR)
2672 if (flags & SCF_DO_STCLASS_AND) {
2673 /* Check whether it is compatible with what we know already! */
2677 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2678 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2679 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2681 ANYOF_CLASS_ZERO(data->start_class);
2682 ANYOF_BITMAP_ZERO(data->start_class);
2684 ANYOF_BITMAP_SET(data->start_class, uc);
2685 data->start_class->flags &= ~ANYOF_EOS;
2686 data->start_class->flags |= ANYOF_FOLD;
2687 if (OP(scan) == EXACTFL)
2688 data->start_class->flags |= ANYOF_LOCALE;
2691 else if (flags & SCF_DO_STCLASS_OR) {
2692 if (data->start_class->flags & ANYOF_FOLD) {
2693 /* false positive possible if the class is case-folded.
2694 Assume that the locale settings are the same... */
2696 ANYOF_BITMAP_SET(data->start_class, uc);
2697 data->start_class->flags &= ~ANYOF_EOS;
2699 cl_and(data->start_class, &and_with);
2701 flags &= ~SCF_DO_STCLASS;
2703 else if (strchr((const char*)PL_varies,OP(scan))) {
2704 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2705 I32 f = flags, pos_before = 0;
2706 regnode * const oscan = scan;
2707 struct regnode_charclass_class this_class;
2708 struct regnode_charclass_class *oclass = NULL;
2709 I32 next_is_eval = 0;
2711 switch (PL_regkind[OP(scan)]) {
2712 case WHILEM: /* End of (?:...)* . */
2713 scan = NEXTOPER(scan);
2716 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2717 next = NEXTOPER(scan);
2718 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2720 maxcount = REG_INFTY;
2721 next = regnext(scan);
2722 scan = NEXTOPER(scan);
2726 if (flags & SCF_DO_SUBSTR)
2731 if (flags & SCF_DO_STCLASS) {
2733 maxcount = REG_INFTY;
2734 next = regnext(scan);
2735 scan = NEXTOPER(scan);
2738 is_inf = is_inf_internal = 1;
2739 scan = regnext(scan);
2740 if (flags & SCF_DO_SUBSTR) {
2741 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2742 data->longest = &(data->longest_float);
2744 goto optimize_curly_tail;
2746 mincount = ARG1(scan);
2747 maxcount = ARG2(scan);
2748 next = regnext(scan);
2749 if (OP(scan) == CURLYX) {
2750 I32 lp = (data ? *(data->last_closep) : 0);
2751 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2753 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2754 next_is_eval = (OP(scan) == EVAL);
2756 if (flags & SCF_DO_SUBSTR) {
2757 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2758 pos_before = data->pos_min;
2762 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2764 data->flags |= SF_IS_INF;
2766 if (flags & SCF_DO_STCLASS) {
2767 cl_init(pRExC_state, &this_class);
2768 oclass = data->start_class;
2769 data->start_class = &this_class;
2770 f |= SCF_DO_STCLASS_AND;
2771 f &= ~SCF_DO_STCLASS_OR;
2773 /* These are the cases when once a subexpression
2774 fails at a particular position, it cannot succeed
2775 even after backtracking at the enclosing scope.
2777 XXXX what if minimal match and we are at the
2778 initial run of {n,m}? */
2779 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2780 f &= ~SCF_WHILEM_VISITED_POS;
2782 /* This will finish on WHILEM, setting scan, or on NULL: */
2783 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
2785 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
2787 if (flags & SCF_DO_STCLASS)
2788 data->start_class = oclass;
2789 if (mincount == 0 || minnext == 0) {
2790 if (flags & SCF_DO_STCLASS_OR) {
2791 cl_or(pRExC_state, data->start_class, &this_class);
2793 else if (flags & SCF_DO_STCLASS_AND) {
2794 /* Switch to OR mode: cache the old value of
2795 * data->start_class */
2796 StructCopy(data->start_class, &and_with,
2797 struct regnode_charclass_class);
2798 flags &= ~SCF_DO_STCLASS_AND;
2799 StructCopy(&this_class, data->start_class,
2800 struct regnode_charclass_class);
2801 flags |= SCF_DO_STCLASS_OR;
2802 data->start_class->flags |= ANYOF_EOS;
2804 } else { /* Non-zero len */
2805 if (flags & SCF_DO_STCLASS_OR) {
2806 cl_or(pRExC_state, data->start_class, &this_class);
2807 cl_and(data->start_class, &and_with);
2809 else if (flags & SCF_DO_STCLASS_AND)
2810 cl_and(data->start_class, &this_class);
2811 flags &= ~SCF_DO_STCLASS;
2813 if (!scan) /* It was not CURLYX, but CURLY. */
2815 if ( /* ? quantifier ok, except for (?{ ... }) */
2816 (next_is_eval || !(mincount == 0 && maxcount == 1))
2817 && (minnext == 0) && (deltanext == 0)
2818 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2819 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2820 && ckWARN(WARN_REGEXP))
2823 "Quantifier unexpected on zero-length expression");
2826 min += minnext * mincount;
2827 is_inf_internal |= ((maxcount == REG_INFTY
2828 && (minnext + deltanext) > 0)
2829 || deltanext == I32_MAX);
2830 is_inf |= is_inf_internal;
2831 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2833 /* Try powerful optimization CURLYX => CURLYN. */
2834 if ( OP(oscan) == CURLYX && data
2835 && data->flags & SF_IN_PAR
2836 && !(data->flags & SF_HAS_EVAL)
2837 && !deltanext && minnext == 1 ) {
2838 /* Try to optimize to CURLYN. */
2839 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
2840 regnode * const nxt1 = nxt;
2847 if (!strchr((const char*)PL_simple,OP(nxt))
2848 && !(PL_regkind[OP(nxt)] == EXACT
2849 && STR_LEN(nxt) == 1))
2855 if (OP(nxt) != CLOSE)
2857 /* Now we know that nxt2 is the only contents: */
2858 oscan->flags = (U8)ARG(nxt);
2860 OP(nxt1) = NOTHING; /* was OPEN. */
2862 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2863 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2864 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2865 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2866 OP(nxt + 1) = OPTIMIZED; /* was count. */
2867 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
2872 /* Try optimization CURLYX => CURLYM. */
2873 if ( OP(oscan) == CURLYX && data
2874 && !(data->flags & SF_HAS_PAR)
2875 && !(data->flags & SF_HAS_EVAL)
2876 && !deltanext /* atom is fixed width */
2877 && minnext != 0 /* CURLYM can't handle zero width */
2879 /* XXXX How to optimize if data == 0? */
2880 /* Optimize to a simpler form. */
2881 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2885 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2886 && (OP(nxt2) != WHILEM))
2888 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2889 /* Need to optimize away parenths. */
2890 if (data->flags & SF_IN_PAR) {
2891 /* Set the parenth number. */
2892 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2894 if (OP(nxt) != CLOSE)
2895 FAIL("Panic opt close");
2896 oscan->flags = (U8)ARG(nxt);
2897 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2898 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2900 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2901 OP(nxt + 1) = OPTIMIZED; /* was count. */
2902 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2903 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
2906 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2907 regnode *nnxt = regnext(nxt1);
2910 if (reg_off_by_arg[OP(nxt1)])
2911 ARG_SET(nxt1, nxt2 - nxt1);
2912 else if (nxt2 - nxt1 < U16_MAX)
2913 NEXT_OFF(nxt1) = nxt2 - nxt1;
2915 OP(nxt) = NOTHING; /* Cannot beautify */
2920 /* Optimize again: */
2921 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2927 else if ((OP(oscan) == CURLYX)
2928 && (flags & SCF_WHILEM_VISITED_POS)
2929 /* See the comment on a similar expression above.
2930 However, this time it not a subexpression
2931 we care about, but the expression itself. */
2932 && (maxcount == REG_INFTY)
2933 && data && ++data->whilem_c < 16) {
2934 /* This stays as CURLYX, we can put the count/of pair. */
2935 /* Find WHILEM (as in regexec.c) */
2936 regnode *nxt = oscan + NEXT_OFF(oscan);
2938 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2940 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2941 | (RExC_whilem_seen << 4)); /* On WHILEM */
2943 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2945 if (flags & SCF_DO_SUBSTR) {
2946 SV *last_str = NULL;
2947 int counted = mincount != 0;
2949 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
2950 #if defined(SPARC64_GCC_WORKAROUND)
2953 const char *s = NULL;
2956 if (pos_before >= data->last_start_min)
2959 b = data->last_start_min;
2962 s = SvPV_const(data->last_found, l);
2963 old = b - data->last_start_min;
2966 I32 b = pos_before >= data->last_start_min
2967 ? pos_before : data->last_start_min;
2969 const char * const s = SvPV_const(data->last_found, l);
2970 I32 old = b - data->last_start_min;
2974 old = utf8_hop((U8*)s, old) - (U8*)s;
2977 /* Get the added string: */
2978 last_str = newSVpvn(s + old, l);
2980 SvUTF8_on(last_str);
2981 if (deltanext == 0 && pos_before == b) {
2982 /* What was added is a constant string */
2984 SvGROW(last_str, (mincount * l) + 1);
2985 repeatcpy(SvPVX(last_str) + l,
2986 SvPVX_const(last_str), l, mincount - 1);
2987 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2988 /* Add additional parts. */
2989 SvCUR_set(data->last_found,
2990 SvCUR(data->last_found) - l);
2991 sv_catsv(data->last_found, last_str);
2993 SV * sv = data->last_found;
2995 SvUTF8(sv) && SvMAGICAL(sv) ?
2996 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2997 if (mg && mg->mg_len >= 0)
2998 mg->mg_len += CHR_SVLEN(last_str);
3000 data->last_end += l * (mincount - 1);
3003 /* start offset must point into the last copy */
3004 data->last_start_min += minnext * (mincount - 1);
3005 data->last_start_max += is_inf ? I32_MAX
3006 : (maxcount - 1) * (minnext + data->pos_delta);
3009 /* It is counted once already... */
3010 data->pos_min += minnext * (mincount - counted);
3011 data->pos_delta += - counted * deltanext +
3012 (minnext + deltanext) * maxcount - minnext * mincount;
3013 if (mincount != maxcount) {
3014 /* Cannot extend fixed substrings found inside
3016 scan_commit(pRExC_state,data,minlenp);
3017 if (mincount && last_str) {
3018 SV * const sv = data->last_found;
3019 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3020 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3024 sv_setsv(sv, last_str);
3025 data->last_end = data->pos_min;
3026 data->last_start_min =
3027 data->pos_min - CHR_SVLEN(last_str);
3028 data->last_start_max = is_inf
3030 : data->pos_min + data->pos_delta
3031 - CHR_SVLEN(last_str);
3033 data->longest = &(data->longest_float);
3035 SvREFCNT_dec(last_str);
3037 if (data && (fl & SF_HAS_EVAL))
3038 data->flags |= SF_HAS_EVAL;
3039 optimize_curly_tail:
3040 if (OP(oscan) != CURLYX) {
3041 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3043 NEXT_OFF(oscan) += NEXT_OFF(next);
3046 default: /* REF and CLUMP only? */
3047 if (flags & SCF_DO_SUBSTR) {
3048 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3049 data->longest = &(data->longest_float);
3051 is_inf = is_inf_internal = 1;
3052 if (flags & SCF_DO_STCLASS_OR)
3053 cl_anything(pRExC_state, data->start_class);
3054 flags &= ~SCF_DO_STCLASS;
3058 else if (strchr((const char*)PL_simple,OP(scan))) {
3061 if (flags & SCF_DO_SUBSTR) {
3062 scan_commit(pRExC_state,data,minlenp);
3066 if (flags & SCF_DO_STCLASS) {
3067 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3069 /* Some of the logic below assumes that switching
3070 locale on will only add false positives. */
3071 switch (PL_regkind[OP(scan)]) {
3075 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3076 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3077 cl_anything(pRExC_state, data->start_class);
3080 if (OP(scan) == SANY)
3082 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3083 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3084 || (data->start_class->flags & ANYOF_CLASS));
3085 cl_anything(pRExC_state, data->start_class);
3087 if (flags & SCF_DO_STCLASS_AND || !value)
3088 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3091 if (flags & SCF_DO_STCLASS_AND)
3092 cl_and(data->start_class,
3093 (struct regnode_charclass_class*)scan);
3095 cl_or(pRExC_state, data->start_class,
3096 (struct regnode_charclass_class*)scan);
3099 if (flags & SCF_DO_STCLASS_AND) {
3100 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3101 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3102 for (value = 0; value < 256; value++)
3103 if (!isALNUM(value))
3104 ANYOF_BITMAP_CLEAR(data->start_class, value);
3108 if (data->start_class->flags & ANYOF_LOCALE)
3109 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3111 for (value = 0; value < 256; value++)
3113 ANYOF_BITMAP_SET(data->start_class, value);
3118 if (flags & SCF_DO_STCLASS_AND) {
3119 if (data->start_class->flags & ANYOF_LOCALE)
3120 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3123 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3124 data->start_class->flags |= ANYOF_LOCALE;
3128 if (flags & SCF_DO_STCLASS_AND) {
3129 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3130 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3131 for (value = 0; value < 256; value++)
3133 ANYOF_BITMAP_CLEAR(data->start_class, value);
3137 if (data->start_class->flags & ANYOF_LOCALE)
3138 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3140 for (value = 0; value < 256; value++)
3141 if (!isALNUM(value))
3142 ANYOF_BITMAP_SET(data->start_class, value);
3147 if (flags & SCF_DO_STCLASS_AND) {
3148 if (data->start_class->flags & ANYOF_LOCALE)
3149 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3152 data->start_class->flags |= ANYOF_LOCALE;
3153 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3157 if (flags & SCF_DO_STCLASS_AND) {
3158 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3159 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3160 for (value = 0; value < 256; value++)
3161 if (!isSPACE(value))
3162 ANYOF_BITMAP_CLEAR(data->start_class, value);
3166 if (data->start_class->flags & ANYOF_LOCALE)
3167 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3169 for (value = 0; value < 256; value++)
3171 ANYOF_BITMAP_SET(data->start_class, value);
3176 if (flags & SCF_DO_STCLASS_AND) {
3177 if (data->start_class->flags & ANYOF_LOCALE)
3178 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3181 data->start_class->flags |= ANYOF_LOCALE;
3182 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3186 if (flags & SCF_DO_STCLASS_AND) {
3187 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3188 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3189 for (value = 0; value < 256; value++)
3191 ANYOF_BITMAP_CLEAR(data->start_class, value);
3195 if (data->start_class->flags & ANYOF_LOCALE)
3196 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3198 for (value = 0; value < 256; value++)
3199 if (!isSPACE(value))
3200 ANYOF_BITMAP_SET(data->start_class, value);
3205 if (flags & SCF_DO_STCLASS_AND) {
3206 if (data->start_class->flags & ANYOF_LOCALE) {
3207 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3208 for (value = 0; value < 256; value++)
3209 if (!isSPACE(value))
3210 ANYOF_BITMAP_CLEAR(data->start_class, value);
3214 data->start_class->flags |= ANYOF_LOCALE;
3215 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3219 if (flags & SCF_DO_STCLASS_AND) {
3220 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3221 for (value = 0; value < 256; value++)
3222 if (!isDIGIT(value))
3223 ANYOF_BITMAP_CLEAR(data->start_class, value);
3226 if (data->start_class->flags & ANYOF_LOCALE)
3227 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3229 for (value = 0; value < 256; value++)
3231 ANYOF_BITMAP_SET(data->start_class, value);
3236 if (flags & SCF_DO_STCLASS_AND) {
3237 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3238 for (value = 0; value < 256; value++)
3240 ANYOF_BITMAP_CLEAR(data->start_class, value);
3243 if (data->start_class->flags & ANYOF_LOCALE)
3244 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3246 for (value = 0; value < 256; value++)
3247 if (!isDIGIT(value))
3248 ANYOF_BITMAP_SET(data->start_class, value);
3253 if (flags & SCF_DO_STCLASS_OR)
3254 cl_and(data->start_class, &and_with);
3255 flags &= ~SCF_DO_STCLASS;
3258 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3259 data->flags |= (OP(scan) == MEOL
3263 else if ( PL_regkind[OP(scan)] == BRANCHJ
3264 /* Lookbehind, or need to calculate parens/evals/stclass: */
3265 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3266 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3267 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3268 || OP(scan) == UNLESSM )
3270 /* Negative Lookahead/lookbehind
3271 In this case we can't do fixed string optimisation.
3274 I32 deltanext, minnext, fake = 0;
3276 struct regnode_charclass_class intrnl;
3279 data_fake.flags = 0;
3281 data_fake.whilem_c = data->whilem_c;
3282 data_fake.last_closep = data->last_closep;
3285 data_fake.last_closep = &fake;
3286 if ( flags & SCF_DO_STCLASS && !scan->flags
3287 && OP(scan) == IFMATCH ) { /* Lookahead */
3288 cl_init(pRExC_state, &intrnl);
3289 data_fake.start_class = &intrnl;
3290 f |= SCF_DO_STCLASS_AND;
3292 if (flags & SCF_WHILEM_VISITED_POS)
3293 f |= SCF_WHILEM_VISITED_POS;
3294 next = regnext(scan);
3295 nscan = NEXTOPER(NEXTOPER(scan));
3296 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3299 vFAIL("Variable length lookbehind not implemented");
3301 else if (minnext > (I32)U8_MAX) {
3302 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3304 scan->flags = (U8)minnext;
3307 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3309 if (data_fake.flags & SF_HAS_EVAL)
3310 data->flags |= SF_HAS_EVAL;
3311 data->whilem_c = data_fake.whilem_c;
3313 if (f & SCF_DO_STCLASS_AND) {
3314 const int was = (data->start_class->flags & ANYOF_EOS);
3316 cl_and(data->start_class, &intrnl);
3318 data->start_class->flags |= ANYOF_EOS;
3321 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3323 /* Positive Lookahead/lookbehind
3324 In this case we can do fixed string optimisation,
3325 but we must be careful about it. Note in the case of
3326 lookbehind the positions will be offset by the minimum
3327 length of the pattern, something we won't know about
3328 until after the recurse.
3330 I32 deltanext, fake = 0;
3332 struct regnode_charclass_class intrnl;
3334 /* We use SAVEFREEPV so that when the full compile
3335 is finished perl will clean up the allocated
3336 minlens when its all done. This was we don't
3337 have to worry about freeing them when we know
3338 they wont be used, which would be a pain.
3341 Newx( minnextp, 1, I32 );
3342 SAVEFREEPV(minnextp);
3345 StructCopy(data, &data_fake, scan_data_t);
3346 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3349 scan_commit(pRExC_state, &data_fake,minlenp);
3350 data_fake.last_found=newSVsv(data->last_found);
3354 data_fake.last_closep = &fake;
3355 data_fake.flags = 0;
3357 data_fake.flags |= SF_IS_INF;
3358 if ( flags & SCF_DO_STCLASS && !scan->flags
3359 && OP(scan) == IFMATCH ) { /* Lookahead */
3360 cl_init(pRExC_state, &intrnl);
3361 data_fake.start_class = &intrnl;
3362 f |= SCF_DO_STCLASS_AND;
3364 if (flags & SCF_WHILEM_VISITED_POS)
3365 f |= SCF_WHILEM_VISITED_POS;
3366 next = regnext(scan);
3367 nscan = NEXTOPER(NEXTOPER(scan));
3369 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3372 vFAIL("Variable length lookbehind not implemented");
3374 else if (*minnextp > (I32)U8_MAX) {
3375 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3377 scan->flags = (U8)*minnextp;
3383 if (f & SCF_DO_STCLASS_AND) {
3384 const int was = (data->start_class->flags & ANYOF_EOS);
3386 cl_and(data->start_class, &intrnl);
3388 data->start_class->flags |= ANYOF_EOS;
3391 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3393 if (data_fake.flags & SF_HAS_EVAL)
3394 data->flags |= SF_HAS_EVAL;
3395 data->whilem_c = data_fake.whilem_c;
3396 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3397 if (RExC_rx->minlen<*minnextp)
3398 RExC_rx->minlen=*minnextp;
3399 scan_commit(pRExC_state, &data_fake, minnextp);
3400 SvREFCNT_dec(data_fake.last_found);
3402 if ( data_fake.minlen_fixed != minlenp )
3404 data->offset_fixed= data_fake.offset_fixed;
3405 data->minlen_fixed= data_fake.minlen_fixed;
3406 data->lookbehind_fixed+= scan->flags;
3408 if ( data_fake.minlen_float != minlenp )
3410 data->minlen_float= data_fake.minlen_float;
3411 data->offset_float_min=data_fake.offset_float_min;
3412 data->offset_float_max=data_fake.offset_float_max;
3413 data->lookbehind_float+= scan->flags;
3422 else if (OP(scan) == OPEN) {
3425 else if (OP(scan) == CLOSE) {
3426 if ((I32)ARG(scan) == is_par) {
3427 next = regnext(scan);
3429 if ( next && (OP(next) != WHILEM) && next < last)
3430 is_par = 0; /* Disable optimization */
3433 *(data->last_closep) = ARG(scan);
3435 else if (OP(scan) == EVAL) {
3437 data->flags |= SF_HAS_EVAL;
3439 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
3440 if (flags & SCF_DO_SUBSTR) {
3441 scan_commit(pRExC_state,data,minlenp);
3442 data->longest = &(data->longest_float);
3444 is_inf = is_inf_internal = 1;
3445 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3446 cl_anything(pRExC_state, data->start_class);
3447 flags &= ~SCF_DO_STCLASS;
3449 #ifdef TRIE_STUDY_OPT
3450 #ifdef FULL_TRIE_STUDY
3451 else if (PL_regkind[OP(scan)] == TRIE) {
3452 /* NOTE - There is similar code to this block above for handling
3453 BRANCH nodes on the initial study. If you change stuff here
3455 regnode *tail= regnext(scan);
3456 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3457 I32 max1 = 0, min1 = I32_MAX;
3458 struct regnode_charclass_class accum;
3460 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3461 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3462 if (flags & SCF_DO_STCLASS)
3463 cl_init_zero(pRExC_state, &accum);
3469 const regnode *nextbranch= NULL;
3472 for ( word=1 ; word <= trie->wordcount ; word++)
3474 I32 deltanext=0, minnext=0, f = 0, fake;
3475 struct regnode_charclass_class this_class;
3477 data_fake.flags = 0;
3479 data_fake.whilem_c = data->whilem_c;
3480 data_fake.last_closep = data->last_closep;
3483 data_fake.last_closep = &fake;
3485 if (flags & SCF_DO_STCLASS) {
3486 cl_init(pRExC_state, &this_class);
3487 data_fake.start_class = &this_class;
3488 f = SCF_DO_STCLASS_AND;
3490 if (flags & SCF_WHILEM_VISITED_POS)
3491 f |= SCF_WHILEM_VISITED_POS;
3493 if (trie->jump[word]) {
3495 nextbranch = tail - trie->jump[0];
3496 scan= tail - trie->jump[word];
3497 /* We go from the jump point to the branch that follows
3498 it. Note this means we need the vestigal unused branches
3499 even though they arent otherwise used.
3501 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3502 (regnode *)nextbranch, &data_fake, f,depth+1);
3504 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3505 nextbranch= regnext((regnode*)nextbranch);
3507 if (min1 > (I32)(minnext + trie->minlen))
3508 min1 = minnext + trie->minlen;
3509 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3510 max1 = minnext + deltanext + trie->maxlen;
3511 if (deltanext == I32_MAX)
3512 is_inf = is_inf_internal = 1;
3514 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3518 if (data_fake.flags & SF_HAS_EVAL)
3519 data->flags |= SF_HAS_EVAL;
3520 data->whilem_c = data_fake.whilem_c;
3522 if (flags & SCF_DO_STCLASS)
3523 cl_or(pRExC_state, &accum, &this_class);
3526 if (flags & SCF_DO_SUBSTR) {
3527 data->pos_min += min1;
3528 data->pos_delta += max1 - min1;
3529 if (max1 != min1 || is_inf)
3530 data->longest = &(data->longest_float);
3533 delta += max1 - min1;
3534 if (flags & SCF_DO_STCLASS_OR) {
3535 cl_or(pRExC_state, data->start_class, &accum);
3537 cl_and(data->start_class, &and_with);
3538 flags &= ~SCF_DO_STCLASS;
3541 else if (flags & SCF_DO_STCLASS_AND) {
3543 cl_and(data->start_class, &accum);
3544 flags &= ~SCF_DO_STCLASS;
3547 /* Switch to OR mode: cache the old value of
3548 * data->start_class */
3549 StructCopy(data->start_class, &and_with,
3550 struct regnode_charclass_class);
3551 flags &= ~SCF_DO_STCLASS_AND;
3552 StructCopy(&accum, data->start_class,
3553 struct regnode_charclass_class);
3554 flags |= SCF_DO_STCLASS_OR;
3555 data->start_class->flags |= ANYOF_EOS;
3562 else if (PL_regkind[OP(scan)] == TRIE) {
3563 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3566 min += trie->minlen;
3567 delta += (trie->maxlen - trie->minlen);
3568 flags &= ~SCF_DO_STCLASS; /* xxx */
3569 if (flags & SCF_DO_SUBSTR) {
3570 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
3571 data->pos_min += trie->minlen;
3572 data->pos_delta += (trie->maxlen - trie->minlen);
3573 if (trie->maxlen != trie->minlen)
3574 data->longest = &(data->longest_float);
3576 if (trie->jump) /* no more substrings -- for now /grr*/
3577 flags &= ~SCF_DO_SUBSTR;
3579 #endif /* old or new */
3580 #endif /* TRIE_STUDY_OPT */
3581 /* Else: zero-length, ignore. */
3582 scan = regnext(scan);
3587 *deltap = is_inf_internal ? I32_MAX : delta;
3588 if (flags & SCF_DO_SUBSTR && is_inf)
3589 data->pos_delta = I32_MAX - data->pos_min;
3590 if (is_par > (I32)U8_MAX)
3592 if (is_par && pars==1 && data) {
3593 data->flags |= SF_IN_PAR;
3594 data->flags &= ~SF_HAS_PAR;
3596 else if (pars && data) {
3597 data->flags |= SF_HAS_PAR;
3598 data->flags &= ~SF_IN_PAR;
3600 if (flags & SCF_DO_STCLASS_OR)
3601 cl_and(data->start_class, &and_with);
3602 if (flags & SCF_TRIE_RESTUDY)
3603 data->flags |= SCF_TRIE_RESTUDY;
3605 DEBUG_STUDYDATA(data,depth);
3611 S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
3613 if (RExC_rx->data) {
3614 Renewc(RExC_rx->data,
3615 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
3616 char, struct reg_data);
3617 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3618 RExC_rx->data->count += n;
3621 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
3622 char, struct reg_data);
3623 Newx(RExC_rx->data->what, n, U8);
3624 RExC_rx->data->count = n;
3626 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3627 return RExC_rx->data->count - n;
3630 #ifndef PERL_IN_XSUB_RE
3632 Perl_reginitcolors(pTHX)
3635 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3637 char *t = savepv(s);
3641 t = strchr(t, '\t');
3647 PL_colors[i] = t = (char *)"";
3652 PL_colors[i++] = (char *)"";
3659 #ifdef TRIE_STUDY_OPT
3660 #define CHECK_RESTUDY_GOTO \
3662 (data.flags & SCF_TRIE_RESTUDY) \
3666 #define CHECK_RESTUDY_GOTO
3669 - pregcomp - compile a regular expression into internal code
3671 * We can't allocate space until we know how big the compiled form will be,
3672 * but we can't compile it (and thus know how big it is) until we've got a
3673 * place to put the code. So we cheat: we compile it twice, once with code
3674 * generation turned off and size counting turned on, and once "for real".
3675 * This also means that we don't allocate space until we are sure that the
3676 * thing really will compile successfully, and we never have to move the
3677 * code and thus invalidate pointers into it. (Note that it has to be in
3678 * one piece because free() must be able to free it all.) [NB: not true in perl]
3680 * Beware that the optimization-preparation code in here knows about some
3681 * of the structure of the compiled regexp. [I'll say.]
3684 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3695 RExC_state_t RExC_state;
3696 RExC_state_t * const pRExC_state = &RExC_state;
3697 #ifdef TRIE_STUDY_OPT
3699 RExC_state_t copyRExC_state;
3702 GET_RE_DEBUG_FLAGS_DECL;
3705 FAIL("NULL regexp argument");
3707 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
3710 DEBUG_r(if (!PL_colorset) reginitcolors());
3712 SV *dsv= sv_newmortal();
3713 RE_PV_QUOTED_DECL(s, RExC_utf8,
3714 dsv, RExC_precomp, (xend - exp), 60);
3715 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3716 PL_colors[4],PL_colors[5],s);
3718 RExC_flags = pm->op_pmflags;
3722 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3723 RExC_seen_evals = 0;
3726 /* First pass: determine size, legality. */
3733 RExC_emit = &PL_regdummy;
3734 RExC_whilem_seen = 0;
3735 #if 0 /* REGC() is (currently) a NOP at the first pass.
3736 * Clever compilers notice this and complain. --jhi */
3737 REGC((U8)REG_MAGIC, (char*)RExC_emit);
3739 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3740 if (reg(pRExC_state, 0, &flags,1) == NULL) {
3741 RExC_precomp = NULL;
3744 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3745 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3746 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
3749 RExC_lastparse=NULL;
3753 /* Small enough for pointer-storage convention?
3754 If extralen==0, this means that we will not need long jumps. */
3755 if (RExC_size >= 0x10000L && RExC_extralen)
3756 RExC_size += RExC_extralen;
3759 if (RExC_whilem_seen > 15)
3760 RExC_whilem_seen = 15;
3762 /* Allocate space and initialize. */
3763 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
3766 FAIL("Regexp out of space");
3769 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
3770 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
3773 r->prelen = xend - exp;
3774 r->precomp = savepvn(RExC_precomp, r->prelen);
3776 #ifdef PERL_OLD_COPY_ON_WRITE
3777 r->saved_copy = NULL;
3779 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
3780 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
3781 r->lastparen = 0; /* mg.c reads this. */
3783 r->substrs = 0; /* Useful during FAIL. */
3784 r->startp = 0; /* Useful during FAIL. */
3785 r->endp = 0; /* Useful during FAIL. */
3787 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
3789 r->offsets[0] = RExC_size;
3791 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
3792 "%s %"UVuf" bytes for offset annotations.\n",
3793 r->offsets ? "Got" : "Couldn't get",
3794 (UV)((2*RExC_size+1) * sizeof(U32))));
3798 /* Second pass: emit code. */
3799 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
3804 RExC_emit_start = r->program;
3805 RExC_emit = r->program;
3806 /* Store the count of eval-groups for security checks: */
3807 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
3808 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3810 if (reg(pRExC_state, 0, &flags,1) == NULL)
3812 /* XXXX To minimize changes to RE engine we always allocate
3813 3-units-long substrs field. */
3814 Newx(r->substrs, 1, struct reg_substr_data);
3817 r->minlen = minlen = sawplus = sawopen = 0;
3818 Zero(r->substrs, 1, struct reg_substr_data);
3819 StructCopy(&zero_scan_data, &data, scan_data_t);
3821 #ifdef TRIE_STUDY_OPT
3823 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3824 RExC_state=copyRExC_state;
3825 if (data.last_found) {
3826 SvREFCNT_dec(data.longest_fixed);
3827 SvREFCNT_dec(data.longest_float);
3828 SvREFCNT_dec(data.last_found);
3831 copyRExC_state=RExC_state;
3834 /* Dig out information for optimizations. */
3835 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
3836 pm->op_pmflags = RExC_flags;
3838 r->reganch |= ROPT_UTF8; /* Unicode in it? */
3839 r->regstclass = NULL;
3840 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
3841 r->reganch |= ROPT_NAUGHTY;
3842 scan = r->program + 1; /* First BRANCH. */
3844 /* testing for BRANCH here tells us whether there is "must appear"
3845 data in the pattern. If there is then we can use it for optimisations */
3846 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
3848 STRLEN longest_float_length, longest_fixed_length;
3849 struct regnode_charclass_class ch_class; /* pointed to by data */
3851 I32 last_close = 0; /* pointed to by data */
3854 /* Skip introductions and multiplicators >= 1. */
3855 while ((OP(first) == OPEN && (sawopen = 1)) ||
3856 /* An OR of *one* alternative - should not happen now. */
3857 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
3858 /* for now we can't handle lookbehind IFMATCH*/
3859 (OP(first) == IFMATCH && !first->flags) ||
3860 (OP(first) == PLUS) ||
3861 (OP(first) == MINMOD) ||
3862 /* An {n,m} with n>0 */
3863 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3866 if (OP(first) == PLUS)
3869 first += regarglen[OP(first)];
3870 if (OP(first) == IFMATCH) {
3871 first = NEXTOPER(first);
3872 first += EXTRA_STEP_2ARGS;
3873 } else /* XXX possible optimisation for /(?=)/ */
3874 first = NEXTOPER(first);
3877 /* Starting-point info. */
3879 DEBUG_PEEP("first:",first,0);
3880 /* Ignore EXACT as we deal with it later. */
3881 if (PL_regkind[OP(first)] == EXACT) {
3882 if (OP(first) == EXACT)
3883 NOOP; /* Empty, get anchored substr later. */
3884 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
3885 r->regstclass = first;
3888 else if (PL_regkind[OP(first)] == TRIE &&
3889 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3892 /* this can happen only on restudy */
3893 if ( OP(first) == TRIE ) {
3894 struct regnode_1 *trieop;
3895 Newxz(trieop,1,struct regnode_1);
3896 StructCopy(first,trieop,struct regnode_1);
3897 trie_op=(regnode *)trieop;
3899 struct regnode_charclass *trieop;
3900 Newxz(trieop,1,struct regnode_charclass);
3901 StructCopy(first,trieop,struct regnode_charclass);
3902 trie_op=(regnode *)trieop;
3905 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3906 r->regstclass = trie_op;
3909 else if (strchr((const char*)PL_simple,OP(first)))
3910 r->regstclass = first;
3911 else if (PL_regkind[OP(first)] == BOUND ||
3912 PL_regkind[OP(first)] == NBOUND)
3913 r->regstclass = first;
3914 else if (PL_regkind[OP(first)] == BOL) {
3915 r->reganch |= (OP(first) == MBOL
3917 : (OP(first) == SBOL
3920 first = NEXTOPER(first);
3923 else if (OP(first) == GPOS) {
3924 r->reganch |= ROPT_ANCH_GPOS;
3925 first = NEXTOPER(first);
3928 else if (!sawopen && (OP(first) == STAR &&
3929 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
3930 !(r->reganch & ROPT_ANCH) )
3932 /* turn .* into ^.* with an implied $*=1 */
3934 (OP(NEXTOPER(first)) == REG_ANY)
3937 r->reganch |= type | ROPT_IMPLICIT;
3938 first = NEXTOPER(first);
3941 if (sawplus && (!sawopen || !RExC_sawback)
3942 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
3943 /* x+ must match at the 1st pos of run of x's */
3944 r->reganch |= ROPT_SKIP;
3946 /* Scan is after the zeroth branch, first is atomic matcher. */
3947 #ifdef TRIE_STUDY_OPT
3950 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3951 (IV)(first - scan + 1))
3955 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3956 (IV)(first - scan + 1))
3962 * If there's something expensive in the r.e., find the
3963 * longest literal string that must appear and make it the
3964 * regmust. Resolve ties in favor of later strings, since
3965 * the regstart check works with the beginning of the r.e.
3966 * and avoiding duplication strengthens checking. Not a
3967 * strong reason, but sufficient in the absence of others.
3968 * [Now we resolve ties in favor of the earlier string if
3969 * it happens that c_offset_min has been invalidated, since the
3970 * earlier string may buy us something the later one won't.]
3974 data.longest_fixed = newSVpvs("");
3975 data.longest_float = newSVpvs("");
3976 data.last_found = newSVpvs("");
3977 data.longest = &(data.longest_fixed);
3979 if (!r->regstclass) {
3980 cl_init(pRExC_state, &ch_class);
3981 data.start_class = &ch_class;
3982 stclass_flag = SCF_DO_STCLASS_AND;
3983 } else /* XXXX Check for BOUND? */
3985 data.last_closep = &last_close;
3987 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
3988 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
3994 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
3995 && data.last_start_min == 0 && data.last_end > 0
3996 && !RExC_seen_zerolen
3997 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
3998 r->reganch |= ROPT_CHECK_ALL;
3999 scan_commit(pRExC_state, &data,&minlen);
4000 SvREFCNT_dec(data.last_found);
4002 /* Note that code very similar to this but for anchored string
4003 follows immediately below, changes may need to be made to both.
4006 longest_float_length = CHR_SVLEN(data.longest_float);
4007 if (longest_float_length
4008 || (data.flags & SF_FL_BEFORE_EOL
4009 && (!(data.flags & SF_FL_BEFORE_MEOL)
4010 || (RExC_flags & PMf_MULTILINE))))
4014 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4015 && data.offset_fixed == data.offset_float_min
4016 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4017 goto remove_float; /* As in (a)+. */
4019 /* copy the information about the longest float from the reg_scan_data
4020 over to the program. */
4021 if (SvUTF8(data.longest_float)) {
4022 r->float_utf8 = data.longest_float;
4023 r->float_substr = NULL;
4025 r->float_substr = data.longest_float;
4026 r->float_utf8 = NULL;
4028 /* float_end_shift is how many chars that must be matched that
4029 follow this item. We calculate it ahead of time as once the
4030 lookbehind offset is added in we lose the ability to correctly
4032 ml = data.minlen_float ? *(data.minlen_float)
4033 : (I32)longest_float_length;
4034 r->float_end_shift = ml - data.offset_float_min
4035 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4036 + data.lookbehind_float;
4037 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4038 r->float_max_offset = data.offset_float_max;
4039 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4040 r->float_max_offset -= data.lookbehind_float;
4042 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4043 && (!(data.flags & SF_FL_BEFORE_MEOL)
4044 || (RExC_flags & PMf_MULTILINE)));
4045 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4049 r->float_substr = r->float_utf8 = NULL;
4050 SvREFCNT_dec(data.longest_float);
4051 longest_float_length = 0;
4054 /* Note that code very similar to this but for floating string
4055 is immediately above, changes may need to be made to both.
4058 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4059 if (longest_fixed_length
4060 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4061 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4062 || (RExC_flags & PMf_MULTILINE))))
4066 /* copy the information about the longest fixed
4067 from the reg_scan_data over to the program. */
4068 if (SvUTF8(data.longest_fixed)) {
4069 r->anchored_utf8 = data.longest_fixed;
4070 r->anchored_substr = NULL;
4072 r->anchored_substr = data.longest_fixed;
4073 r->anchored_utf8 = NULL;
4075 /* fixed_end_shift is how many chars that must be matched that
4076 follow this item. We calculate it ahead of time as once the
4077 lookbehind offset is added in we lose the ability to correctly
4079 ml = data.minlen_fixed ? *(data.minlen_fixed)
4080 : (I32)longest_fixed_length;
4081 r->anchored_end_shift = ml - data.offset_fixed
4082 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4083 + data.lookbehind_fixed;
4084 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4086 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4087 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4088 || (RExC_flags & PMf_MULTILINE)));
4089 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4092 r->anchored_substr = r->anchored_utf8 = NULL;
4093 SvREFCNT_dec(data.longest_fixed);
4094 longest_fixed_length = 0;
4097 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
4098 r->regstclass = NULL;
4099 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4101 && !(data.start_class->flags & ANYOF_EOS)
4102 && !cl_is_anything(data.start_class))
4104 const I32 n = add_data(pRExC_state, 1, "f");
4106 Newx(RExC_rx->data->data[n], 1,
4107 struct regnode_charclass_class);
4108 StructCopy(data.start_class,
4109 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4110 struct regnode_charclass_class);
4111 r->regstclass = (regnode*)RExC_rx->data->data[n];
4112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4113 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4114 regprop(r, sv, (regnode*)data.start_class);
4115 PerlIO_printf(Perl_debug_log,
4116 "synthetic stclass \"%s\".\n",
4117 SvPVX_const(sv));});
4120 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4121 if (longest_fixed_length > longest_float_length) {
4122 r->check_end_shift = r->anchored_end_shift;
4123 r->check_substr = r->anchored_substr;
4124 r->check_utf8 = r->anchored_utf8;
4125 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4126 if (r->reganch & ROPT_ANCH_SINGLE)
4127 r->reganch |= ROPT_NOSCAN;
4130 r->check_end_shift = r->float_end_shift;
4131 r->check_substr = r->float_substr;
4132 r->check_utf8 = r->float_utf8;
4133 r->check_offset_min = r->float_min_offset;
4134 r->check_offset_max = r->float_max_offset;
4136 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4137 This should be changed ASAP! */
4138 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
4139 r->reganch |= RE_USE_INTUIT;
4140 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4141 r->reganch |= RE_INTUIT_TAIL;
4143 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4144 if ( (STRLEN)minlen < longest_float_length )
4145 minlen= longest_float_length;
4146 if ( (STRLEN)minlen < longest_fixed_length )
4147 minlen= longest_fixed_length;
4151 /* Several toplevels. Best we can is to set minlen. */
4153 struct regnode_charclass_class ch_class;
4156 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
4158 scan = r->program + 1;
4159 cl_init(pRExC_state, &ch_class);
4160 data.start_class = &ch_class;
4161 data.last_closep = &last_close;
4163 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4164 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4168 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4169 = r->float_substr = r->float_utf8 = NULL;
4170 if (!(data.start_class->flags & ANYOF_EOS)
4171 && !cl_is_anything(data.start_class))
4173 const I32 n = add_data(pRExC_state, 1, "f");
4175 Newx(RExC_rx->data->data[n], 1,
4176 struct regnode_charclass_class);
4177 StructCopy(data.start_class,
4178 (struct regnode_charclass_class*)RExC_rx->data->data[n],
4179 struct regnode_charclass_class);
4180 r->regstclass = (regnode*)RExC_rx->data->data[n];
4181 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
4182 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4183 regprop(r, sv, (regnode*)data.start_class);
4184 PerlIO_printf(Perl_debug_log,
4185 "synthetic stclass \"%s\".\n",
4186 SvPVX_const(sv));});
4190 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4191 the "real" pattern. */
4192 if (r->minlen < minlen)
4195 if (RExC_seen & REG_SEEN_GPOS)
4196 r->reganch |= ROPT_GPOS_SEEN;
4197 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4198 r->reganch |= ROPT_LOOKBEHIND_SEEN;
4199 if (RExC_seen & REG_SEEN_EVAL)
4200 r->reganch |= ROPT_EVAL_SEEN;
4201 if (RExC_seen & REG_SEEN_CANY)
4202 r->reganch |= ROPT_CANY_SEEN;
4203 Newxz(r->startp, RExC_npar, I32);
4204 Newxz(r->endp, RExC_npar, I32);
4206 DEBUG_r( RX_DEBUG_on(r) );
4208 PerlIO_printf(Perl_debug_log,"Final program:\n");
4211 DEBUG_OFFSETS_r(if (r->offsets) {
4212 const U32 len = r->offsets[0];
4214 GET_RE_DEBUG_FLAGS_DECL;
4215 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4216 for (i = 1; i <= len; i++) {
4217 if (r->offsets[i*2-1] || r->offsets[i*2])
4218 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4219 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
4221 PerlIO_printf(Perl_debug_log, "\n");
4227 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4228 int rem=(int)(RExC_end - RExC_parse); \
4237 if (RExC_lastparse!=RExC_parse) \
4238 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4241 iscut ? "..." : "<" \
4244 PerlIO_printf(Perl_debug_log,"%16s",""); \
4249 num=REG_NODE_NUM(RExC_emit); \
4250 if (RExC_lastnum!=num) \
4251 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4253 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4254 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4255 (int)((depth*2)), "", \
4259 RExC_lastparse=RExC_parse; \
4264 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4265 DEBUG_PARSE_MSG((funcname)); \
4266 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4269 - reg - regular expression, i.e. main body or parenthesized thing
4271 * Caller must absorb opening parenthesis.
4273 * Combining parenthesis handling with the base level of regular expression
4274 * is a trifle forced, but the need to tie the tails of the branches to what
4275 * follows makes it hard to avoid.
4277 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4279 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4281 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4285 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4286 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4289 register regnode *ret; /* Will be the head of the group. */
4290 register regnode *br;
4291 register regnode *lastbr;
4292 register regnode *ender = NULL;
4293 register I32 parno = 0;
4295 const I32 oregflags = RExC_flags;
4296 bool have_branch = 0;
4299 /* for (?g), (?gc), and (?o) warnings; warning
4300 about (?c) will warn about (?g) -- japhy */
4302 #define WASTED_O 0x01
4303 #define WASTED_G 0x02
4304 #define WASTED_C 0x04
4305 #define WASTED_GC (0x02|0x04)
4306 I32 wastedflags = 0x00;
4308 char * parse_start = RExC_parse; /* MJD */
4309 char * const oregcomp_parse = RExC_parse;
4311 GET_RE_DEBUG_FLAGS_DECL;
4312 DEBUG_PARSE("reg ");
4315 *flagp = 0; /* Tentatively. */
4318 /* Make an OPEN node, if parenthesized. */
4320 if (*RExC_parse == '?') { /* (?...) */
4321 U32 posflags = 0, negflags = 0;
4322 U32 *flagsp = &posflags;
4323 bool is_logical = 0;
4324 const char * const seqstart = RExC_parse;
4327 paren = *RExC_parse++;
4328 ret = NULL; /* For look-ahead/behind. */
4330 case '<': /* (?<...) */
4331 RExC_seen |= REG_SEEN_LOOKBEHIND;
4332 if (*RExC_parse == '!')
4334 if (*RExC_parse != '=' && *RExC_parse != '!')
4337 case '=': /* (?=...) */
4338 case '!': /* (?!...) */
4339 RExC_seen_zerolen++;
4340 case ':': /* (?:...) */
4341 case '>': /* (?>...) */
4343 case '$': /* (?$...) */
4344 case '@': /* (?@...) */
4345 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
4347 case '#': /* (?#...) */
4348 while (*RExC_parse && *RExC_parse != ')')
4350 if (*RExC_parse != ')')
4351 FAIL("Sequence (?#... not terminated");
4352 nextchar(pRExC_state);
4355 case 'p': /* (?p...) */
4356 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
4357 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
4359 case '?': /* (??...) */
4361 if (*RExC_parse != '{')
4363 paren = *RExC_parse++;
4365 case '{': /* (?{...}) */
4367 I32 count = 1, n = 0;
4369 char *s = RExC_parse;
4371 RExC_seen_zerolen++;
4372 RExC_seen |= REG_SEEN_EVAL;
4373 while (count && (c = *RExC_parse)) {
4384 if (*RExC_parse != ')') {
4386 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4390 OP_4tree *sop, *rop;
4391 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
4394 Perl_save_re_context(aTHX);
4395 rop = sv_compile_2op(sv, &sop, "re", &pad);
4396 sop->op_private |= OPpREFCOUNTED;
4397 /* re_dup will OpREFCNT_inc */
4398 OpREFCNT_set(sop, 1);
4401 n = add_data(pRExC_state, 3, "nop");
4402 RExC_rx->data->data[n] = (void*)rop;
4403 RExC_rx->data->data[n+1] = (void*)sop;
4404 RExC_rx->data->data[n+2] = (void*)pad;
4407 else { /* First pass */
4408 if (PL_reginterp_cnt < ++RExC_seen_evals
4410 /* No compiled RE interpolated, has runtime
4411 components ===> unsafe. */
4412 FAIL("Eval-group not allowed at runtime, use re 'eval'");
4413 if (PL_tainting && PL_tainted)
4414 FAIL("Eval-group in insecure regular expression");
4415 #if PERL_VERSION > 8
4416 if (IN_PERL_COMPILETIME)
4421 nextchar(pRExC_state);
4423 ret = reg_node(pRExC_state, LOGICAL);
4426 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
4427 /* deal with the length of this later - MJD */
4430 ret = reganode(pRExC_state, EVAL, n);
4431 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4432 Set_Node_Offset(ret, parse_start);
4435 case '(': /* (?(?{...})...) and (?(?=...)...) */
4437 if (RExC_parse[0] == '?') { /* (?(?...)) */
4438 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4439 || RExC_parse[1] == '<'
4440 || RExC_parse[1] == '{') { /* Lookahead or eval. */
4443 ret = reg_node(pRExC_state, LOGICAL);
4446 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
4450 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
4453 parno = atoi(RExC_parse++);
4455 while (isDIGIT(*RExC_parse))
4457 ret = reganode(pRExC_state, GROUPP, parno);
4459 if ((c = *nextchar(pRExC_state)) != ')')
4460 vFAIL("Switch condition not recognized");
4462 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4463 br = regbranch(pRExC_state, &flags, 1,depth+1);
4465 br = reganode(pRExC_state, LONGJMP, 0);
4467 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
4468 c = *nextchar(pRExC_state);
4472 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
4473 regbranch(pRExC_state, &flags, 1,depth+1);
4474 REGTAIL(pRExC_state, ret, lastbr);
4477 c = *nextchar(pRExC_state);
4482 vFAIL("Switch (?(condition)... contains too many branches");
4483 ender = reg_node(pRExC_state, TAIL);
4484 REGTAIL(pRExC_state, br, ender);
4486 REGTAIL(pRExC_state, lastbr, ender);
4487 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
4490 REGTAIL(pRExC_state, ret, ender);
4494 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
4498 RExC_parse--; /* for vFAIL to print correctly */
4499 vFAIL("Sequence (? incomplete");
4503 parse_flags: /* (?i) */
4504 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
4505 /* (?g), (?gc) and (?o) are useless here
4506 and must be globally applied -- japhy */
4508 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4509 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4510 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
4511 if (! (wastedflags & wflagbit) ) {
4512 wastedflags |= wflagbit;
4515 "Useless (%s%c) - %suse /%c modifier",
4516 flagsp == &negflags ? "?-" : "?",
4518 flagsp == &negflags ? "don't " : "",
4524 else if (*RExC_parse == 'c') {
4525 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
4526 if (! (wastedflags & WASTED_C) ) {
4527 wastedflags |= WASTED_GC;
4530 "Useless (%sc) - %suse /gc modifier",
4531 flagsp == &negflags ? "?-" : "?",
4532 flagsp == &negflags ? "don't " : ""
4537 else { pmflag(flagsp, *RExC_parse); }
4541 if (*RExC_parse == '-') {
4543 wastedflags = 0; /* reset so (?g-c) warns twice */
4547 RExC_flags |= posflags;
4548 RExC_flags &= ~negflags;
4549 if (*RExC_parse == ':') {
4555 if (*RExC_parse != ')') {
4557 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
4559 nextchar(pRExC_state);
4567 ret = reganode(pRExC_state, OPEN, parno);
4568 Set_Node_Length(ret, 1); /* MJD */
4569 Set_Node_Offset(ret, RExC_parse); /* MJD */
4576 /* Pick up the branches, linking them together. */
4577 parse_start = RExC_parse; /* MJD */
4578 br = regbranch(pRExC_state, &flags, 1,depth+1);
4579 /* branch_len = (paren != 0); */
4583 if (*RExC_parse == '|') {
4584 if (!SIZE_ONLY && RExC_extralen) {
4585 reginsert(pRExC_state, BRANCHJ, br);
4588 reginsert(pRExC_state, BRANCH, br);
4589 Set_Node_Length(br, paren != 0);
4590 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4594 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
4596 else if (paren == ':') {
4597 *flagp |= flags&SIMPLE;
4599 if (is_open) { /* Starts with OPEN. */
4600 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
4602 else if (paren != '?') /* Not Conditional */
4604 *flagp |= flags & (SPSTART | HASWIDTH);
4606 while (*RExC_parse == '|') {
4607 if (!SIZE_ONLY && RExC_extralen) {
4608 ender = reganode(pRExC_state, LONGJMP,0);
4609 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
4612 RExC_extralen += 2; /* Account for LONGJMP. */
4613 nextchar(pRExC_state);
4614 br = regbranch(pRExC_state, &flags, 0, depth+1);
4618 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
4622 *flagp |= flags&SPSTART;
4625 if (have_branch || paren != ':') {
4626 /* Make a closing node, and hook it on the end. */
4629 ender = reg_node(pRExC_state, TAIL);
4632 ender = reganode(pRExC_state, CLOSE, parno);
4633 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4634 Set_Node_Length(ender,1); /* MJD */
4640 *flagp &= ~HASWIDTH;
4643 ender = reg_node(pRExC_state, SUCCEED);
4646 ender = reg_node(pRExC_state, END);
4649 REGTAIL_STUDY(pRExC_state, lastbr, ender);
4651 if (have_branch && !SIZE_ONLY) {
4652 /* Hook the tails of the branches to the closing node. */
4653 for (br = ret; br; br = regnext(br)) {
4654 const U8 op = PL_regkind[OP(br)];
4656 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
4658 else if (op == BRANCHJ) {
4659 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
4667 static const char parens[] = "=!<,>";
4669 if (paren && (p = strchr(parens, paren))) {
4670 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4671 int flag = (p - parens) > 1;
4674 node = SUSPEND, flag = 0;
4675 reginsert(pRExC_state, node,ret);
4676 Set_Node_Cur_Length(ret);
4677 Set_Node_Offset(ret, parse_start + 1);
4679 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
4683 /* Check for proper termination. */
4685 RExC_flags = oregflags;
4686 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4687 RExC_parse = oregcomp_parse;
4688 vFAIL("Unmatched (");
4691 else if (!paren && RExC_parse < RExC_end) {
4692 if (*RExC_parse == ')') {
4694 vFAIL("Unmatched )");
4697 FAIL("Junk on end of regexp"); /* "Can't happen". */
4705 - regbranch - one alternative of an | operator
4707 * Implements the concatenation operator.
4710 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4713 register regnode *ret;
4714 register regnode *chain = NULL;
4715 register regnode *latest;
4716 I32 flags = 0, c = 0;
4717 GET_RE_DEBUG_FLAGS_DECL;
4718 DEBUG_PARSE("brnc");
4722 if (!SIZE_ONLY && RExC_extralen)
4723 ret = reganode(pRExC_state, BRANCHJ,0);
4725 ret = reg_node(pRExC_state, BRANCH);
4726 Set_Node_Length(ret, 1);
4730 if (!first && SIZE_ONLY)
4731 RExC_extralen += 1; /* BRANCHJ */
4733 *flagp = WORST; /* Tentatively. */
4736 nextchar(pRExC_state);
4737 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4739 latest = regpiece(pRExC_state, &flags,depth+1);
4740 if (latest == NULL) {
4741 if (flags & TRYAGAIN)
4745 else if (ret == NULL)
4747 *flagp |= flags&HASWIDTH;
4748 if (chain == NULL) /* First piece. */
4749 *flagp |= flags&SPSTART;
4752 REGTAIL(pRExC_state, chain, latest);
4757 if (chain == NULL) { /* Loop ran zero times. */
4758 chain = reg_node(pRExC_state, NOTHING);
4763 *flagp |= flags&SIMPLE;
4770 - regpiece - something followed by possible [*+?]
4772 * Note that the branching code sequences used for ? and the general cases
4773 * of * and + are somewhat optimized: they use the same NOTHING node as
4774 * both the endmarker for their branch list and the body of the last branch.
4775 * It might seem that this node could be dispensed with entirely, but the
4776 * endmarker role is not redundant.
4779 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4782 register regnode *ret;
4784 register char *next;
4786 const char * const origparse = RExC_parse;
4788 I32 max = REG_INFTY;
4790 const char *maxpos = NULL;
4791 GET_RE_DEBUG_FLAGS_DECL;
4792 DEBUG_PARSE("piec");
4794 ret = regatom(pRExC_state, &flags,depth+1);
4796 if (flags & TRYAGAIN)
4803 if (op == '{' && regcurly(RExC_parse)) {
4805 parse_start = RExC_parse; /* MJD */
4806 next = RExC_parse + 1;
4807 while (isDIGIT(*next) || *next == ',') {
4816 if (*next == '}') { /* got one */
4820 min = atoi(RExC_parse);
4824 maxpos = RExC_parse;
4826 if (!max && *maxpos != '0')
4827 max = REG_INFTY; /* meaning "infinity" */
4828 else if (max >= REG_INFTY)
4829 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4831 nextchar(pRExC_state);
4834 if ((flags&SIMPLE)) {
4835 RExC_naughty += 2 + RExC_naughty / 2;
4836 reginsert(pRExC_state, CURLY, ret);
4837 Set_Node_Offset(ret, parse_start+1); /* MJD */
4838 Set_Node_Cur_Length(ret);
4841 regnode * const w = reg_node(pRExC_state, WHILEM);
4844 REGTAIL(pRExC_state, ret, w);
4845 if (!SIZE_ONLY && RExC_extralen) {
4846 reginsert(pRExC_state, LONGJMP,ret);
4847 reginsert(pRExC_state, NOTHING,ret);
4848 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4850 reginsert(pRExC_state, CURLYX,ret);
4852 Set_Node_Offset(ret, parse_start+1);
4853 Set_Node_Length(ret,
4854 op == '{' ? (RExC_parse - parse_start) : 1);
4856 if (!SIZE_ONLY && RExC_extralen)
4857 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
4858 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
4860 RExC_whilem_seen++, RExC_extralen += 3;
4861 RExC_naughty += 4 + RExC_naughty; /* compound interest */
4869 if (max && max < min)
4870 vFAIL("Can't do {n,m} with n > m");
4872 ARG1_SET(ret, (U16)min);
4873 ARG2_SET(ret, (U16)max);
4885 #if 0 /* Now runtime fix should be reliable. */
4887 /* if this is reinstated, don't forget to put this back into perldiag:
4889 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4891 (F) The part of the regexp subject to either the * or + quantifier
4892 could match an empty string. The {#} shows in the regular
4893 expression about where the problem was discovered.
4897 if (!(flags&HASWIDTH) && op != '?')
4898 vFAIL("Regexp *+ operand could be empty");
4901 parse_start = RExC_parse;
4902 nextchar(pRExC_state);
4904 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
4906 if (op == '*' && (flags&SIMPLE)) {
4907 reginsert(pRExC_state, STAR, ret);
4911 else if (op == '*') {
4915 else if (op == '+' && (flags&SIMPLE)) {
4916 reginsert(pRExC_state, PLUS, ret);
4920 else if (op == '+') {
4924 else if (op == '?') {
4929 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
4931 "%.*s matches null string many times",
4932 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
4936 if (*RExC_parse == '?') {
4937 nextchar(pRExC_state);
4938 reginsert(pRExC_state, MINMOD, ret);
4939 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
4941 if (ISMULT2(RExC_parse)) {
4943 vFAIL("Nested quantifiers");
4950 - regatom - the lowest level
4952 * Optimization: gobbles an entire sequence of ordinary characters so that
4953 * it can turn them into a single node, which is smaller to store and
4954 * faster to run. Backslashed characters are exceptions, each becoming a
4955 * separate node; the code is simpler that way and it's not worth fixing.
4957 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4958 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4961 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4964 register regnode *ret = NULL;
4966 char *parse_start = RExC_parse;
4967 GET_RE_DEBUG_FLAGS_DECL;
4968 DEBUG_PARSE("atom");
4969 *flagp = WORST; /* Tentatively. */
4972 switch (*RExC_parse) {
4974 RExC_seen_zerolen++;
4975 nextchar(pRExC_state);
4976 if (RExC_flags & PMf_MULTILINE)
4977 ret = reg_node(pRExC_state, MBOL);
4978 else if (RExC_flags & PMf_SINGLELINE)
4979 ret = reg_node(pRExC_state, SBOL);
4981 ret = reg_node(pRExC_state, BOL);
4982 Set_Node_Length(ret, 1); /* MJD */
4985 nextchar(pRExC_state);
4987 RExC_seen_zerolen++;
4988 if (RExC_flags & PMf_MULTILINE)
4989 ret = reg_node(pRExC_state, MEOL);
4990 else if (RExC_flags & PMf_SINGLELINE)
4991 ret = reg_node(pRExC_state, SEOL);
4993 ret = reg_node(pRExC_state, EOL);
4994 Set_Node_Length(ret, 1); /* MJD */
4997 nextchar(pRExC_state);
4998 if (RExC_flags & PMf_SINGLELINE)
4999 ret = reg_node(pRExC_state, SANY);
5001 ret = reg_node(pRExC_state, REG_ANY);
5002 *flagp |= HASWIDTH|SIMPLE;
5004 Set_Node_Length(ret, 1); /* MJD */
5008 char * const oregcomp_parse = ++RExC_parse;
5009 ret = regclass(pRExC_state,depth+1);
5010 if (*RExC_parse != ']') {
5011 RExC_parse = oregcomp_parse;
5012 vFAIL("Unmatched [");
5014 nextchar(pRExC_state);
5015 *flagp |= HASWIDTH|SIMPLE;
5016 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
5020 nextchar(pRExC_state);
5021 ret = reg(pRExC_state, 1, &flags,depth+1);
5023 if (flags & TRYAGAIN) {
5024 if (RExC_parse == RExC_end) {
5025 /* Make parent create an empty node if needed. */
5033 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
5037 if (flags & TRYAGAIN) {
5041 vFAIL("Internal urp");
5042 /* Supposed to be caught earlier. */
5045 if (!regcurly(RExC_parse)) {
5054 vFAIL("Quantifier follows nothing");
5057 switch (*++RExC_parse) {
5059 RExC_seen_zerolen++;
5060 ret = reg_node(pRExC_state, SBOL);
5062 nextchar(pRExC_state);
5063 Set_Node_Length(ret, 2); /* MJD */
5066 ret = reg_node(pRExC_state, GPOS);
5067 RExC_seen |= REG_SEEN_GPOS;
5069 nextchar(pRExC_state);
5070 Set_Node_Length(ret, 2); /* MJD */
5073 ret = reg_node(pRExC_state, SEOL);
5075 RExC_seen_zerolen++; /* Do not optimize RE away */
5076 nextchar(pRExC_state);
5079 ret = reg_node(pRExC_state, EOS);
5081 RExC_seen_zerolen++; /* Do not optimize RE away */
5082 nextchar(pRExC_state);
5083 Set_Node_Length(ret, 2); /* MJD */
5086 ret = reg_node(pRExC_state, CANY);
5087 RExC_seen |= REG_SEEN_CANY;
5088 *flagp |= HASWIDTH|SIMPLE;
5089 nextchar(pRExC_state);
5090 Set_Node_Length(ret, 2); /* MJD */
5093 ret = reg_node(pRExC_state, CLUMP);
5095 nextchar(pRExC_state);
5096 Set_Node_Length(ret, 2); /* MJD */
5099 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
5100 *flagp |= HASWIDTH|SIMPLE;
5101 nextchar(pRExC_state);
5102 Set_Node_Length(ret, 2); /* MJD */
5105 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
5106 *flagp |= HASWIDTH|SIMPLE;
5107 nextchar(pRExC_state);
5108 Set_Node_Length(ret, 2); /* MJD */
5111 RExC_seen_zerolen++;
5112 RExC_seen |= REG_SEEN_LOOKBEHIND;
5113 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
5115 nextchar(pRExC_state);
5116 Set_Node_Length(ret, 2); /* MJD */
5119 RExC_seen_zerolen++;
5120 RExC_seen |= REG_SEEN_LOOKBEHIND;
5121 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
5123 nextchar(pRExC_state);
5124 Set_Node_Length(ret, 2); /* MJD */
5127 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
5128 *flagp |= HASWIDTH|SIMPLE;
5129 nextchar(pRExC_state);
5130 Set_Node_Length(ret, 2); /* MJD */
5133 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
5134 *flagp |= HASWIDTH|SIMPLE;
5135 nextchar(pRExC_state);
5136 Set_Node_Length(ret, 2); /* MJD */
5139 ret = reg_node(pRExC_state, DIGIT);
5140 *flagp |= HASWIDTH|SIMPLE;
5141 nextchar(pRExC_state);
5142 Set_Node_Length(ret, 2); /* MJD */
5145 ret = reg_node(pRExC_state, NDIGIT);
5146 *flagp |= HASWIDTH|SIMPLE;
5147 nextchar(pRExC_state);
5148 Set_Node_Length(ret, 2); /* MJD */
5153 char* const oldregxend = RExC_end;
5154 char* parse_start = RExC_parse - 2;
5156 if (RExC_parse[1] == '{') {
5157 /* a lovely hack--pretend we saw [\pX] instead */
5158 RExC_end = strchr(RExC_parse, '}');
5160 const U8 c = (U8)*RExC_parse;
5162 RExC_end = oldregxend;
5163 vFAIL2("Missing right brace on \\%c{}", c);
5168 RExC_end = RExC_parse + 2;
5169 if (RExC_end > oldregxend)
5170 RExC_end = oldregxend;
5174 ret = regclass(pRExC_state,depth+1);
5176 RExC_end = oldregxend;
5179 Set_Node_Offset(ret, parse_start + 2);
5180 Set_Node_Cur_Length(ret);
5181 nextchar(pRExC_state);
5182 *flagp |= HASWIDTH|SIMPLE;
5195 case '1': case '2': case '3': case '4':
5196 case '5': case '6': case '7': case '8': case '9':
5198 const I32 num = atoi(RExC_parse);
5200 if (num > 9 && num >= RExC_npar)
5203 char * const parse_start = RExC_parse - 1; /* MJD */
5204 while (isDIGIT(*RExC_parse))
5207 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
5208 vFAIL("Reference to nonexistent group");
5210 ret = reganode(pRExC_state,
5211 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
5215 /* override incorrect value set in reganode MJD */
5216 Set_Node_Offset(ret, parse_start+1);
5217 Set_Node_Cur_Length(ret); /* MJD */
5219 nextchar(pRExC_state);
5224 if (RExC_parse >= RExC_end)
5225 FAIL("Trailing \\");
5228 /* Do not generate "unrecognized" warnings here, we fall
5229 back into the quick-grab loop below */
5236 if (RExC_flags & PMf_EXTENDED) {
5237 while (RExC_parse < RExC_end && *RExC_parse != '\n')
5239 if (RExC_parse < RExC_end)
5245 register STRLEN len;
5250 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5252 parse_start = RExC_parse - 1;
5258 ret = reg_node(pRExC_state,
5259 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5261 for (len = 0, p = RExC_parse - 1;
5262 len < 127 && p < RExC_end;
5265 char * const oldp = p;
5267 if (RExC_flags & PMf_EXTENDED)
5268 p = regwhite(p, RExC_end);
5315 ender = ASCII_TO_NATIVE('\033');
5319 ender = ASCII_TO_NATIVE('\007');
5324 char* const e = strchr(p, '}');
5328 vFAIL("Missing right brace on \\x{}");
5331 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5332 | PERL_SCAN_DISALLOW_PREFIX;
5333 STRLEN numlen = e - p - 1;
5334 ender = grok_hex(p + 1, &numlen, &flags, NULL);
5341 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5343 ender = grok_hex(p, &numlen, &flags, NULL);
5349 ender = UCHARAT(p++);
5350 ender = toCTRL(ender);
5352 case '0': case '1': case '2': case '3':case '4':
5353 case '5': case '6': case '7': case '8':case '9':
5355 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
5358 ender = grok_oct(p, &numlen, &flags, NULL);
5368 FAIL("Trailing \\");
5371 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
5372 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
5373 goto normal_default;
5378 if (UTF8_IS_START(*p) && UTF) {
5380 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
5381 &numlen, UTF8_ALLOW_DEFAULT);
5388 if (RExC_flags & PMf_EXTENDED)
5389 p = regwhite(p, RExC_end);
5391 /* Prime the casefolded buffer. */
5392 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
5394 if (ISMULT2(p)) { /* Back off on ?+*. */
5399 /* Emit all the Unicode characters. */
5401 for (foldbuf = tmpbuf;
5403 foldlen -= numlen) {
5404 ender = utf8_to_uvchr(foldbuf, &numlen);
5406 const STRLEN unilen = reguni(pRExC_state, ender, s);
5409 /* In EBCDIC the numlen
5410 * and unilen can differ. */
5412 if (numlen >= foldlen)
5416 break; /* "Can't happen." */
5420 const STRLEN unilen = reguni(pRExC_state, ender, s);
5429 REGC((char)ender, s++);
5435 /* Emit all the Unicode characters. */
5437 for (foldbuf = tmpbuf;
5439 foldlen -= numlen) {
5440 ender = utf8_to_uvchr(foldbuf, &numlen);
5442 const STRLEN unilen = reguni(pRExC_state, ender, s);
5445 /* In EBCDIC the numlen
5446 * and unilen can differ. */
5448 if (numlen >= foldlen)
5456 const STRLEN unilen = reguni(pRExC_state, ender, s);
5465 REGC((char)ender, s++);
5469 Set_Node_Cur_Length(ret); /* MJD */
5470 nextchar(pRExC_state);
5472 /* len is STRLEN which is unsigned, need to copy to signed */
5475 vFAIL("Internal disaster");
5479 if (len == 1 && UNI_IS_INVARIANT(ender))
5483 RExC_size += STR_SZ(len);
5486 RExC_emit += STR_SZ(len);
5492 /* If the encoding pragma is in effect recode the text of
5493 * any EXACT-kind nodes. */
5494 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
5495 const STRLEN oldlen = STR_LEN(ret);
5496 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
5500 if (sv_utf8_downgrade(sv, TRUE)) {
5501 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5502 const STRLEN newlen = SvCUR(sv);
5507 GET_RE_DEBUG_FLAGS_DECL;
5508 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
5509 (int)oldlen, STRING(ret),
5511 Copy(s, STRING(ret), newlen, char);
5512 STR_LEN(ret) += newlen - oldlen;
5513 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5515 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5523 S_regwhite(char *p, const char *e)
5528 else if (*p == '#') {
5531 } while (p < e && *p != '\n');
5539 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5540 Character classes ([:foo:]) can also be negated ([:^foo:]).
5541 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5542 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
5543 but trigger failures because they are currently unimplemented. */
5545 #define POSIXCC_DONE(c) ((c) == ':')
5546 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5547 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5550 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
5553 I32 namedclass = OOB_NAMEDCLASS;
5555 if (value == '[' && RExC_parse + 1 < RExC_end &&
5556 /* I smell either [: or [= or [. -- POSIX has been here, right? */
5557 POSIXCC(UCHARAT(RExC_parse))) {
5558 const char c = UCHARAT(RExC_parse);
5559 char* const s = RExC_parse++;
5561 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
5563 if (RExC_parse == RExC_end)
5564 /* Grandfather lone [:, [=, [. */
5567 const char* const t = RExC_parse++; /* skip over the c */
5570 if (UCHARAT(RExC_parse) == ']') {
5571 const char *posixcc = s + 1;
5572 RExC_parse++; /* skip over the ending ] */
5575 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5576 const I32 skip = t - posixcc;
5578 /* Initially switch on the length of the name. */
5581 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5582 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
5585 /* Names all of length 5. */
5586 /* alnum alpha ascii blank cntrl digit graph lower
5587 print punct space upper */
5588 /* Offset 4 gives the best switch position. */
5589 switch (posixcc[4]) {
5591 if (memEQ(posixcc, "alph", 4)) /* alpha */
5592 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
5595 if (memEQ(posixcc, "spac", 4)) /* space */
5596 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
5599 if (memEQ(posixcc, "grap", 4)) /* graph */
5600 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
5603 if (memEQ(posixcc, "asci", 4)) /* ascii */
5604 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
5607 if (memEQ(posixcc, "blan", 4)) /* blank */
5608 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
5611 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5612 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
5615 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5616 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
5619 if (memEQ(posixcc, "lowe", 4)) /* lower */
5620 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5621 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5622 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
5625 if (memEQ(posixcc, "digi", 4)) /* digit */
5626 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5627 else if (memEQ(posixcc, "prin", 4)) /* print */
5628 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5629 else if (memEQ(posixcc, "punc", 4)) /* punct */
5630 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
5635 if (memEQ(posixcc, "xdigit", 6))
5636 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
5640 if (namedclass == OOB_NAMEDCLASS)
5641 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5643 assert (posixcc[skip] == ':');
5644 assert (posixcc[skip+1] == ']');
5645 } else if (!SIZE_ONLY) {
5646 /* [[=foo=]] and [[.foo.]] are still future. */
5648 /* adjust RExC_parse so the warning shows after
5650 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
5652 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5655 /* Maternal grandfather:
5656 * "[:" ending in ":" but not in ":]" */
5666 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
5669 if (POSIXCC(UCHARAT(RExC_parse))) {
5670 const char *s = RExC_parse;
5671 const char c = *s++;
5675 if (*s && c == *s && s[1] == ']') {
5676 if (ckWARN(WARN_REGEXP))
5678 "POSIX syntax [%c %c] belongs inside character classes",
5681 /* [[=foo=]] and [[.foo.]] are still future. */
5682 if (POSIXCC_NOTYET(c)) {
5683 /* adjust RExC_parse so the error shows after
5685 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
5687 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5695 parse a class specification and produce either an ANYOF node that
5696 matches the pattern. If the pattern matches a single char only and
5697 that char is < 256 then we produce an EXACT node instead.
5700 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
5703 register UV value = 0;
5704 register UV nextvalue;
5705 register IV prevvalue = OOB_UNICODE;
5706 register IV range = 0;
5707 register regnode *ret;
5710 char *rangebegin = NULL;
5711 bool need_class = 0;
5714 bool optimize_invert = TRUE;
5715 AV* unicode_alternate = NULL;
5717 UV literal_endpoint = 0;
5719 UV stored = 0; /* number of chars stored in the class */
5721 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
5722 case we need to change the emitted regop to an EXACT. */
5723 const char * orig_parse = RExC_parse;
5724 GET_RE_DEBUG_FLAGS_DECL;
5726 PERL_UNUSED_ARG(depth);
5729 DEBUG_PARSE("clas");
5731 /* Assume we are going to generate an ANYOF node. */
5732 ret = reganode(pRExC_state, ANYOF, 0);
5735 ANYOF_FLAGS(ret) = 0;
5737 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
5741 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5745 RExC_size += ANYOF_SKIP;
5746 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5749 RExC_emit += ANYOF_SKIP;
5751 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5753 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
5754 ANYOF_BITMAP_ZERO(ret);
5755 listsv = newSVpvs("# comment\n");
5758 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5760 if (!SIZE_ONLY && POSIXCC(nextvalue))
5761 checkposixcc(pRExC_state);
5763 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5764 if (UCHARAT(RExC_parse) == ']')
5767 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
5771 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5774 rangebegin = RExC_parse;
5776 value = utf8n_to_uvchr((U8*)RExC_parse,
5777 RExC_end - RExC_parse,
5778 &numlen, UTF8_ALLOW_DEFAULT);
5779 RExC_parse += numlen;
5782 value = UCHARAT(RExC_parse++);
5784 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5785 if (value == '[' && POSIXCC(nextvalue))
5786 namedclass = regpposixcc(pRExC_state, value);
5787 else if (value == '\\') {
5789 value = utf8n_to_uvchr((U8*)RExC_parse,
5790 RExC_end - RExC_parse,
5791 &numlen, UTF8_ALLOW_DEFAULT);
5792 RExC_parse += numlen;
5795 value = UCHARAT(RExC_parse++);
5796 /* Some compilers cannot handle switching on 64-bit integer
5797 * values, therefore value cannot be an UV. Yes, this will
5798 * be a problem later if we want switch on Unicode.
5799 * A similar issue a little bit later when switching on
5800 * namedclass. --jhi */
5801 switch ((I32)value) {
5802 case 'w': namedclass = ANYOF_ALNUM; break;
5803 case 'W': namedclass = ANYOF_NALNUM; break;
5804 case 's': namedclass = ANYOF_SPACE; break;
5805 case 'S': namedclass = ANYOF_NSPACE; break;
5806 case 'd': namedclass = ANYOF_DIGIT; break;
5807 case 'D': namedclass = ANYOF_NDIGIT; break;
5812 if (RExC_parse >= RExC_end)
5813 vFAIL2("Empty \\%c{}", (U8)value);
5814 if (*RExC_parse == '{') {
5815 const U8 c = (U8)value;
5816 e = strchr(RExC_parse++, '}');
5818 vFAIL2("Missing right brace on \\%c{}", c);
5819 while (isSPACE(UCHARAT(RExC_parse)))
5821 if (e == RExC_parse)
5822 vFAIL2("Empty \\%c{}", c);
5824 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5832 if (UCHARAT(RExC_parse) == '^') {
5835 value = value == 'p' ? 'P' : 'p'; /* toggle */
5836 while (isSPACE(UCHARAT(RExC_parse))) {
5841 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5842 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
5845 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5846 namedclass = ANYOF_MAX; /* no official name, but it's named */
5849 case 'n': value = '\n'; break;
5850 case 'r': value = '\r'; break;
5851 case 't': value = '\t'; break;
5852 case 'f': value = '\f'; break;
5853 case 'b': value = '\b'; break;
5854 case 'e': value = ASCII_TO_NATIVE('\033');break;
5855 case 'a': value = ASCII_TO_NATIVE('\007');break;
5857 if (*RExC_parse == '{') {
5858 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5859 | PERL_SCAN_DISALLOW_PREFIX;
5860 char * const e = strchr(RExC_parse++, '}');
5862 vFAIL("Missing right brace on \\x{}");
5864 numlen = e - RExC_parse;
5865 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5869 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
5871 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
5872 RExC_parse += numlen;
5876 value = UCHARAT(RExC_parse++);
5877 value = toCTRL(value);
5879 case '0': case '1': case '2': case '3': case '4':
5880 case '5': case '6': case '7': case '8': case '9':
5884 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
5885 RExC_parse += numlen;
5889 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
5891 "Unrecognized escape \\%c in character class passed through",
5895 } /* end of \blah */
5901 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5903 if (!SIZE_ONLY && !need_class)
5904 ANYOF_CLASS_ZERO(ret);
5908 /* a bad range like a-\d, a-[:digit:] ? */
5911 if (ckWARN(WARN_REGEXP)) {
5913 RExC_parse >= rangebegin ?
5914 RExC_parse - rangebegin : 0;
5916 "False [] range \"%*.*s\"",
5919 if (prevvalue < 256) {
5920 ANYOF_BITMAP_SET(ret, prevvalue);
5921 ANYOF_BITMAP_SET(ret, '-');
5924 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5925 Perl_sv_catpvf(aTHX_ listsv,
5926 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
5930 range = 0; /* this was not a true range */
5934 const char *what = NULL;
5937 if (namedclass > OOB_NAMEDCLASS)
5938 optimize_invert = FALSE;
5939 /* Possible truncation here but in some 64-bit environments
5940 * the compiler gets heartburn about switch on 64-bit values.
5941 * A similar issue a little earlier when switching on value.
5943 switch ((I32)namedclass) {
5946 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
5948 for (value = 0; value < 256; value++)
5950 ANYOF_BITMAP_SET(ret, value);
5957 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
5959 for (value = 0; value < 256; value++)
5960 if (!isALNUM(value))
5961 ANYOF_BITMAP_SET(ret, value);
5968 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
5970 for (value = 0; value < 256; value++)
5971 if (isALNUMC(value))
5972 ANYOF_BITMAP_SET(ret, value);
5979 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
5981 for (value = 0; value < 256; value++)
5982 if (!isALNUMC(value))
5983 ANYOF_BITMAP_SET(ret, value);
5990 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
5992 for (value = 0; value < 256; value++)
5994 ANYOF_BITMAP_SET(ret, value);
6001 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
6003 for (value = 0; value < 256; value++)
6004 if (!isALPHA(value))
6005 ANYOF_BITMAP_SET(ret, value);
6012 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
6015 for (value = 0; value < 128; value++)
6016 ANYOF_BITMAP_SET(ret, value);
6018 for (value = 0; value < 256; value++) {
6020 ANYOF_BITMAP_SET(ret, value);
6029 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
6032 for (value = 128; value < 256; value++)
6033 ANYOF_BITMAP_SET(ret, value);
6035 for (value = 0; value < 256; value++) {
6036 if (!isASCII(value))
6037 ANYOF_BITMAP_SET(ret, value);
6046 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6048 for (value = 0; value < 256; value++)
6050 ANYOF_BITMAP_SET(ret, value);
6057 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6059 for (value = 0; value < 256; value++)
6060 if (!isBLANK(value))
6061 ANYOF_BITMAP_SET(ret, value);
6068 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
6070 for (value = 0; value < 256; value++)
6072 ANYOF_BITMAP_SET(ret, value);
6079 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
6081 for (value = 0; value < 256; value++)
6082 if (!isCNTRL(value))
6083 ANYOF_BITMAP_SET(ret, value);
6090 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6092 /* consecutive digits assumed */
6093 for (value = '0'; value <= '9'; value++)
6094 ANYOF_BITMAP_SET(ret, value);
6101 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6103 /* consecutive digits assumed */
6104 for (value = 0; value < '0'; value++)
6105 ANYOF_BITMAP_SET(ret, value);
6106 for (value = '9' + 1; value < 256; value++)
6107 ANYOF_BITMAP_SET(ret, value);
6114 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
6116 for (value = 0; value < 256; value++)
6118 ANYOF_BITMAP_SET(ret, value);
6125 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
6127 for (value = 0; value < 256; value++)
6128 if (!isGRAPH(value))
6129 ANYOF_BITMAP_SET(ret, value);
6136 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
6138 for (value = 0; value < 256; value++)
6140 ANYOF_BITMAP_SET(ret, value);
6147 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
6149 for (value = 0; value < 256; value++)
6150 if (!isLOWER(value))
6151 ANYOF_BITMAP_SET(ret, value);
6158 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
6160 for (value = 0; value < 256; value++)
6162 ANYOF_BITMAP_SET(ret, value);
6169 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
6171 for (value = 0; value < 256; value++)
6172 if (!isPRINT(value))
6173 ANYOF_BITMAP_SET(ret, value);
6180 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
6182 for (value = 0; value < 256; value++)
6183 if (isPSXSPC(value))
6184 ANYOF_BITMAP_SET(ret, value);
6191 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
6193 for (value = 0; value < 256; value++)
6194 if (!isPSXSPC(value))
6195 ANYOF_BITMAP_SET(ret, value);
6202 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
6204 for (value = 0; value < 256; value++)
6206 ANYOF_BITMAP_SET(ret, value);
6213 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
6215 for (value = 0; value < 256; value++)
6216 if (!isPUNCT(value))
6217 ANYOF_BITMAP_SET(ret, value);
6224 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
6226 for (value = 0; value < 256; value++)
6228 ANYOF_BITMAP_SET(ret, value);
6235 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
6237 for (value = 0; value < 256; value++)
6238 if (!isSPACE(value))
6239 ANYOF_BITMAP_SET(ret, value);
6246 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
6248 for (value = 0; value < 256; value++)
6250 ANYOF_BITMAP_SET(ret, value);
6257 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
6259 for (value = 0; value < 256; value++)
6260 if (!isUPPER(value))
6261 ANYOF_BITMAP_SET(ret, value);
6268 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
6270 for (value = 0; value < 256; value++)
6271 if (isXDIGIT(value))
6272 ANYOF_BITMAP_SET(ret, value);
6279 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
6281 for (value = 0; value < 256; value++)
6282 if (!isXDIGIT(value))
6283 ANYOF_BITMAP_SET(ret, value);
6289 /* this is to handle \p and \P */
6292 vFAIL("Invalid [::] class");
6296 /* Strings such as "+utf8::isWord\n" */
6297 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6300 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
6303 } /* end of namedclass \blah */
6306 if (prevvalue > (IV)value) /* b-a */ {
6307 const int w = RExC_parse - rangebegin;
6308 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
6309 range = 0; /* not a valid range */
6313 prevvalue = value; /* save the beginning of the range */
6314 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6315 RExC_parse[1] != ']') {
6318 /* a bad range like \w-, [:word:]- ? */
6319 if (namedclass > OOB_NAMEDCLASS) {
6320 if (ckWARN(WARN_REGEXP)) {
6322 RExC_parse >= rangebegin ?
6323 RExC_parse - rangebegin : 0;
6325 "False [] range \"%*.*s\"",
6329 ANYOF_BITMAP_SET(ret, '-');
6331 range = 1; /* yeah, it's a range! */
6332 continue; /* but do it the next time */
6336 /* now is the next time */
6337 /*stored += (value - prevvalue + 1);*/
6339 if (prevvalue < 256) {
6340 const IV ceilvalue = value < 256 ? value : 255;
6343 /* In EBCDIC [\x89-\x91] should include
6344 * the \x8e but [i-j] should not. */
6345 if (literal_endpoint == 2 &&
6346 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6347 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
6349 if (isLOWER(prevvalue)) {
6350 for (i = prevvalue; i <= ceilvalue; i++)
6352 ANYOF_BITMAP_SET(ret, i);
6354 for (i = prevvalue; i <= ceilvalue; i++)
6356 ANYOF_BITMAP_SET(ret, i);
6361 for (i = prevvalue; i <= ceilvalue; i++) {
6362 if (!ANYOF_BITMAP_TEST(ret,i)) {
6364 ANYOF_BITMAP_SET(ret, i);
6368 if (value > 255 || UTF) {
6369 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6370 const UV natvalue = NATIVE_TO_UNI(value);
6371 stored+=2; /* can't optimize this class */
6372 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6373 if (prevnatvalue < natvalue) { /* what about > ? */
6374 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
6375 prevnatvalue, natvalue);
6377 else if (prevnatvalue == natvalue) {
6378 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
6380 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
6382 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
6384 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6385 if (RExC_precomp[0] == ':' &&
6386 RExC_precomp[1] == '[' &&
6387 (f == 0xDF || f == 0x92)) {
6388 f = NATIVE_TO_UNI(f);
6391 /* If folding and foldable and a single
6392 * character, insert also the folded version
6393 * to the charclass. */
6395 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6396 if ((RExC_precomp[0] == ':' &&
6397 RExC_precomp[1] == '[' &&
6399 (value == 0xFB05 || value == 0xFB06))) ?
6400 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6401 foldlen == (STRLEN)UNISKIP(f) )
6403 if (foldlen == (STRLEN)UNISKIP(f))
6405 Perl_sv_catpvf(aTHX_ listsv,
6408 /* Any multicharacter foldings
6409 * require the following transform:
6410 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6411 * where E folds into "pq" and F folds
6412 * into "rst", all other characters
6413 * fold to single characters. We save
6414 * away these multicharacter foldings,
6415 * to be later saved as part of the
6416 * additional "s" data. */
6419 if (!unicode_alternate)
6420 unicode_alternate = newAV();
6421 sv = newSVpvn((char*)foldbuf, foldlen);
6423 av_push(unicode_alternate, sv);
6427 /* If folding and the value is one of the Greek
6428 * sigmas insert a few more sigmas to make the
6429 * folding rules of the sigmas to work right.
6430 * Note that not all the possible combinations
6431 * are handled here: some of them are handled
6432 * by the standard folding rules, and some of
6433 * them (literal or EXACTF cases) are handled
6434 * during runtime in regexec.c:S_find_byclass(). */
6435 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6436 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6437 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
6438 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6439 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
6441 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6442 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
6443 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
6448 literal_endpoint = 0;
6452 range = 0; /* this range (if it was one) is done now */
6456 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
6458 RExC_size += ANYOF_CLASS_ADD_SKIP;
6460 RExC_emit += ANYOF_CLASS_ADD_SKIP;
6466 /****** !SIZE_ONLY AFTER HERE *********/
6468 if( stored == 1 && value < 256
6469 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6471 /* optimize single char class to an EXACT node
6472 but *only* when its not a UTF/high char */
6473 const char * cur_parse= RExC_parse;
6474 RExC_emit = (regnode *)orig_emit;
6475 RExC_parse = (char *)orig_parse;
6476 ret = reg_node(pRExC_state,
6477 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
6478 RExC_parse = (char *)cur_parse;
6479 *STRING(ret)= (char)value;
6481 RExC_emit += STR_SZ(1);
6484 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
6485 if ( /* If the only flag is folding (plus possibly inversion). */
6486 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6488 for (value = 0; value < 256; ++value) {
6489 if (ANYOF_BITMAP_TEST(ret, value)) {
6490 UV fold = PL_fold[value];
6493 ANYOF_BITMAP_SET(ret, fold);
6496 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
6499 /* optimize inverted simple patterns (e.g. [^a-z]) */
6500 if (optimize_invert &&
6501 /* If the only flag is inversion. */
6502 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
6503 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
6504 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
6505 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
6508 AV * const av = newAV();
6510 /* The 0th element stores the character class description
6511 * in its textual form: used later (regexec.c:Perl_regclass_swash())
6512 * to initialize the appropriate swash (which gets stored in
6513 * the 1st element), and also useful for dumping the regnode.
6514 * The 2nd element stores the multicharacter foldings,
6515 * used later (regexec.c:S_reginclass()). */
6516 av_store(av, 0, listsv);
6517 av_store(av, 1, NULL);
6518 av_store(av, 2, (SV*)unicode_alternate);
6519 rv = newRV_noinc((SV*)av);
6520 n = add_data(pRExC_state, 1, "s");
6521 RExC_rx->data->data[n] = (void*)rv;
6528 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
6530 char* const retval = RExC_parse++;
6533 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6534 RExC_parse[2] == '#') {
6535 while (*RExC_parse != ')') {
6536 if (RExC_parse == RExC_end)
6537 FAIL("Sequence (?#... not terminated");
6543 if (RExC_flags & PMf_EXTENDED) {
6544 if (isSPACE(*RExC_parse)) {
6548 else if (*RExC_parse == '#') {
6549 while (RExC_parse < RExC_end)
6550 if (*RExC_parse++ == '\n') break;
6559 - reg_node - emit a node
6561 STATIC regnode * /* Location. */
6562 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
6565 register regnode *ptr;
6566 regnode * const ret = RExC_emit;
6567 GET_RE_DEBUG_FLAGS_DECL;
6570 SIZE_ALIGN(RExC_size);
6574 NODE_ALIGN_FILL(ret);
6576 FILL_ADVANCE_NODE(ptr, op);
6577 if (RExC_offsets) { /* MJD */
6578 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
6579 "reg_node", __LINE__,
6581 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6582 ? "Overwriting end of array!\n" : "OK",
6583 (UV)(RExC_emit - RExC_emit_start),
6584 (UV)(RExC_parse - RExC_start),
6585 (UV)RExC_offsets[0]));
6586 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
6595 - reganode - emit a node with an argument
6597 STATIC regnode * /* Location. */
6598 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
6601 register regnode *ptr;
6602 regnode * const ret = RExC_emit;
6603 GET_RE_DEBUG_FLAGS_DECL;
6606 SIZE_ALIGN(RExC_size);
6611 NODE_ALIGN_FILL(ret);
6613 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
6614 if (RExC_offsets) { /* MJD */
6615 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6619 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
6620 "Overwriting end of array!\n" : "OK",
6621 (UV)(RExC_emit - RExC_emit_start),
6622 (UV)(RExC_parse - RExC_start),
6623 (UV)RExC_offsets[0]));
6624 Set_Cur_Node_Offset;
6633 - reguni - emit (if appropriate) a Unicode character
6636 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
6639 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
6643 - reginsert - insert an operator in front of already-emitted operand
6645 * Means relocating the operand.
6648 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
6651 register regnode *src;
6652 register regnode *dst;
6653 register regnode *place;
6654 const int offset = regarglen[(U8)op];
6655 GET_RE_DEBUG_FLAGS_DECL;
6656 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6659 RExC_size += NODE_STEP_REGNODE + offset;
6664 RExC_emit += NODE_STEP_REGNODE + offset;
6666 while (src > opnd) {
6667 StructCopy(--src, --dst, regnode);
6668 if (RExC_offsets) { /* MJD 20010112 */
6669 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
6673 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6674 ? "Overwriting end of array!\n" : "OK",
6675 (UV)(src - RExC_emit_start),
6676 (UV)(dst - RExC_emit_start),
6677 (UV)RExC_offsets[0]));
6678 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6679 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
6684 place = opnd; /* Op node, where operand used to be. */
6685 if (RExC_offsets) { /* MJD */
6686 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
6690 (UV)(place - RExC_emit_start) > RExC_offsets[0]
6691 ? "Overwriting end of array!\n" : "OK",
6692 (UV)(place - RExC_emit_start),
6693 (UV)(RExC_parse - RExC_start),
6694 (UV)RExC_offsets[0]));
6695 Set_Node_Offset(place, RExC_parse);
6696 Set_Node_Length(place, 1);
6698 src = NEXTOPER(place);
6699 FILL_ADVANCE_NODE(place, op);
6700 Zero(src, offset, regnode);
6704 - regtail - set the next-pointer at the end of a node chain of p to val.
6705 - SEE ALSO: regtail_study
6707 /* TODO: All three parms should be const */
6709 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6712 register regnode *scan;
6713 GET_RE_DEBUG_FLAGS_DECL;
6715 PERL_UNUSED_ARG(depth);
6721 /* Find last node. */
6724 regnode * const temp = regnext(scan);
6726 SV * const mysv=sv_newmortal();
6727 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6728 regprop(RExC_rx, mysv, scan);
6729 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6730 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6737 if (reg_off_by_arg[OP(scan)]) {
6738 ARG_SET(scan, val - scan);
6741 NEXT_OFF(scan) = val - scan;
6747 - regtail_study - set the next-pointer at the end of a node chain of p to val.
6748 - Look for optimizable sequences at the same time.
6749 - currently only looks for EXACT chains.
6751 This is expermental code. The idea is to use this routine to perform
6752 in place optimizations on branches and groups as they are constructed,
6753 with the long term intention of removing optimization from study_chunk so
6754 that it is purely analytical.
6756 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6757 to control which is which.
6760 /* TODO: All four parms should be const */
6763 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6766 register regnode *scan;
6768 #ifdef EXPERIMENTAL_INPLACESCAN
6772 GET_RE_DEBUG_FLAGS_DECL;
6778 /* Find last node. */
6782 regnode * const temp = regnext(scan);
6783 #ifdef EXPERIMENTAL_INPLACESCAN
6784 if (PL_regkind[OP(scan)] == EXACT)
6785 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6793 if( exact == PSEUDO )
6795 else if ( exact != OP(scan) )
6804 SV * const mysv=sv_newmortal();
6805 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6806 regprop(RExC_rx, mysv, scan);
6807 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6808 SvPV_nolen_const(mysv),
6810 REG_NODE_NUM(scan));
6817 SV * const mysv_val=sv_newmortal();
6818 DEBUG_PARSE_MSG("");
6819 regprop(RExC_rx, mysv_val, val);
6820 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6821 SvPV_nolen_const(mysv_val),
6826 if (reg_off_by_arg[OP(scan)]) {
6827 ARG_SET(scan, val - scan);
6830 NEXT_OFF(scan) = val - scan;
6838 - regcurly - a little FSA that accepts {\d+,?\d*}
6841 S_regcurly(register const char *s)
6860 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
6863 Perl_regdump(pTHX_ const regexp *r)
6867 SV * const sv = sv_newmortal();
6868 SV *dsv= sv_newmortal();
6870 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
6872 /* Header fields of interest. */
6873 if (r->anchored_substr) {
6874 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6875 RE_SV_DUMPLEN(r->anchored_substr), 30);
6876 PerlIO_printf(Perl_debug_log,
6877 "anchored %s%s at %"IVdf" ",
6878 s, RE_SV_TAIL(r->anchored_substr),
6879 (IV)r->anchored_offset);
6880 } else if (r->anchored_utf8) {
6881 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6882 RE_SV_DUMPLEN(r->anchored_utf8), 30);
6883 PerlIO_printf(Perl_debug_log,
6884 "anchored utf8 %s%s at %"IVdf" ",
6885 s, RE_SV_TAIL(r->anchored_utf8),
6886 (IV)r->anchored_offset);
6888 if (r->float_substr) {
6889 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6890 RE_SV_DUMPLEN(r->float_substr), 30);
6891 PerlIO_printf(Perl_debug_log,
6892 "floating %s%s at %"IVdf"..%"UVuf" ",
6893 s, RE_SV_TAIL(r->float_substr),
6894 (IV)r->float_min_offset, (UV)r->float_max_offset);
6895 } else if (r->float_utf8) {
6896 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6897 RE_SV_DUMPLEN(r->float_utf8), 30);
6898 PerlIO_printf(Perl_debug_log,
6899 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6900 s, RE_SV_TAIL(r->float_utf8),
6901 (IV)r->float_min_offset, (UV)r->float_max_offset);
6903 if (r->check_substr || r->check_utf8)
6904 PerlIO_printf(Perl_debug_log,
6906 (r->check_substr == r->float_substr
6907 && r->check_utf8 == r->float_utf8
6908 ? "(checking floating" : "(checking anchored"));
6909 if (r->reganch & ROPT_NOSCAN)
6910 PerlIO_printf(Perl_debug_log, " noscan");
6911 if (r->reganch & ROPT_CHECK_ALL)
6912 PerlIO_printf(Perl_debug_log, " isall");
6913 if (r->check_substr || r->check_utf8)
6914 PerlIO_printf(Perl_debug_log, ") ");
6916 if (r->regstclass) {
6917 regprop(r, sv, r->regstclass);
6918 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
6920 if (r->reganch & ROPT_ANCH) {
6921 PerlIO_printf(Perl_debug_log, "anchored");
6922 if (r->reganch & ROPT_ANCH_BOL)
6923 PerlIO_printf(Perl_debug_log, "(BOL)");
6924 if (r->reganch & ROPT_ANCH_MBOL)
6925 PerlIO_printf(Perl_debug_log, "(MBOL)");
6926 if (r->reganch & ROPT_ANCH_SBOL)
6927 PerlIO_printf(Perl_debug_log, "(SBOL)");
6928 if (r->reganch & ROPT_ANCH_GPOS)
6929 PerlIO_printf(Perl_debug_log, "(GPOS)");
6930 PerlIO_putc(Perl_debug_log, ' ');
6932 if (r->reganch & ROPT_GPOS_SEEN)
6933 PerlIO_printf(Perl_debug_log, "GPOS ");
6934 if (r->reganch & ROPT_SKIP)
6935 PerlIO_printf(Perl_debug_log, "plus ");
6936 if (r->reganch & ROPT_IMPLICIT)
6937 PerlIO_printf(Perl_debug_log, "implicit ");
6938 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
6939 if (r->reganch & ROPT_EVAL_SEEN)
6940 PerlIO_printf(Perl_debug_log, "with eval ");
6941 PerlIO_printf(Perl_debug_log, "\n");
6943 PERL_UNUSED_CONTEXT;
6945 #endif /* DEBUGGING */
6949 - regprop - printable representation of opcode
6952 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
6957 GET_RE_DEBUG_FLAGS_DECL;
6959 sv_setpvn(sv, "", 0);
6960 if (OP(o) >= reg_num) /* regnode.type is unsigned */
6961 /* It would be nice to FAIL() here, but this may be called from
6962 regexec.c, and it would be hard to supply pRExC_state. */
6963 Perl_croak(aTHX_ "Corrupted regexp opcode");
6964 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
6966 k = PL_regkind[OP(o)];
6969 SV * const dsv = sv_2mortal(newSVpvs(""));
6970 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6971 * is a crude hack but it may be the best for now since
6972 * we have no flag "this EXACTish node was UTF-8"
6974 const char * const s =
6975 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
6976 PL_colors[0], PL_colors[1],
6977 PERL_PV_ESCAPE_UNI_DETECT |
6978 PERL_PV_PRETTY_ELIPSES |
6981 Perl_sv_catpvf(aTHX_ sv, " %s", s );
6982 } else if (k == TRIE) {
6983 /* print the details of the trie in dumpuntil instead, as
6984 * prog->data isn't available here */
6985 const char op = OP(o);
6986 const I32 n = ARG(o);
6987 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
6988 (reg_ac_data *)prog->data->data[n] :
6990 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
6991 (reg_trie_data*)prog->data->data[n] :
6994 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6995 DEBUG_TRIE_COMPILE_r(
6996 Perl_sv_catpvf(aTHX_ sv,
6997 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
6998 (UV)trie->startstate,
6999 (IV)trie->laststate-1,
7000 (UV)trie->wordcount,
7003 (UV)TRIE_CHARCOUNT(trie),
7004 (UV)trie->uniquecharcount
7007 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7009 int rangestart = -1;
7010 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
7011 Perl_sv_catpvf(aTHX_ sv, "[");
7012 for (i = 0; i <= 256; i++) {
7013 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7014 if (rangestart == -1)
7016 } else if (rangestart != -1) {
7017 if (i <= rangestart + 3)
7018 for (; rangestart < i; rangestart++)
7019 put_byte(sv, rangestart);
7021 put_byte(sv, rangestart);
7023 put_byte(sv, i - 1);
7028 Perl_sv_catpvf(aTHX_ sv, "]");
7031 } else if (k == CURLY) {
7032 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
7033 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7034 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
7036 else if (k == WHILEM && o->flags) /* Ordinal/of */
7037 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
7038 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
7039 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
7040 else if (k == LOGICAL)
7041 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
7042 else if (k == ANYOF) {
7043 int i, rangestart = -1;
7044 const U8 flags = ANYOF_FLAGS(o);
7046 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7047 static const char * const anyofs[] = {
7080 if (flags & ANYOF_LOCALE)
7081 sv_catpvs(sv, "{loc}");
7082 if (flags & ANYOF_FOLD)
7083 sv_catpvs(sv, "{i}");
7084 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
7085 if (flags & ANYOF_INVERT)
7087 for (i = 0; i <= 256; i++) {
7088 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
7089 if (rangestart == -1)
7091 } else if (rangestart != -1) {
7092 if (i <= rangestart + 3)
7093 for (; rangestart < i; rangestart++)
7094 put_byte(sv, rangestart);
7096 put_byte(sv, rangestart);
7098 put_byte(sv, i - 1);
7104 if (o->flags & ANYOF_CLASS)
7105 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
7106 if (ANYOF_CLASS_TEST(o,i))
7107 sv_catpv(sv, anyofs[i]);
7109 if (flags & ANYOF_UNICODE)
7110 sv_catpvs(sv, "{unicode}");
7111 else if (flags & ANYOF_UNICODE_ALL)
7112 sv_catpvs(sv, "{unicode_all}");
7116 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
7120 U8 s[UTF8_MAXBYTES_CASE+1];
7122 for (i = 0; i <= 256; i++) { /* just the first 256 */
7123 uvchr_to_utf8(s, i);
7125 if (i < 256 && swash_fetch(sw, s, TRUE)) {
7126 if (rangestart == -1)
7128 } else if (rangestart != -1) {
7129 if (i <= rangestart + 3)
7130 for (; rangestart < i; rangestart++) {
7131 const U8 * const e = uvchr_to_utf8(s,rangestart);
7133 for(p = s; p < e; p++)
7137 const U8 *e = uvchr_to_utf8(s,rangestart);
7139 for (p = s; p < e; p++)
7142 e = uvchr_to_utf8(s, i-1);
7143 for (p = s; p < e; p++)
7150 sv_catpvs(sv, "..."); /* et cetera */
7154 char *s = savesvpv(lv);
7155 char * const origs = s;
7157 while (*s && *s != '\n')
7161 const char * const t = ++s;
7179 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
7181 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
7182 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
7184 PERL_UNUSED_CONTEXT;
7185 PERL_UNUSED_ARG(sv);
7187 PERL_UNUSED_ARG(prog);
7188 #endif /* DEBUGGING */
7192 Perl_re_intuit_string(pTHX_ regexp *prog)
7193 { /* Assume that RE_INTUIT is set */
7195 GET_RE_DEBUG_FLAGS_DECL;
7196 PERL_UNUSED_CONTEXT;
7200 const char * const s = SvPV_nolen_const(prog->check_substr
7201 ? prog->check_substr : prog->check_utf8);
7203 if (!PL_colorset) reginitcolors();
7204 PerlIO_printf(Perl_debug_log,
7205 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
7207 prog->check_substr ? "" : "utf8 ",
7208 PL_colors[5],PL_colors[0],
7211 (strlen(s) > 60 ? "..." : ""));
7214 return prog->check_substr ? prog->check_substr : prog->check_utf8;
7218 Perl_pregfree(pTHX_ struct regexp *r)
7222 GET_RE_DEBUG_FLAGS_DECL;
7224 if (!r || (--r->refcnt > 0))
7230 SV *dsv= sv_newmortal();
7231 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
7232 dsv, r->precomp, r->prelen, 60);
7233 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
7234 PL_colors[4],PL_colors[5],s);
7238 /* gcov results gave these as non-null 100% of the time, so there's no
7239 optimisation in checking them before calling Safefree */
7240 Safefree(r->precomp);
7241 Safefree(r->offsets); /* 20010421 MJD */
7242 RX_MATCH_COPY_FREE(r);
7243 #ifdef PERL_OLD_COPY_ON_WRITE
7245 SvREFCNT_dec(r->saved_copy);
7248 if (r->anchored_substr)
7249 SvREFCNT_dec(r->anchored_substr);
7250 if (r->anchored_utf8)
7251 SvREFCNT_dec(r->anchored_utf8);
7252 if (r->float_substr)
7253 SvREFCNT_dec(r->float_substr);
7255 SvREFCNT_dec(r->float_utf8);
7256 Safefree(r->substrs);
7259 int n = r->data->count;
7260 PAD* new_comppad = NULL;
7265 /* If you add a ->what type here, update the comment in regcomp.h */
7266 switch (r->data->what[n]) {
7268 SvREFCNT_dec((SV*)r->data->data[n]);
7271 Safefree(r->data->data[n]);
7274 new_comppad = (AV*)r->data->data[n];
7277 if (new_comppad == NULL)
7278 Perl_croak(aTHX_ "panic: pregfree comppad");
7279 PAD_SAVE_LOCAL(old_comppad,
7280 /* Watch out for global destruction's random ordering. */
7281 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
7284 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
7287 op_free((OP_4tree*)r->data->data[n]);
7289 PAD_RESTORE_LOCAL(old_comppad);
7290 SvREFCNT_dec((SV*)new_comppad);
7296 { /* Aho Corasick add-on structure for a trie node.
7297 Used in stclass optimization only */
7299 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
7301 refcount = --aho->refcount;
7304 Safefree(aho->states);
7305 Safefree(aho->fail);
7306 aho->trie=NULL; /* not necessary to free this as it is
7307 handled by the 't' case */
7308 Safefree(r->data->data[n]); /* do this last!!!! */
7309 Safefree(r->regstclass);
7315 /* trie structure. */
7317 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
7319 refcount = --trie->refcount;
7322 Safefree(trie->charmap);
7323 if (trie->widecharmap)
7324 SvREFCNT_dec((SV*)trie->widecharmap);
7325 Safefree(trie->states);
7326 Safefree(trie->trans);
7328 Safefree(trie->bitmap);
7330 Safefree(trie->wordlen);
7332 Safefree(trie->jump);
7334 Safefree(trie->nextword);
7338 SvREFCNT_dec((SV*)trie->words);
7339 if (trie->revcharmap)
7340 SvREFCNT_dec((SV*)trie->revcharmap);
7343 Safefree(r->data->data[n]); /* do this last!!!! */
7348 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
7351 Safefree(r->data->what);
7354 Safefree(r->startp);
7359 #ifndef PERL_IN_XSUB_RE
7361 - regnext - dig the "next" pointer out of a node
7364 Perl_regnext(pTHX_ register regnode *p)
7367 register I32 offset;
7369 if (p == &PL_regdummy)
7372 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7381 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
7384 STRLEN l1 = strlen(pat1);
7385 STRLEN l2 = strlen(pat2);
7388 const char *message;
7394 Copy(pat1, buf, l1 , char);
7395 Copy(pat2, buf + l1, l2 , char);
7396 buf[l1 + l2] = '\n';
7397 buf[l1 + l2 + 1] = '\0';
7399 /* ANSI variant takes additional second argument */
7400 va_start(args, pat2);
7404 msv = vmess(buf, &args);
7406 message = SvPV_const(msv,l1);
7409 Copy(message, buf, l1 , char);
7410 buf[l1-1] = '\0'; /* Overwrite \n */
7411 Perl_croak(aTHX_ "%s", buf);
7414 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7416 #ifndef PERL_IN_XSUB_RE
7418 Perl_save_re_context(pTHX)
7422 struct re_save_state *state;
7424 SAVEVPTR(PL_curcop);
7425 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
7427 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
7428 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
7429 SSPUSHINT(SAVEt_RE_STATE);
7431 Copy(&PL_reg_state, state, 1, struct re_save_state);
7433 PL_reg_start_tmp = 0;
7434 PL_reg_start_tmpl = 0;
7435 PL_reg_oldsaved = NULL;
7436 PL_reg_oldsavedlen = 0;
7438 PL_reg_leftiter = 0;
7439 PL_reg_poscache = NULL;
7440 PL_reg_poscache_size = 0;
7441 #ifdef PERL_OLD_COPY_ON_WRITE
7445 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
7447 const REGEXP * const rx = PM_GETRE(PL_curpm);
7450 for (i = 1; i <= rx->nparens; i++) {
7451 char digits[TYPE_CHARS(long)];
7452 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
7453 GV *const *const gvp
7454 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
7457 GV * const gv = *gvp;
7458 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
7468 clear_re(pTHX_ void *r)
7471 ReREFCNT_dec((regexp *)r);
7477 S_put_byte(pTHX_ SV *sv, int c)
7479 if (isCNTRL(c) || c == 255 || !isPRINT(c))
7480 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
7481 else if (c == '-' || c == ']' || c == '\\' || c == '^')
7482 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
7484 Perl_sv_catpvf(aTHX_ sv, "%c", c);
7488 #define CLEAR_OPTSTART \
7489 if (optstart) STMT_START { \
7490 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
7494 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
7496 STATIC const regnode *
7497 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
7498 const regnode *last, const regnode *plast,
7499 SV* sv, I32 indent, U32 depth)
7502 register U8 op = PSEUDO; /* Arbitrary non-END op. */
7503 register const regnode *next;
7504 const regnode *optstart= NULL;
7505 GET_RE_DEBUG_FLAGS_DECL;
7507 #ifdef DEBUG_DUMPUNTIL
7508 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
7509 last ? last-start : 0,plast ? plast-start : 0);
7512 if (plast && plast < last)
7515 while (PL_regkind[op] != END && (!last || node < last)) {
7516 /* While that wasn't END last time... */
7522 next = regnext((regnode *)node);
7525 if (OP(node) == OPTIMIZED) {
7526 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
7533 regprop(r, sv, node);
7534 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
7535 (int)(2*indent + 1), "", SvPVX_const(sv));
7537 if (OP(node) != OPTIMIZED) {
7538 if (next == NULL) /* Next ptr. */
7539 PerlIO_printf(Perl_debug_log, "(0)");
7540 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
7541 PerlIO_printf(Perl_debug_log, "(FAIL)");
7543 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
7545 /*if (PL_regkind[(U8)op] != TRIE)*/
7546 (void)PerlIO_putc(Perl_debug_log, '\n');
7550 if (PL_regkind[(U8)op] == BRANCHJ) {
7553 register const regnode *nnode = (OP(next) == LONGJMP
7554 ? regnext((regnode *)next)
7556 if (last && nnode > last)
7558 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
7561 else if (PL_regkind[(U8)op] == BRANCH) {
7563 DUMPUNTIL(NEXTOPER(node), next);
7565 else if ( PL_regkind[(U8)op] == TRIE ) {
7566 const char op = OP(node);
7567 const I32 n = ARG(node);
7568 const reg_ac_data * const ac = op>=AHOCORASICK ?
7569 (reg_ac_data *)r->data->data[n] :
7571 const reg_trie_data * const trie = op<AHOCORASICK ?
7572 (reg_trie_data*)r->data->data[n] :
7574 const regnode *nextbranch= NULL;
7576 sv_setpvn(sv, "", 0);
7577 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
7578 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
7580 PerlIO_printf(Perl_debug_log, "%*s%s ",
7581 (int)(2*(indent+3)), "",
7582 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
7583 PL_colors[0], PL_colors[1],
7584 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7585 PERL_PV_PRETTY_ELIPSES |
7591 U16 dist= trie->jump[word_idx+1];
7592 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
7595 nextbranch= next - trie->jump[0];
7596 DUMPUNTIL(next - dist, nextbranch);
7598 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
7599 nextbranch= regnext((regnode *)nextbranch);
7601 PerlIO_printf(Perl_debug_log, "\n");
7604 if (last && next > last)
7609 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
7610 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
7611 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
7613 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
7615 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
7617 else if ( op == PLUS || op == STAR) {
7618 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
7620 else if (op == ANYOF) {
7621 /* arglen 1 + class block */
7622 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7623 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7624 node = NEXTOPER(node);
7626 else if (PL_regkind[(U8)op] == EXACT) {
7627 /* Literal string, where present. */
7628 node += NODE_SZ_STR(node) - 1;
7629 node = NEXTOPER(node);
7632 node = NEXTOPER(node);
7633 node += regarglen[(U8)op];
7635 if (op == CURLYX || op == OPEN)
7637 else if (op == WHILEM)
7641 #ifdef DEBUG_DUMPUNTIL
7642 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
7647 #endif /* DEBUGGING */
7651 * c-indentation-style: bsd
7653 * indent-tabs-mode: t
7656 * ex: set ts=8 sts=4 sw=4 noet: