5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
57 **** Alterations to Henry's code are...
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
71 #define PERL_IN_REGCOMP_C
74 #ifndef PERL_IN_XSUB_RE
79 #ifdef PERL_IN_XSUB_RE
90 # if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92 # pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94 # pragma optimize("w",on )
95 # endif /* BUGGY_MSC6 */
102 typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
116 I32 size; /* Code size. */
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
119 I32 nestroot; /* root parens we are in - used by accept */
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
127 HV *charnames; /* cache of named sequences */
128 HV *paren_names; /* Paren names */
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
133 char *starttry; /* -Dr: where regtry was called. */
134 #define RExC_starttry (pRExC_state->starttry)
137 const char *lastparse;
139 AV *paren_name_list; /* idx -> name */
140 #define RExC_lastparse (pRExC_state->lastparse)
141 #define RExC_lastnum (pRExC_state->lastnum)
142 #define RExC_paren_name_list (pRExC_state->paren_name_list)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_rxi (pRExC_state->rxi)
150 #define RExC_start (pRExC_state->start)
151 #define RExC_end (pRExC_state->end)
152 #define RExC_parse (pRExC_state->parse)
153 #define RExC_whilem_seen (pRExC_state->whilem_seen)
154 #define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
155 #define RExC_emit (pRExC_state->emit)
156 #define RExC_emit_start (pRExC_state->emit_start)
157 #define RExC_naughty (pRExC_state->naughty)
158 #define RExC_sawback (pRExC_state->sawback)
159 #define RExC_seen (pRExC_state->seen)
160 #define RExC_size (pRExC_state->size)
161 #define RExC_npar (pRExC_state->npar)
162 #define RExC_nestroot (pRExC_state->nestroot)
163 #define RExC_extralen (pRExC_state->extralen)
164 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
165 #define RExC_seen_evals (pRExC_state->seen_evals)
166 #define RExC_utf8 (pRExC_state->utf8)
167 #define RExC_charnames (pRExC_state->charnames)
168 #define RExC_open_parens (pRExC_state->open_parens)
169 #define RExC_close_parens (pRExC_state->close_parens)
170 #define RExC_opend (pRExC_state->opend)
171 #define RExC_paren_names (pRExC_state->paren_names)
172 #define RExC_recurse (pRExC_state->recurse)
173 #define RExC_recurse_count (pRExC_state->recurse_count)
176 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
177 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
178 ((*s) == '{' && regcurly(s)))
181 #undef SPSTART /* dratted cpp namespace... */
184 * Flags to be passed up and down.
186 #define WORST 0 /* Worst case. */
187 #define HASWIDTH 0x1 /* Known to match non-null strings. */
188 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
189 #define SPSTART 0x4 /* Starts with * or +. */
190 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
192 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
194 /* whether trie related optimizations are enabled */
195 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
196 #define TRIE_STUDY_OPT
197 #define FULL_TRIE_STUDY
203 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
204 #define PBITVAL(paren) (1 << ((paren) & 7))
205 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
206 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
207 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
210 /* About scan_data_t.
212 During optimisation we recurse through the regexp program performing
213 various inplace (keyhole style) optimisations. In addition study_chunk
214 and scan_commit populate this data structure with information about
215 what strings MUST appear in the pattern. We look for the longest
216 string that must appear for at a fixed location, and we look for the
217 longest string that may appear at a floating location. So for instance
222 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
223 strings (because they follow a .* construct). study_chunk will identify
224 both FOO and BAR as being the longest fixed and floating strings respectively.
226 The strings can be composites, for instance
230 will result in a composite fixed substring 'foo'.
232 For each string some basic information is maintained:
234 - offset or min_offset
235 This is the position the string must appear at, or not before.
236 It also implicitly (when combined with minlenp) tells us how many
237 character must match before the string we are searching.
238 Likewise when combined with minlenp and the length of the string
239 tells us how many characters must appear after the string we have
243 Only used for floating strings. This is the rightmost point that
244 the string can appear at. Ifset to I32 max it indicates that the
245 string can occur infinitely far to the right.
248 A pointer to the minimum length of the pattern that the string
249 was found inside. This is important as in the case of positive
250 lookahead or positive lookbehind we can have multiple patterns
255 The minimum length of the pattern overall is 3, the minimum length
256 of the lookahead part is 3, but the minimum length of the part that
257 will actually match is 1. So 'FOO's minimum length is 3, but the
258 minimum length for the F is 1. This is important as the minimum length
259 is used to determine offsets in front of and behind the string being
260 looked for. Since strings can be composites this is the length of the
261 pattern at the time it was commited with a scan_commit. Note that
262 the length is calculated by study_chunk, so that the minimum lengths
263 are not known until the full pattern has been compiled, thus the
264 pointer to the value.
268 In the case of lookbehind the string being searched for can be
269 offset past the start point of the final matching string.
270 If this value was just blithely removed from the min_offset it would
271 invalidate some of the calculations for how many chars must match
272 before or after (as they are derived from min_offset and minlen and
273 the length of the string being searched for).
274 When the final pattern is compiled and the data is moved from the
275 scan_data_t structure into the regexp structure the information
276 about lookbehind is factored in, with the information that would
277 have been lost precalculated in the end_shift field for the
280 The fields pos_min and pos_delta are used to store the minimum offset
281 and the delta to the maximum offset at the current point in the pattern.
285 typedef struct scan_data_t {
286 /*I32 len_min; unused */
287 /*I32 len_delta; unused */
291 I32 last_end; /* min value, <0 unless valid. */
294 SV **longest; /* Either &l_fixed, or &l_float. */
295 SV *longest_fixed; /* longest fixed string found in pattern */
296 I32 offset_fixed; /* offset where it starts */
297 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
299 SV *longest_float; /* longest floating string found in pattern */
300 I32 offset_float_min; /* earliest point in string it can appear */
301 I32 offset_float_max; /* latest point in string it can appear */
302 I32 *minlen_float; /* pointer to the minlen relevent to the string */
303 I32 lookbehind_float; /* is the position of the string modified by LB */
307 struct regnode_charclass_class *start_class;
311 * Forward declarations for pregcomp()'s friends.
314 static const scan_data_t zero_scan_data =
315 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
317 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
318 #define SF_BEFORE_SEOL 0x0001
319 #define SF_BEFORE_MEOL 0x0002
320 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
321 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
324 # define SF_FIX_SHIFT_EOL (0+2)
325 # define SF_FL_SHIFT_EOL (0+4)
327 # define SF_FIX_SHIFT_EOL (+2)
328 # define SF_FL_SHIFT_EOL (+4)
331 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
332 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
334 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
335 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
336 #define SF_IS_INF 0x0040
337 #define SF_HAS_PAR 0x0080
338 #define SF_IN_PAR 0x0100
339 #define SF_HAS_EVAL 0x0200
340 #define SCF_DO_SUBSTR 0x0400
341 #define SCF_DO_STCLASS_AND 0x0800
342 #define SCF_DO_STCLASS_OR 0x1000
343 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
344 #define SCF_WHILEM_VISITED_POS 0x2000
346 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
347 #define SCF_SEEN_ACCEPT 0x8000
349 #define UTF (RExC_utf8 != 0)
350 #define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
351 #define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
353 #define OOB_UNICODE 12345678
354 #define OOB_NAMEDCLASS -1
356 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
357 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
360 /* length of regex to show in messages that don't mark a position within */
361 #define RegexLengthToShowInErrorMessages 127
364 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
365 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
366 * op/pragma/warn/regcomp.
368 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
369 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
371 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
374 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
375 * arg. Show regex, up to a maximum length. If it's too long, chop and add
378 #define _FAIL(code) STMT_START { \
379 const char *ellipses = ""; \
380 IV len = RExC_end - RExC_precomp; \
383 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
384 if (len > RegexLengthToShowInErrorMessages) { \
385 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
386 len = RegexLengthToShowInErrorMessages - 10; \
392 #define FAIL(msg) _FAIL( \
393 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
394 msg, (int)len, RExC_precomp, ellipses))
396 #define FAIL2(msg,arg) _FAIL( \
397 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
398 arg, (int)len, RExC_precomp, ellipses))
401 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
403 #define Simple_vFAIL(m) STMT_START { \
404 const IV offset = RExC_parse - RExC_precomp; \
405 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
406 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
410 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
412 #define vFAIL(m) STMT_START { \
414 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 * Like Simple_vFAIL(), but accepts two arguments.
421 #define Simple_vFAIL2(m,a1) STMT_START { \
422 const IV offset = RExC_parse - RExC_precomp; \
423 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
424 (int)offset, RExC_precomp, RExC_precomp + offset); \
428 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
430 #define vFAIL2(m,a1) STMT_START { \
432 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
433 Simple_vFAIL2(m, a1); \
438 * Like Simple_vFAIL(), but accepts three arguments.
440 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
441 const IV offset = RExC_parse - RExC_precomp; \
442 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
443 (int)offset, RExC_precomp, RExC_precomp + offset); \
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
449 #define vFAIL3(m,a1,a2) STMT_START { \
451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
452 Simple_vFAIL3(m, a1, a2); \
456 * Like Simple_vFAIL(), but accepts four arguments.
458 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
459 const IV offset = RExC_parse - RExC_precomp; \
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
464 #define vWARN(loc,m) STMT_START { \
465 const IV offset = loc - RExC_precomp; \
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
467 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470 #define vWARNdep(loc,m) STMT_START { \
471 const IV offset = loc - RExC_precomp; \
472 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
473 "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 #define vWARN2(loc, m, a1) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
484 #define vWARN3(loc, m, a1, a2) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
490 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
491 const IV offset = loc - RExC_precomp; \
492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
493 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
496 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
497 const IV offset = loc - RExC_precomp; \
498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
499 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
503 /* Allow for side effects in s */
504 #define REGC(c,s) STMT_START { \
505 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
508 /* Macros for recording node offsets. 20001227 mjd@plover.com
509 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
510 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
511 * Element 0 holds the number n.
512 * Position is 1 indexed.
515 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
517 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
518 __LINE__, (int)(node), (int)(byte))); \
520 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
522 RExC_offsets[2*(node)-1] = (byte); \
527 #define Set_Node_Offset(node,byte) \
528 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
529 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
531 #define Set_Node_Length_To_R(node,len) STMT_START { \
533 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
534 __LINE__, (int)(node), (int)(len))); \
536 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
538 RExC_offsets[2*(node)] = (len); \
543 #define Set_Node_Length(node,len) \
544 Set_Node_Length_To_R((node)-RExC_emit_start, len)
545 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
546 #define Set_Node_Cur_Length(node) \
547 Set_Node_Length(node, RExC_parse - parse_start)
549 /* Get offsets and lengths */
550 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
551 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
553 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
554 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
555 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
559 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
560 #define EXPERIMENTAL_INPLACESCAN
563 #define DEBUG_STUDYDATA(str,data,depth) \
564 DEBUG_OPTIMISE_MORE_r(if(data){ \
565 PerlIO_printf(Perl_debug_log, \
566 "%*s" str "Pos:%"IVdf"/%"IVdf \
567 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
568 (int)(depth)*2, "", \
569 (IV)((data)->pos_min), \
570 (IV)((data)->pos_delta), \
571 (UV)((data)->flags), \
572 (IV)((data)->whilem_c), \
573 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
574 is_inf ? "INF " : "" \
576 if ((data)->last_found) \
577 PerlIO_printf(Perl_debug_log, \
578 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
579 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
580 SvPVX_const((data)->last_found), \
581 (IV)((data)->last_end), \
582 (IV)((data)->last_start_min), \
583 (IV)((data)->last_start_max), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
586 SvPVX_const((data)->longest_fixed), \
587 (IV)((data)->offset_fixed), \
588 ((data)->longest && \
589 (data)->longest==&((data)->longest_float)) ? "*" : "", \
590 SvPVX_const((data)->longest_float), \
591 (IV)((data)->offset_float_min), \
592 (IV)((data)->offset_float_max) \
594 PerlIO_printf(Perl_debug_log,"\n"); \
597 static void clear_re(pTHX_ void *r);
599 /* Mark that we cannot extend a found fixed substring at this point.
600 Update the longest found anchored substring and the longest found
601 floating substrings if needed. */
604 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
606 const STRLEN l = CHR_SVLEN(data->last_found);
607 const STRLEN old_l = CHR_SVLEN(*data->longest);
608 GET_RE_DEBUG_FLAGS_DECL;
610 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
611 SvSetMagicSV(*data->longest, data->last_found);
612 if (*data->longest == data->longest_fixed) {
613 data->offset_fixed = l ? data->last_start_min : data->pos_min;
614 if (data->flags & SF_BEFORE_EOL)
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
618 data->flags &= ~SF_FIX_BEFORE_EOL;
619 data->minlen_fixed=minlenp;
620 data->lookbehind_fixed=0;
622 else { /* *data->longest == data->longest_float */
623 data->offset_float_min = l ? data->last_start_min : data->pos_min;
624 data->offset_float_max = (l
625 ? data->last_start_max
626 : data->pos_min + data->pos_delta);
627 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
628 data->offset_float_max = I32_MAX;
629 if (data->flags & SF_BEFORE_EOL)
631 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
633 data->flags &= ~SF_FL_BEFORE_EOL;
634 data->minlen_float=minlenp;
635 data->lookbehind_float=0;
638 SvCUR_set(data->last_found, 0);
640 SV * const sv = data->last_found;
641 if (SvUTF8(sv) && SvMAGICAL(sv)) {
642 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
648 data->flags &= ~SF_BEFORE_EOL;
649 DEBUG_STUDYDATA("cl_anything: ",data,0);
652 /* Can match anything (initialization) */
654 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
656 ANYOF_CLASS_ZERO(cl);
657 ANYOF_BITMAP_SETALL(cl);
658 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
660 cl->flags |= ANYOF_LOCALE;
663 /* Can match anything (initialization) */
665 S_cl_is_anything(const struct regnode_charclass_class *cl)
669 for (value = 0; value <= ANYOF_MAX; value += 2)
670 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
672 if (!(cl->flags & ANYOF_UNICODE_ALL))
674 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
679 /* Can match anything (initialization) */
681 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
683 Zero(cl, 1, struct regnode_charclass_class);
685 cl_anything(pRExC_state, cl);
689 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
691 Zero(cl, 1, struct regnode_charclass_class);
693 cl_anything(pRExC_state, cl);
695 cl->flags |= ANYOF_LOCALE;
698 /* 'And' a given class with another one. Can create false positives */
699 /* We assume that cl is not inverted */
701 S_cl_and(struct regnode_charclass_class *cl,
702 const struct regnode_charclass_class *and_with)
705 assert(and_with->type == ANYOF);
706 if (!(and_with->flags & ANYOF_CLASS)
707 && !(cl->flags & ANYOF_CLASS)
708 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
709 && !(and_with->flags & ANYOF_FOLD)
710 && !(cl->flags & ANYOF_FOLD)) {
713 if (and_with->flags & ANYOF_INVERT)
714 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
715 cl->bitmap[i] &= ~and_with->bitmap[i];
717 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
718 cl->bitmap[i] &= and_with->bitmap[i];
719 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
720 if (!(and_with->flags & ANYOF_EOS))
721 cl->flags &= ~ANYOF_EOS;
723 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
724 !(and_with->flags & ANYOF_INVERT)) {
725 cl->flags &= ~ANYOF_UNICODE_ALL;
726 cl->flags |= ANYOF_UNICODE;
727 ARG_SET(cl, ARG(and_with));
729 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
730 !(and_with->flags & ANYOF_INVERT))
731 cl->flags &= ~ANYOF_UNICODE_ALL;
732 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
733 !(and_with->flags & ANYOF_INVERT))
734 cl->flags &= ~ANYOF_UNICODE;
737 /* 'OR' a given class with another one. Can create false positives */
738 /* We assume that cl is not inverted */
740 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
742 if (or_with->flags & ANYOF_INVERT) {
744 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
745 * <= (B1 | !B2) | (CL1 | !CL2)
746 * which is wasteful if CL2 is small, but we ignore CL2:
747 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
748 * XXXX Can we handle case-fold? Unclear:
749 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
750 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
753 && !(or_with->flags & ANYOF_FOLD)
754 && !(cl->flags & ANYOF_FOLD) ) {
757 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
758 cl->bitmap[i] |= ~or_with->bitmap[i];
759 } /* XXXX: logic is complicated otherwise */
761 cl_anything(pRExC_state, cl);
764 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
765 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
766 && (!(or_with->flags & ANYOF_FOLD)
767 || (cl->flags & ANYOF_FOLD)) ) {
770 /* OR char bitmap and class bitmap separately */
771 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
772 cl->bitmap[i] |= or_with->bitmap[i];
773 if (or_with->flags & ANYOF_CLASS) {
774 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
775 cl->classflags[i] |= or_with->classflags[i];
776 cl->flags |= ANYOF_CLASS;
779 else { /* XXXX: logic is complicated, leave it along for a moment. */
780 cl_anything(pRExC_state, cl);
783 if (or_with->flags & ANYOF_EOS)
784 cl->flags |= ANYOF_EOS;
786 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
787 ARG(cl) != ARG(or_with)) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
791 if (or_with->flags & ANYOF_UNICODE_ALL) {
792 cl->flags |= ANYOF_UNICODE_ALL;
793 cl->flags &= ~ANYOF_UNICODE;
797 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
798 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
799 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
800 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
805 dump_trie(trie,widecharmap,revcharmap)
806 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
807 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
809 These routines dump out a trie in a somewhat readable format.
810 The _interim_ variants are used for debugging the interim
811 tables that are used to generate the final compressed
812 representation which is what dump_trie expects.
814 Part of the reason for their existance is to provide a form
815 of documentation as to how the different representations function.
820 Dumps the final compressed table form of the trie to Perl_debug_log.
821 Used for debugging make_trie().
825 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
826 AV *revcharmap, U32 depth)
829 SV *sv=sv_newmortal();
830 int colwidth= widecharmap ? 6 : 4;
831 GET_RE_DEBUG_FLAGS_DECL;
834 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
835 (int)depth * 2 + 2,"",
836 "Match","Base","Ofs" );
838 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
839 SV ** const tmp = av_fetch( revcharmap, state, 0);
841 PerlIO_printf( Perl_debug_log, "%*s",
843 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
844 PL_colors[0], PL_colors[1],
845 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
846 PERL_PV_ESCAPE_FIRSTCHAR
851 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
852 (int)depth * 2 + 2,"");
854 for( state = 0 ; state < trie->uniquecharcount ; state++ )
855 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
856 PerlIO_printf( Perl_debug_log, "\n");
858 for( state = 1 ; state < trie->statecount ; state++ ) {
859 const U32 base = trie->states[ state ].trans.base;
861 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
863 if ( trie->states[ state ].wordnum ) {
864 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
866 PerlIO_printf( Perl_debug_log, "%6s", "" );
869 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
874 while( ( base + ofs < trie->uniquecharcount ) ||
875 ( base + ofs - trie->uniquecharcount < trie->lasttrans
876 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
879 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
881 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
882 if ( ( base + ofs >= trie->uniquecharcount ) &&
883 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
884 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
886 PerlIO_printf( Perl_debug_log, "%*"UVXf,
888 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
890 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
894 PerlIO_printf( Perl_debug_log, "]");
897 PerlIO_printf( Perl_debug_log, "\n" );
901 Dumps a fully constructed but uncompressed trie in list form.
902 List tries normally only are used for construction when the number of
903 possible chars (trie->uniquecharcount) is very high.
904 Used for debugging make_trie().
907 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
908 HV *widecharmap, AV *revcharmap, U32 next_alloc,
912 SV *sv=sv_newmortal();
913 int colwidth= widecharmap ? 6 : 4;
914 GET_RE_DEBUG_FLAGS_DECL;
915 /* print out the table precompression. */
916 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
917 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
918 "------:-----+-----------------\n" );
920 for( state=1 ; state < next_alloc ; state ++ ) {
923 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
924 (int)depth * 2 + 2,"", (UV)state );
925 if ( ! trie->states[ state ].wordnum ) {
926 PerlIO_printf( Perl_debug_log, "%5s| ","");
928 PerlIO_printf( Perl_debug_log, "W%4x| ",
929 trie->states[ state ].wordnum
932 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
933 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
935 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
942 TRIE_LIST_ITEM(state,charid).forid,
943 (UV)TRIE_LIST_ITEM(state,charid).newstate
946 PerlIO_printf(Perl_debug_log, "\n%*s| ",
947 (int)((depth * 2) + 14), "");
950 PerlIO_printf( Perl_debug_log, "\n");
955 Dumps a fully constructed but uncompressed trie in table form.
956 This is the normal DFA style state transition table, with a few
957 twists to facilitate compression later.
958 Used for debugging make_trie().
961 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
962 HV *widecharmap, AV *revcharmap, U32 next_alloc,
967 SV *sv=sv_newmortal();
968 int colwidth= widecharmap ? 6 : 4;
969 GET_RE_DEBUG_FLAGS_DECL;
972 print out the table precompression so that we can do a visual check
973 that they are identical.
976 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
978 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
979 SV ** const tmp = av_fetch( revcharmap, charid, 0);
981 PerlIO_printf( Perl_debug_log, "%*s",
983 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
984 PL_colors[0], PL_colors[1],
985 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
986 PERL_PV_ESCAPE_FIRSTCHAR
992 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
994 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
995 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
998 PerlIO_printf( Perl_debug_log, "\n" );
1000 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1002 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1003 (int)depth * 2 + 2,"",
1004 (UV)TRIE_NODENUM( state ) );
1006 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1007 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1009 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1011 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1013 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1014 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1016 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1017 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1024 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1025 startbranch: the first branch in the whole branch sequence
1026 first : start branch of sequence of branch-exact nodes.
1027 May be the same as startbranch
1028 last : Thing following the last branch.
1029 May be the same as tail.
1030 tail : item following the branch sequence
1031 count : words in the sequence
1032 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1033 depth : indent depth
1035 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1037 A trie is an N'ary tree where the branches are determined by digital
1038 decomposition of the key. IE, at the root node you look up the 1st character and
1039 follow that branch repeat until you find the end of the branches. Nodes can be
1040 marked as "accepting" meaning they represent a complete word. Eg:
1044 would convert into the following structure. Numbers represent states, letters
1045 following numbers represent valid transitions on the letter from that state, if
1046 the number is in square brackets it represents an accepting state, otherwise it
1047 will be in parenthesis.
1049 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1053 (1) +-i->(6)-+-s->[7]
1055 +-s->(3)-+-h->(4)-+-e->[5]
1057 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1059 This shows that when matching against the string 'hers' we will begin at state 1
1060 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1061 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1062 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1063 single traverse. We store a mapping from accepting to state to which word was
1064 matched, and then when we have multiple possibilities we try to complete the
1065 rest of the regex in the order in which they occured in the alternation.
1067 The only prior NFA like behaviour that would be changed by the TRIE support is
1068 the silent ignoring of duplicate alternations which are of the form:
1070 / (DUPE|DUPE) X? (?{ ... }) Y /x
1072 Thus EVAL blocks follwing a trie may be called a different number of times with
1073 and without the optimisation. With the optimisations dupes will be silently
1074 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1075 the following demonstrates:
1077 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1079 which prints out 'word' three times, but
1081 'words'=~/(word|word|word)(?{ print $1 })S/
1083 which doesnt print it out at all. This is due to other optimisations kicking in.
1085 Example of what happens on a structural level:
1087 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1089 1: CURLYM[1] {1,32767}(18)
1100 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1101 and should turn into:
1103 1: CURLYM[1] {1,32767}(18)
1105 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1113 Cases where tail != last would be like /(?foo|bar)baz/:
1123 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1124 and would end up looking like:
1127 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1134 d = uvuni_to_utf8_flags(d, uv, 0);
1136 is the recommended Unicode-aware way of saying
1141 #define TRIE_STORE_REVCHAR \
1143 SV *tmp = newSVpvs(""); \
1144 if (UTF) SvUTF8_on(tmp); \
1145 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1146 av_push( revcharmap, tmp ); \
1149 #define TRIE_READ_CHAR STMT_START { \
1153 if ( foldlen > 0 ) { \
1154 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1161 foldlen -= UNISKIP( uvc ); \
1162 scan = foldbuf + UNISKIP( uvc ); \
1165 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1175 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1176 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1177 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1178 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1180 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1181 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1182 TRIE_LIST_CUR( state )++; \
1185 #define TRIE_LIST_NEW(state) STMT_START { \
1186 Newxz( trie->states[ state ].trans.list, \
1187 4, reg_trie_trans_le ); \
1188 TRIE_LIST_CUR( state ) = 1; \
1189 TRIE_LIST_LEN( state ) = 4; \
1192 #define TRIE_HANDLE_WORD(state) STMT_START { \
1193 U16 dupe= trie->states[ state ].wordnum; \
1194 regnode * const noper_next = regnext( noper ); \
1196 if (trie->wordlen) \
1197 trie->wordlen[ curword ] = wordlen; \
1199 /* store the word for dumping */ \
1201 if (OP(noper) != NOTHING) \
1202 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1204 tmp = newSVpvn( "", 0 ); \
1205 if ( UTF ) SvUTF8_on( tmp ); \
1206 av_push( trie_words, tmp ); \
1211 if ( noper_next < tail ) { \
1213 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1214 trie->jump[curword] = (U16)(noper_next - convert); \
1216 jumper = noper_next; \
1218 nextbranch= regnext(cur); \
1222 /* So it's a dupe. This means we need to maintain a */\
1223 /* linked-list from the first to the next. */\
1224 /* we only allocate the nextword buffer when there */\
1225 /* a dupe, so first time we have to do the allocation */\
1226 if (!trie->nextword) \
1227 trie->nextword = (U16 *) \
1228 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1229 while ( trie->nextword[dupe] ) \
1230 dupe= trie->nextword[dupe]; \
1231 trie->nextword[dupe]= curword; \
1233 /* we haven't inserted this word yet. */ \
1234 trie->states[ state ].wordnum = curword; \
1239 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1240 ( ( base + charid >= ucharcount \
1241 && base + charid < ubound \
1242 && state == trie->trans[ base - ucharcount + charid ].check \
1243 && trie->trans[ base - ucharcount + charid ].next ) \
1244 ? trie->trans[ base - ucharcount + charid ].next \
1245 : ( state==1 ? special : 0 ) \
1249 #define MADE_JUMP_TRIE 2
1250 #define MADE_EXACT_TRIE 4
1253 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1256 /* first pass, loop through and scan words */
1257 reg_trie_data *trie;
1258 HV *widecharmap = NULL;
1259 AV *revcharmap = newAV();
1261 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1266 regnode *jumper = NULL;
1267 regnode *nextbranch = NULL;
1268 regnode *convert = NULL;
1269 /* we just use folder as a flag in utf8 */
1270 const U8 * const folder = ( flags == EXACTF
1272 : ( flags == EXACTFL
1279 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1280 AV *trie_words = NULL;
1281 /* along with revcharmap, this only used during construction but both are
1282 * useful during debugging so we store them in the struct when debugging.
1285 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1286 STRLEN trie_charcount=0;
1288 SV *re_trie_maxbuff;
1289 GET_RE_DEBUG_FLAGS_DECL;
1291 PERL_UNUSED_ARG(depth);
1294 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1296 trie->startstate = 1;
1297 trie->wordcount = word_count;
1298 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1299 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1300 if (!(UTF && folder))
1301 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1303 trie_words = newAV();
1306 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1307 if (!SvIOK(re_trie_maxbuff)) {
1308 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1311 PerlIO_printf( Perl_debug_log,
1312 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1313 (int)depth * 2 + 2, "",
1314 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1315 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1319 /* Find the node we are going to overwrite */
1320 if ( first == startbranch && OP( last ) != BRANCH ) {
1321 /* whole branch chain */
1324 /* branch sub-chain */
1325 convert = NEXTOPER( first );
1328 /* -- First loop and Setup --
1330 We first traverse the branches and scan each word to determine if it
1331 contains widechars, and how many unique chars there are, this is
1332 important as we have to build a table with at least as many columns as we
1335 We use an array of integers to represent the character codes 0..255
1336 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1337 native representation of the character value as the key and IV's for the
1340 *TODO* If we keep track of how many times each character is used we can
1341 remap the columns so that the table compression later on is more
1342 efficient in terms of memory by ensuring most common value is in the
1343 middle and the least common are on the outside. IMO this would be better
1344 than a most to least common mapping as theres a decent chance the most
1345 common letter will share a node with the least common, meaning the node
1346 will not be compressable. With a middle is most common approach the worst
1347 case is when we have the least common nodes twice.
1351 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1352 regnode * const noper = NEXTOPER( cur );
1353 const U8 *uc = (U8*)STRING( noper );
1354 const U8 * const e = uc + STR_LEN( noper );
1356 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1357 const U8 *scan = (U8*)NULL;
1358 U32 wordlen = 0; /* required init */
1361 if (OP(noper) == NOTHING) {
1366 TRIE_BITMAP_SET(trie,*uc);
1367 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1369 for ( ; uc < e ; uc += len ) {
1370 TRIE_CHARCOUNT(trie)++;
1374 if ( !trie->charmap[ uvc ] ) {
1375 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1377 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1383 widecharmap = newHV();
1385 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1388 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1390 if ( !SvTRUE( *svpp ) ) {
1391 sv_setiv( *svpp, ++trie->uniquecharcount );
1396 if( cur == first ) {
1399 } else if (chars < trie->minlen) {
1401 } else if (chars > trie->maxlen) {
1405 } /* end first pass */
1406 DEBUG_TRIE_COMPILE_r(
1407 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1408 (int)depth * 2 + 2,"",
1409 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1410 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1411 (int)trie->minlen, (int)trie->maxlen )
1413 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1416 We now know what we are dealing with in terms of unique chars and
1417 string sizes so we can calculate how much memory a naive
1418 representation using a flat table will take. If it's over a reasonable
1419 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1420 conservative but potentially much slower representation using an array
1423 At the end we convert both representations into the same compressed
1424 form that will be used in regexec.c for matching with. The latter
1425 is a form that cannot be used to construct with but has memory
1426 properties similar to the list form and access properties similar
1427 to the table form making it both suitable for fast searches and
1428 small enough that its feasable to store for the duration of a program.
1430 See the comment in the code where the compressed table is produced
1431 inplace from the flat tabe representation for an explanation of how
1432 the compression works.
1437 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1439 Second Pass -- Array Of Lists Representation
1441 Each state will be represented by a list of charid:state records
1442 (reg_trie_trans_le) the first such element holds the CUR and LEN
1443 points of the allocated array. (See defines above).
1445 We build the initial structure using the lists, and then convert
1446 it into the compressed table form which allows faster lookups
1447 (but cant be modified once converted).
1450 STRLEN transcount = 1;
1452 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1453 "%*sCompiling trie using list compiler\n",
1454 (int)depth * 2 + 2, ""));
1456 trie->states = (reg_trie_state *)
1457 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1458 sizeof(reg_trie_state) );
1462 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1464 regnode * const noper = NEXTOPER( cur );
1465 U8 *uc = (U8*)STRING( noper );
1466 const U8 * const e = uc + STR_LEN( noper );
1467 U32 state = 1; /* required init */
1468 U16 charid = 0; /* sanity init */
1469 U8 *scan = (U8*)NULL; /* sanity init */
1470 STRLEN foldlen = 0; /* required init */
1471 U32 wordlen = 0; /* required init */
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1474 if (OP(noper) != NOTHING) {
1475 for ( ; uc < e ; uc += len ) {
1480 charid = trie->charmap[ uvc ];
1482 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1486 charid=(U16)SvIV( *svpp );
1489 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1496 if ( !trie->states[ state ].trans.list ) {
1497 TRIE_LIST_NEW( state );
1499 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1500 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1501 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1506 newstate = next_alloc++;
1507 TRIE_LIST_PUSH( state, charid, newstate );
1512 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1516 TRIE_HANDLE_WORD(state);
1518 } /* end second pass */
1520 /* next alloc is the NEXT state to be allocated */
1521 trie->statecount = next_alloc;
1522 trie->states = (reg_trie_state *)
1523 PerlMemShared_realloc( trie->states,
1525 * sizeof(reg_trie_state) );
1527 /* and now dump it out before we compress it */
1528 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1529 revcharmap, next_alloc,
1533 trie->trans = (reg_trie_trans *)
1534 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1541 for( state=1 ; state < next_alloc ; state ++ ) {
1545 DEBUG_TRIE_COMPILE_MORE_r(
1546 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1550 if (trie->states[state].trans.list) {
1551 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1555 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1556 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1557 if ( forid < minid ) {
1559 } else if ( forid > maxid ) {
1563 if ( transcount < tp + maxid - minid + 1) {
1565 trie->trans = (reg_trie_trans *)
1566 PerlMemShared_realloc( trie->trans,
1568 * sizeof(reg_trie_trans) );
1569 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1571 base = trie->uniquecharcount + tp - minid;
1572 if ( maxid == minid ) {
1574 for ( ; zp < tp ; zp++ ) {
1575 if ( ! trie->trans[ zp ].next ) {
1576 base = trie->uniquecharcount + zp - minid;
1577 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1578 trie->trans[ zp ].check = state;
1584 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1585 trie->trans[ tp ].check = state;
1590 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1591 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1592 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1593 trie->trans[ tid ].check = state;
1595 tp += ( maxid - minid + 1 );
1597 Safefree(trie->states[ state ].trans.list);
1600 DEBUG_TRIE_COMPILE_MORE_r(
1601 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1604 trie->states[ state ].trans.base=base;
1606 trie->lasttrans = tp + 1;
1610 Second Pass -- Flat Table Representation.
1612 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1613 We know that we will need Charcount+1 trans at most to store the data
1614 (one row per char at worst case) So we preallocate both structures
1615 assuming worst case.
1617 We then construct the trie using only the .next slots of the entry
1620 We use the .check field of the first entry of the node temporarily to
1621 make compression both faster and easier by keeping track of how many non
1622 zero fields are in the node.
1624 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1627 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1628 number representing the first entry of the node, and state as a
1629 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1630 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1631 are 2 entrys per node. eg:
1639 The table is internally in the right hand, idx form. However as we also
1640 have to deal with the states array which is indexed by nodenum we have to
1641 use TRIE_NODENUM() to convert.
1644 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1645 "%*sCompiling trie using table compiler\n",
1646 (int)depth * 2 + 2, ""));
1648 trie->trans = (reg_trie_trans *)
1649 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1650 * trie->uniquecharcount + 1,
1651 sizeof(reg_trie_trans) );
1652 trie->states = (reg_trie_state *)
1653 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1654 sizeof(reg_trie_state) );
1655 next_alloc = trie->uniquecharcount + 1;
1658 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1660 regnode * const noper = NEXTOPER( cur );
1661 const U8 *uc = (U8*)STRING( noper );
1662 const U8 * const e = uc + STR_LEN( noper );
1664 U32 state = 1; /* required init */
1666 U16 charid = 0; /* sanity init */
1667 U32 accept_state = 0; /* sanity init */
1668 U8 *scan = (U8*)NULL; /* sanity init */
1670 STRLEN foldlen = 0; /* required init */
1671 U32 wordlen = 0; /* required init */
1672 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1674 if ( OP(noper) != NOTHING ) {
1675 for ( ; uc < e ; uc += len ) {
1680 charid = trie->charmap[ uvc ];
1682 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1683 charid = svpp ? (U16)SvIV(*svpp) : 0;
1687 if ( !trie->trans[ state + charid ].next ) {
1688 trie->trans[ state + charid ].next = next_alloc;
1689 trie->trans[ state ].check++;
1690 next_alloc += trie->uniquecharcount;
1692 state = trie->trans[ state + charid ].next;
1694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1696 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1699 accept_state = TRIE_NODENUM( state );
1700 TRIE_HANDLE_WORD(accept_state);
1702 } /* end second pass */
1704 /* and now dump it out before we compress it */
1705 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1707 next_alloc, depth+1));
1711 * Inplace compress the table.*
1713 For sparse data sets the table constructed by the trie algorithm will
1714 be mostly 0/FAIL transitions or to put it another way mostly empty.
1715 (Note that leaf nodes will not contain any transitions.)
1717 This algorithm compresses the tables by eliminating most such
1718 transitions, at the cost of a modest bit of extra work during lookup:
1720 - Each states[] entry contains a .base field which indicates the
1721 index in the state[] array wheres its transition data is stored.
1723 - If .base is 0 there are no valid transitions from that node.
1725 - If .base is nonzero then charid is added to it to find an entry in
1728 -If trans[states[state].base+charid].check!=state then the
1729 transition is taken to be a 0/Fail transition. Thus if there are fail
1730 transitions at the front of the node then the .base offset will point
1731 somewhere inside the previous nodes data (or maybe even into a node
1732 even earlier), but the .check field determines if the transition is
1736 The following process inplace converts the table to the compressed
1737 table: We first do not compress the root node 1,and mark its all its
1738 .check pointers as 1 and set its .base pointer as 1 as well. This
1739 allows to do a DFA construction from the compressed table later, and
1740 ensures that any .base pointers we calculate later are greater than
1743 - We set 'pos' to indicate the first entry of the second node.
1745 - We then iterate over the columns of the node, finding the first and
1746 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1747 and set the .check pointers accordingly, and advance pos
1748 appropriately and repreat for the next node. Note that when we copy
1749 the next pointers we have to convert them from the original
1750 NODEIDX form to NODENUM form as the former is not valid post
1753 - If a node has no transitions used we mark its base as 0 and do not
1754 advance the pos pointer.
1756 - If a node only has one transition we use a second pointer into the
1757 structure to fill in allocated fail transitions from other states.
1758 This pointer is independent of the main pointer and scans forward
1759 looking for null transitions that are allocated to a state. When it
1760 finds one it writes the single transition into the "hole". If the
1761 pointer doesnt find one the single transition is appended as normal.
1763 - Once compressed we can Renew/realloc the structures to release the
1766 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1767 specifically Fig 3.47 and the associated pseudocode.
1771 const U32 laststate = TRIE_NODENUM( next_alloc );
1774 trie->statecount = laststate;
1776 for ( state = 1 ; state < laststate ; state++ ) {
1778 const U32 stateidx = TRIE_NODEIDX( state );
1779 const U32 o_used = trie->trans[ stateidx ].check;
1780 U32 used = trie->trans[ stateidx ].check;
1781 trie->trans[ stateidx ].check = 0;
1783 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1784 if ( flag || trie->trans[ stateidx + charid ].next ) {
1785 if ( trie->trans[ stateidx + charid ].next ) {
1787 for ( ; zp < pos ; zp++ ) {
1788 if ( ! trie->trans[ zp ].next ) {
1792 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1793 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1794 trie->trans[ zp ].check = state;
1795 if ( ++zp > pos ) pos = zp;
1802 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1804 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1805 trie->trans[ pos ].check = state;
1810 trie->lasttrans = pos + 1;
1811 trie->states = (reg_trie_state *)
1812 PerlMemShared_realloc( trie->states, laststate
1813 * sizeof(reg_trie_state) );
1814 DEBUG_TRIE_COMPILE_MORE_r(
1815 PerlIO_printf( Perl_debug_log,
1816 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1817 (int)depth * 2 + 2,"",
1818 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1821 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1824 } /* end table compress */
1826 DEBUG_TRIE_COMPILE_MORE_r(
1827 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1828 (int)depth * 2 + 2, "",
1829 (UV)trie->statecount,
1830 (UV)trie->lasttrans)
1832 /* resize the trans array to remove unused space */
1833 trie->trans = (reg_trie_trans *)
1834 PerlMemShared_realloc( trie->trans, trie->lasttrans
1835 * sizeof(reg_trie_trans) );
1837 /* and now dump out the compressed format */
1838 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1840 { /* Modify the program and insert the new TRIE node*/
1841 U8 nodetype =(U8)(flags & 0xFF);
1845 regnode *optimize = NULL;
1847 U32 mjd_nodelen = 0;
1850 This means we convert either the first branch or the first Exact,
1851 depending on whether the thing following (in 'last') is a branch
1852 or not and whther first is the startbranch (ie is it a sub part of
1853 the alternation or is it the whole thing.)
1854 Assuming its a sub part we conver the EXACT otherwise we convert
1855 the whole branch sequence, including the first.
1857 /* Find the node we are going to overwrite */
1858 if ( first != startbranch || OP( last ) == BRANCH ) {
1859 /* branch sub-chain */
1860 NEXT_OFF( first ) = (U16)(last - first);
1862 mjd_offset= Node_Offset((convert));
1863 mjd_nodelen= Node_Length((convert));
1865 /* whole branch chain */
1868 const regnode *nop = NEXTOPER( convert );
1869 mjd_offset= Node_Offset((nop));
1870 mjd_nodelen= Node_Length((nop));
1875 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1876 (int)depth * 2 + 2, "",
1877 (UV)mjd_offset, (UV)mjd_nodelen)
1880 /* But first we check to see if there is a common prefix we can
1881 split out as an EXACT and put in front of the TRIE node. */
1882 trie->startstate= 1;
1883 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1885 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1889 const U32 base = trie->states[ state ].trans.base;
1891 if ( trie->states[state].wordnum )
1894 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1895 if ( ( base + ofs >= trie->uniquecharcount ) &&
1896 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1897 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1899 if ( ++count > 1 ) {
1900 SV **tmp = av_fetch( revcharmap, ofs, 0);
1901 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1902 if ( state == 1 ) break;
1904 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1906 PerlIO_printf(Perl_debug_log,
1907 "%*sNew Start State=%"UVuf" Class: [",
1908 (int)depth * 2 + 2, "",
1911 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1912 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1914 TRIE_BITMAP_SET(trie,*ch);
1916 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1918 PerlIO_printf(Perl_debug_log, (char*)ch)
1922 TRIE_BITMAP_SET(trie,*ch);
1924 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1925 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1931 SV **tmp = av_fetch( revcharmap, idx, 0);
1932 char *ch = SvPV_nolen( *tmp );
1934 SV *sv=sv_newmortal();
1935 PerlIO_printf( Perl_debug_log,
1936 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1937 (int)depth * 2 + 2, "",
1939 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1940 PL_colors[0], PL_colors[1],
1941 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1942 PERL_PV_ESCAPE_FIRSTCHAR
1947 OP( convert ) = nodetype;
1948 str=STRING(convert);
1959 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1965 regnode *n = convert+NODE_SZ_STR(convert);
1966 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1967 trie->startstate = state;
1968 trie->minlen -= (state - 1);
1969 trie->maxlen -= (state - 1);
1971 regnode *fix = convert;
1972 U32 word = trie->wordcount;
1974 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1975 while( ++fix < n ) {
1976 Set_Node_Offset_Length(fix, 0, 0);
1979 SV ** const tmp = av_fetch( trie_words, word, 0 );
1981 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1982 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1984 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1991 NEXT_OFF(convert) = (U16)(tail - convert);
1992 DEBUG_r(optimize= n);
1998 if ( trie->maxlen ) {
1999 NEXT_OFF( convert ) = (U16)(tail - convert);
2000 ARG_SET( convert, data_slot );
2001 /* Store the offset to the first unabsorbed branch in
2002 jump[0], which is otherwise unused by the jump logic.
2003 We use this when dumping a trie and during optimisation. */
2005 trie->jump[0] = (U16)(nextbranch - convert);
2008 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2009 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2011 OP( convert ) = TRIEC;
2012 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2013 PerlMemShared_free(trie->bitmap);
2016 OP( convert ) = TRIE;
2018 /* store the type in the flags */
2019 convert->flags = nodetype;
2023 + regarglen[ OP( convert ) ];
2025 /* XXX We really should free up the resource in trie now,
2026 as we won't use them - (which resources?) dmq */
2028 /* needed for dumping*/
2029 DEBUG_r(if (optimize) {
2030 regnode *opt = convert;
2031 while ( ++opt < optimize) {
2032 Set_Node_Offset_Length(opt,0,0);
2035 Try to clean up some of the debris left after the
2038 while( optimize < jumper ) {
2039 mjd_nodelen += Node_Length((optimize));
2040 OP( optimize ) = OPTIMIZED;
2041 Set_Node_Offset_Length(optimize,0,0);
2044 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2046 } /* end node insert */
2047 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2049 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2050 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2052 SvREFCNT_dec(revcharmap);
2056 : trie->startstate>1
2062 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2064 /* The Trie is constructed and compressed now so we can build a fail array now if its needed
2066 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2067 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2070 We find the fail state for each state in the trie, this state is the longest proper
2071 suffix of the current states 'word' that is also a proper prefix of another word in our
2072 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2073 the DFA not to have to restart after its tried and failed a word at a given point, it
2074 simply continues as though it had been matching the other word in the first place.
2076 'abcdgu'=~/abcdefg|cdgu/
2077 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2078 fail, which would bring use to the state representing 'd' in the second word where we would
2079 try 'g' and succeed, prodceding to match 'cdgu'.
2081 /* add a fail transition */
2082 const U32 trie_offset = ARG(source);
2083 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2085 const U32 ucharcount = trie->uniquecharcount;
2086 const U32 numstates = trie->statecount;
2087 const U32 ubound = trie->lasttrans + ucharcount;
2091 U32 base = trie->states[ 1 ].trans.base;
2094 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2095 GET_RE_DEBUG_FLAGS_DECL;
2097 PERL_UNUSED_ARG(depth);
2101 ARG_SET( stclass, data_slot );
2102 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2103 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2104 aho->trie=trie_offset;
2105 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2106 Copy( trie->states, aho->states, numstates, reg_trie_state );
2107 Newxz( q, numstates, U32);
2108 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2111 /* initialize fail[0..1] to be 1 so that we always have
2112 a valid final fail state */
2113 fail[ 0 ] = fail[ 1 ] = 1;
2115 for ( charid = 0; charid < ucharcount ; charid++ ) {
2116 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2118 q[ q_write ] = newstate;
2119 /* set to point at the root */
2120 fail[ q[ q_write++ ] ]=1;
2123 while ( q_read < q_write) {
2124 const U32 cur = q[ q_read++ % numstates ];
2125 base = trie->states[ cur ].trans.base;
2127 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2128 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2130 U32 fail_state = cur;
2133 fail_state = fail[ fail_state ];
2134 fail_base = aho->states[ fail_state ].trans.base;
2135 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2137 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2138 fail[ ch_state ] = fail_state;
2139 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2141 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2143 q[ q_write++ % numstates] = ch_state;
2147 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2148 when we fail in state 1, this allows us to use the
2149 charclass scan to find a valid start char. This is based on the principle
2150 that theres a good chance the string being searched contains lots of stuff
2151 that cant be a start char.
2153 fail[ 0 ] = fail[ 1 ] = 0;
2154 DEBUG_TRIE_COMPILE_r({
2155 PerlIO_printf(Perl_debug_log,
2156 "%*sStclass Failtable (%"UVuf" states): 0",
2157 (int)(depth * 2), "", (UV)numstates
2159 for( q_read=1; q_read<numstates; q_read++ ) {
2160 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2162 PerlIO_printf(Perl_debug_log, "\n");
2165 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2170 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2171 * These need to be revisited when a newer toolchain becomes available.
2173 #if defined(__sparc64__) && defined(__GNUC__)
2174 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2175 # undef SPARC64_GCC_WORKAROUND
2176 # define SPARC64_GCC_WORKAROUND 1
2180 #define DEBUG_PEEP(str,scan,depth) \
2181 DEBUG_OPTIMISE_r({if (scan){ \
2182 SV * const mysv=sv_newmortal(); \
2183 regnode *Next = regnext(scan); \
2184 regprop(RExC_rx, mysv, scan); \
2185 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2186 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2187 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2194 #define JOIN_EXACT(scan,min,flags) \
2195 if (PL_regkind[OP(scan)] == EXACT) \
2196 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2199 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2200 /* Merge several consecutive EXACTish nodes into one. */
2201 regnode *n = regnext(scan);
2203 regnode *next = scan + NODE_SZ_STR(scan);
2207 regnode *stop = scan;
2208 GET_RE_DEBUG_FLAGS_DECL;
2210 PERL_UNUSED_ARG(depth);
2212 #ifndef EXPERIMENTAL_INPLACESCAN
2213 PERL_UNUSED_ARG(flags);
2214 PERL_UNUSED_ARG(val);
2216 DEBUG_PEEP("join",scan,depth);
2218 /* Skip NOTHING, merge EXACT*. */
2220 ( PL_regkind[OP(n)] == NOTHING ||
2221 (stringok && (OP(n) == OP(scan))))
2223 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2225 if (OP(n) == TAIL || n > next)
2227 if (PL_regkind[OP(n)] == NOTHING) {
2228 DEBUG_PEEP("skip:",n,depth);
2229 NEXT_OFF(scan) += NEXT_OFF(n);
2230 next = n + NODE_STEP_REGNODE;
2237 else if (stringok) {
2238 const unsigned int oldl = STR_LEN(scan);
2239 regnode * const nnext = regnext(n);
2241 DEBUG_PEEP("merg",n,depth);
2244 if (oldl + STR_LEN(n) > U8_MAX)
2246 NEXT_OFF(scan) += NEXT_OFF(n);
2247 STR_LEN(scan) += STR_LEN(n);
2248 next = n + NODE_SZ_STR(n);
2249 /* Now we can overwrite *n : */
2250 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2258 #ifdef EXPERIMENTAL_INPLACESCAN
2259 if (flags && !NEXT_OFF(n)) {
2260 DEBUG_PEEP("atch", val, depth);
2261 if (reg_off_by_arg[OP(n)]) {
2262 ARG_SET(n, val - n);
2265 NEXT_OFF(n) = val - n;
2272 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2274 Two problematic code points in Unicode casefolding of EXACT nodes:
2276 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2277 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2283 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2284 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2286 This means that in case-insensitive matching (or "loose matching",
2287 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2288 length of the above casefolded versions) can match a target string
2289 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2290 This would rather mess up the minimum length computation.
2292 What we'll do is to look for the tail four bytes, and then peek
2293 at the preceding two bytes to see whether we need to decrease
2294 the minimum length by four (six minus two).
2296 Thanks to the design of UTF-8, there cannot be false matches:
2297 A sequence of valid UTF-8 bytes cannot be a subsequence of
2298 another valid sequence of UTF-8 bytes.
2301 char * const s0 = STRING(scan), *s, *t;
2302 char * const s1 = s0 + STR_LEN(scan) - 1;
2303 char * const s2 = s1 - 4;
2304 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2305 const char t0[] = "\xaf\x49\xaf\x42";
2307 const char t0[] = "\xcc\x88\xcc\x81";
2309 const char * const t1 = t0 + 3;
2312 s < s2 && (t = ninstr(s, s1, t0, t1));
2315 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2316 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2318 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2319 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2327 n = scan + NODE_SZ_STR(scan);
2329 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2336 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2340 /* REx optimizer. Converts nodes into quickier variants "in place".
2341 Finds fixed substrings. */
2343 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2344 to the position after last scanned or to NULL. */
2346 #define INIT_AND_WITHP \
2347 assert(!and_withp); \
2348 Newx(and_withp,1,struct regnode_charclass_class); \
2349 SAVEFREEPV(and_withp)
2351 /* this is a chain of data about sub patterns we are processing that
2352 need to be handled seperately/specially in study_chunk. Its so
2353 we can simulate recursion without losing state. */
2355 typedef struct scan_frame {
2356 regnode *last; /* last node to process in this frame */
2357 regnode *next; /* next node to process when last is reached */
2358 struct scan_frame *prev; /*previous frame*/
2359 I32 stop; /* what stopparen do we use */
2363 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2366 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2367 I32 *minlenp, I32 *deltap,
2372 struct regnode_charclass_class *and_withp,
2373 U32 flags, U32 depth)
2374 /* scanp: Start here (read-write). */
2375 /* deltap: Write maxlen-minlen here. */
2376 /* last: Stop before this one. */
2377 /* data: string data about the pattern */
2378 /* stopparen: treat close N as END */
2379 /* recursed: which subroutines have we recursed into */
2380 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2383 I32 min = 0, pars = 0, code;
2384 regnode *scan = *scanp, *next;
2386 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2387 int is_inf_internal = 0; /* The studied chunk is infinite */
2388 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2389 scan_data_t data_fake;
2390 SV *re_trie_maxbuff = NULL;
2391 regnode *first_non_open = scan;
2392 I32 stopmin = I32_MAX;
2393 scan_frame *frame = NULL;
2395 GET_RE_DEBUG_FLAGS_DECL;
2398 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2402 while (first_non_open && OP(first_non_open) == OPEN)
2403 first_non_open=regnext(first_non_open);
2408 while ( scan && OP(scan) != END && scan < last ){
2409 /* Peephole optimizer: */
2410 DEBUG_STUDYDATA("Peep:", data,depth);
2411 DEBUG_PEEP("Peep",scan,depth);
2412 JOIN_EXACT(scan,&min,0);
2414 /* Follow the next-chain of the current node and optimize
2415 away all the NOTHINGs from it. */
2416 if (OP(scan) != CURLYX) {
2417 const int max = (reg_off_by_arg[OP(scan)]
2419 /* I32 may be smaller than U16 on CRAYs! */
2420 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2421 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2425 /* Skip NOTHING and LONGJMP. */
2426 while ((n = regnext(n))
2427 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2428 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2429 && off + noff < max)
2431 if (reg_off_by_arg[OP(scan)])
2434 NEXT_OFF(scan) = off;
2439 /* The principal pseudo-switch. Cannot be a switch, since we
2440 look into several different things. */
2441 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2442 || OP(scan) == IFTHEN) {
2443 next = regnext(scan);
2445 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2447 if (OP(next) == code || code == IFTHEN) {
2448 /* NOTE - There is similar code to this block below for handling
2449 TRIE nodes on a re-study. If you change stuff here check there
2451 I32 max1 = 0, min1 = I32_MAX, num = 0;
2452 struct regnode_charclass_class accum;
2453 regnode * const startbranch=scan;
2455 if (flags & SCF_DO_SUBSTR)
2456 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2457 if (flags & SCF_DO_STCLASS)
2458 cl_init_zero(pRExC_state, &accum);
2460 while (OP(scan) == code) {
2461 I32 deltanext, minnext, f = 0, fake;
2462 struct regnode_charclass_class this_class;
2465 data_fake.flags = 0;
2467 data_fake.whilem_c = data->whilem_c;
2468 data_fake.last_closep = data->last_closep;
2471 data_fake.last_closep = &fake;
2473 data_fake.pos_delta = delta;
2474 next = regnext(scan);
2475 scan = NEXTOPER(scan);
2477 scan = NEXTOPER(scan);
2478 if (flags & SCF_DO_STCLASS) {
2479 cl_init(pRExC_state, &this_class);
2480 data_fake.start_class = &this_class;
2481 f = SCF_DO_STCLASS_AND;
2483 if (flags & SCF_WHILEM_VISITED_POS)
2484 f |= SCF_WHILEM_VISITED_POS;
2486 /* we suppose the run is continuous, last=next...*/
2487 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2489 stopparen, recursed, NULL, f,depth+1);
2492 if (max1 < minnext + deltanext)
2493 max1 = minnext + deltanext;
2494 if (deltanext == I32_MAX)
2495 is_inf = is_inf_internal = 1;
2497 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2499 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2500 if ( stopmin > minnext)
2501 stopmin = min + min1;
2502 flags &= ~SCF_DO_SUBSTR;
2504 data->flags |= SCF_SEEN_ACCEPT;
2507 if (data_fake.flags & SF_HAS_EVAL)
2508 data->flags |= SF_HAS_EVAL;
2509 data->whilem_c = data_fake.whilem_c;
2511 if (flags & SCF_DO_STCLASS)
2512 cl_or(pRExC_state, &accum, &this_class);
2514 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2516 if (flags & SCF_DO_SUBSTR) {
2517 data->pos_min += min1;
2518 data->pos_delta += max1 - min1;
2519 if (max1 != min1 || is_inf)
2520 data->longest = &(data->longest_float);
2523 delta += max1 - min1;
2524 if (flags & SCF_DO_STCLASS_OR) {
2525 cl_or(pRExC_state, data->start_class, &accum);
2527 cl_and(data->start_class, and_withp);
2528 flags &= ~SCF_DO_STCLASS;
2531 else if (flags & SCF_DO_STCLASS_AND) {
2533 cl_and(data->start_class, &accum);
2534 flags &= ~SCF_DO_STCLASS;
2537 /* Switch to OR mode: cache the old value of
2538 * data->start_class */
2540 StructCopy(data->start_class, and_withp,
2541 struct regnode_charclass_class);
2542 flags &= ~SCF_DO_STCLASS_AND;
2543 StructCopy(&accum, data->start_class,
2544 struct regnode_charclass_class);
2545 flags |= SCF_DO_STCLASS_OR;
2546 data->start_class->flags |= ANYOF_EOS;
2550 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2553 Assuming this was/is a branch we are dealing with: 'scan' now
2554 points at the item that follows the branch sequence, whatever
2555 it is. We now start at the beginning of the sequence and look
2562 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2564 If we can find such a subseqence we need to turn the first
2565 element into a trie and then add the subsequent branch exact
2566 strings to the trie.
2570 1. patterns where the whole set of branch can be converted.
2572 2. patterns where only a subset can be converted.
2574 In case 1 we can replace the whole set with a single regop
2575 for the trie. In case 2 we need to keep the start and end
2578 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2579 becomes BRANCH TRIE; BRANCH X;
2581 There is an additional case, that being where there is a
2582 common prefix, which gets split out into an EXACT like node
2583 preceding the TRIE node.
2585 If x(1..n)==tail then we can do a simple trie, if not we make
2586 a "jump" trie, such that when we match the appropriate word
2587 we "jump" to the appopriate tail node. Essentailly we turn
2588 a nested if into a case structure of sorts.
2593 if (!re_trie_maxbuff) {
2594 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2595 if (!SvIOK(re_trie_maxbuff))
2596 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2598 if ( SvIV(re_trie_maxbuff)>=0 ) {
2600 regnode *first = (regnode *)NULL;
2601 regnode *last = (regnode *)NULL;
2602 regnode *tail = scan;
2607 SV * const mysv = sv_newmortal(); /* for dumping */
2609 /* var tail is used because there may be a TAIL
2610 regop in the way. Ie, the exacts will point to the
2611 thing following the TAIL, but the last branch will
2612 point at the TAIL. So we advance tail. If we
2613 have nested (?:) we may have to move through several
2617 while ( OP( tail ) == TAIL ) {
2618 /* this is the TAIL generated by (?:) */
2619 tail = regnext( tail );
2624 regprop(RExC_rx, mysv, tail );
2625 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2626 (int)depth * 2 + 2, "",
2627 "Looking for TRIE'able sequences. Tail node is: ",
2628 SvPV_nolen_const( mysv )
2634 step through the branches, cur represents each
2635 branch, noper is the first thing to be matched
2636 as part of that branch and noper_next is the
2637 regnext() of that node. if noper is an EXACT
2638 and noper_next is the same as scan (our current
2639 position in the regex) then the EXACT branch is
2640 a possible optimization target. Once we have
2641 two or more consequetive such branches we can
2642 create a trie of the EXACT's contents and stich
2643 it in place. If the sequence represents all of
2644 the branches we eliminate the whole thing and
2645 replace it with a single TRIE. If it is a
2646 subsequence then we need to stitch it in. This
2647 means the first branch has to remain, and needs
2648 to be repointed at the item on the branch chain
2649 following the last branch optimized. This could
2650 be either a BRANCH, in which case the
2651 subsequence is internal, or it could be the
2652 item following the branch sequence in which
2653 case the subsequence is at the end.
2657 /* dont use tail as the end marker for this traverse */
2658 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2659 regnode * const noper = NEXTOPER( cur );
2660 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2661 regnode * const noper_next = regnext( noper );
2665 regprop(RExC_rx, mysv, cur);
2666 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2667 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2669 regprop(RExC_rx, mysv, noper);
2670 PerlIO_printf( Perl_debug_log, " -> %s",
2671 SvPV_nolen_const(mysv));
2674 regprop(RExC_rx, mysv, noper_next );
2675 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2676 SvPV_nolen_const(mysv));
2678 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2679 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2681 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2682 : PL_regkind[ OP( noper ) ] == EXACT )
2683 || OP(noper) == NOTHING )
2685 && noper_next == tail
2690 if ( !first || optype == NOTHING ) {
2691 if (!first) first = cur;
2692 optype = OP( noper );
2698 make_trie( pRExC_state,
2699 startbranch, first, cur, tail, count,
2702 if ( PL_regkind[ OP( noper ) ] == EXACT
2704 && noper_next == tail
2709 optype = OP( noper );
2719 regprop(RExC_rx, mysv, cur);
2720 PerlIO_printf( Perl_debug_log,
2721 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2722 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2726 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2727 #ifdef TRIE_STUDY_OPT
2728 if ( ((made == MADE_EXACT_TRIE &&
2729 startbranch == first)
2730 || ( first_non_open == first )) &&
2732 flags |= SCF_TRIE_RESTUDY;
2733 if ( startbranch == first
2736 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2746 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2747 scan = NEXTOPER(NEXTOPER(scan));
2748 } else /* single branch is optimized. */
2749 scan = NEXTOPER(scan);
2751 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2752 scan_frame *newframe = NULL;
2757 if (OP(scan) != SUSPEND) {
2758 /* set the pointer */
2759 if (OP(scan) == GOSUB) {
2761 RExC_recurse[ARG2L(scan)] = scan;
2762 start = RExC_open_parens[paren-1];
2763 end = RExC_close_parens[paren-1];
2766 start = RExC_rxi->program + 1;
2770 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2771 SAVEFREEPV(recursed);
2773 if (!PAREN_TEST(recursed,paren+1)) {
2774 PAREN_SET(recursed,paren+1);
2775 Newx(newframe,1,scan_frame);
2777 if (flags & SCF_DO_SUBSTR) {
2778 SCAN_COMMIT(pRExC_state,data,minlenp);
2779 data->longest = &(data->longest_float);
2781 is_inf = is_inf_internal = 1;
2782 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2783 cl_anything(pRExC_state, data->start_class);
2784 flags &= ~SCF_DO_STCLASS;
2787 Newx(newframe,1,scan_frame);
2790 end = regnext(scan);
2795 SAVEFREEPV(newframe);
2796 newframe->next = regnext(scan);
2797 newframe->last = last;
2798 newframe->stop = stopparen;
2799 newframe->prev = frame;
2809 else if (OP(scan) == EXACT) {
2810 I32 l = STR_LEN(scan);
2813 const U8 * const s = (U8*)STRING(scan);
2814 l = utf8_length(s, s + l);
2815 uc = utf8_to_uvchr(s, NULL);
2817 uc = *((U8*)STRING(scan));
2820 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2821 /* The code below prefers earlier match for fixed
2822 offset, later match for variable offset. */
2823 if (data->last_end == -1) { /* Update the start info. */
2824 data->last_start_min = data->pos_min;
2825 data->last_start_max = is_inf
2826 ? I32_MAX : data->pos_min + data->pos_delta;
2828 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2830 SvUTF8_on(data->last_found);
2832 SV * const sv = data->last_found;
2833 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2834 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2835 if (mg && mg->mg_len >= 0)
2836 mg->mg_len += utf8_length((U8*)STRING(scan),
2837 (U8*)STRING(scan)+STR_LEN(scan));
2839 data->last_end = data->pos_min + l;
2840 data->pos_min += l; /* As in the first entry. */
2841 data->flags &= ~SF_BEFORE_EOL;
2843 if (flags & SCF_DO_STCLASS_AND) {
2844 /* Check whether it is compatible with what we know already! */
2848 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2849 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2850 && (!(data->start_class->flags & ANYOF_FOLD)
2851 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2854 ANYOF_CLASS_ZERO(data->start_class);
2855 ANYOF_BITMAP_ZERO(data->start_class);
2857 ANYOF_BITMAP_SET(data->start_class, uc);
2858 data->start_class->flags &= ~ANYOF_EOS;
2860 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2862 else if (flags & SCF_DO_STCLASS_OR) {
2863 /* false positive possible if the class is case-folded */
2865 ANYOF_BITMAP_SET(data->start_class, uc);
2867 data->start_class->flags |= ANYOF_UNICODE_ALL;
2868 data->start_class->flags &= ~ANYOF_EOS;
2869 cl_and(data->start_class, and_withp);
2871 flags &= ~SCF_DO_STCLASS;
2873 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2874 I32 l = STR_LEN(scan);
2875 UV uc = *((U8*)STRING(scan));
2877 /* Search for fixed substrings supports EXACT only. */
2878 if (flags & SCF_DO_SUBSTR) {
2880 SCAN_COMMIT(pRExC_state, data, minlenp);
2883 const U8 * const s = (U8 *)STRING(scan);
2884 l = utf8_length(s, s + l);
2885 uc = utf8_to_uvchr(s, NULL);
2888 if (flags & SCF_DO_SUBSTR)
2890 if (flags & SCF_DO_STCLASS_AND) {
2891 /* Check whether it is compatible with what we know already! */
2895 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2896 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2897 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2899 ANYOF_CLASS_ZERO(data->start_class);
2900 ANYOF_BITMAP_ZERO(data->start_class);
2902 ANYOF_BITMAP_SET(data->start_class, uc);
2903 data->start_class->flags &= ~ANYOF_EOS;
2904 data->start_class->flags |= ANYOF_FOLD;
2905 if (OP(scan) == EXACTFL)
2906 data->start_class->flags |= ANYOF_LOCALE;
2909 else if (flags & SCF_DO_STCLASS_OR) {
2910 if (data->start_class->flags & ANYOF_FOLD) {
2911 /* false positive possible if the class is case-folded.
2912 Assume that the locale settings are the same... */
2914 ANYOF_BITMAP_SET(data->start_class, uc);
2915 data->start_class->flags &= ~ANYOF_EOS;
2917 cl_and(data->start_class, and_withp);
2919 flags &= ~SCF_DO_STCLASS;
2921 else if (strchr((const char*)PL_varies,OP(scan))) {
2922 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2923 I32 f = flags, pos_before = 0;
2924 regnode * const oscan = scan;
2925 struct regnode_charclass_class this_class;
2926 struct regnode_charclass_class *oclass = NULL;
2927 I32 next_is_eval = 0;
2929 switch (PL_regkind[OP(scan)]) {
2930 case WHILEM: /* End of (?:...)* . */
2931 scan = NEXTOPER(scan);
2934 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2935 next = NEXTOPER(scan);
2936 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2938 maxcount = REG_INFTY;
2939 next = regnext(scan);
2940 scan = NEXTOPER(scan);
2944 if (flags & SCF_DO_SUBSTR)
2949 if (flags & SCF_DO_STCLASS) {
2951 maxcount = REG_INFTY;
2952 next = regnext(scan);
2953 scan = NEXTOPER(scan);
2956 is_inf = is_inf_internal = 1;
2957 scan = regnext(scan);
2958 if (flags & SCF_DO_SUBSTR) {
2959 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2960 data->longest = &(data->longest_float);
2962 goto optimize_curly_tail;
2964 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2965 && (scan->flags == stopparen))
2970 mincount = ARG1(scan);
2971 maxcount = ARG2(scan);
2973 next = regnext(scan);
2974 if (OP(scan) == CURLYX) {
2975 I32 lp = (data ? *(data->last_closep) : 0);
2976 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2978 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2979 next_is_eval = (OP(scan) == EVAL);
2981 if (flags & SCF_DO_SUBSTR) {
2982 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2983 pos_before = data->pos_min;
2987 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2989 data->flags |= SF_IS_INF;
2991 if (flags & SCF_DO_STCLASS) {
2992 cl_init(pRExC_state, &this_class);
2993 oclass = data->start_class;
2994 data->start_class = &this_class;
2995 f |= SCF_DO_STCLASS_AND;
2996 f &= ~SCF_DO_STCLASS_OR;
2998 /* These are the cases when once a subexpression
2999 fails at a particular position, it cannot succeed
3000 even after backtracking at the enclosing scope.
3002 XXXX what if minimal match and we are at the
3003 initial run of {n,m}? */
3004 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3005 f &= ~SCF_WHILEM_VISITED_POS;
3007 /* This will finish on WHILEM, setting scan, or on NULL: */
3008 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3009 last, data, stopparen, recursed, NULL,
3011 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3013 if (flags & SCF_DO_STCLASS)
3014 data->start_class = oclass;
3015 if (mincount == 0 || minnext == 0) {
3016 if (flags & SCF_DO_STCLASS_OR) {
3017 cl_or(pRExC_state, data->start_class, &this_class);
3019 else if (flags & SCF_DO_STCLASS_AND) {
3020 /* Switch to OR mode: cache the old value of
3021 * data->start_class */
3023 StructCopy(data->start_class, and_withp,
3024 struct regnode_charclass_class);
3025 flags &= ~SCF_DO_STCLASS_AND;
3026 StructCopy(&this_class, data->start_class,
3027 struct regnode_charclass_class);
3028 flags |= SCF_DO_STCLASS_OR;
3029 data->start_class->flags |= ANYOF_EOS;
3031 } else { /* Non-zero len */
3032 if (flags & SCF_DO_STCLASS_OR) {
3033 cl_or(pRExC_state, data->start_class, &this_class);
3034 cl_and(data->start_class, and_withp);
3036 else if (flags & SCF_DO_STCLASS_AND)
3037 cl_and(data->start_class, &this_class);
3038 flags &= ~SCF_DO_STCLASS;
3040 if (!scan) /* It was not CURLYX, but CURLY. */
3042 if ( /* ? quantifier ok, except for (?{ ... }) */
3043 (next_is_eval || !(mincount == 0 && maxcount == 1))
3044 && (minnext == 0) && (deltanext == 0)
3045 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3046 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3047 && ckWARN(WARN_REGEXP))
3050 "Quantifier unexpected on zero-length expression");
3053 min += minnext * mincount;
3054 is_inf_internal |= ((maxcount == REG_INFTY
3055 && (minnext + deltanext) > 0)
3056 || deltanext == I32_MAX);
3057 is_inf |= is_inf_internal;
3058 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3060 /* Try powerful optimization CURLYX => CURLYN. */
3061 if ( OP(oscan) == CURLYX && data
3062 && data->flags & SF_IN_PAR
3063 && !(data->flags & SF_HAS_EVAL)
3064 && !deltanext && minnext == 1 ) {
3065 /* Try to optimize to CURLYN. */
3066 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3067 regnode * const nxt1 = nxt;
3074 if (!strchr((const char*)PL_simple,OP(nxt))
3075 && !(PL_regkind[OP(nxt)] == EXACT
3076 && STR_LEN(nxt) == 1))
3082 if (OP(nxt) != CLOSE)
3084 if (RExC_open_parens) {
3085 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3086 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3088 /* Now we know that nxt2 is the only contents: */
3089 oscan->flags = (U8)ARG(nxt);
3091 OP(nxt1) = NOTHING; /* was OPEN. */
3094 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3095 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3096 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3097 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3098 OP(nxt + 1) = OPTIMIZED; /* was count. */
3099 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3104 /* Try optimization CURLYX => CURLYM. */
3105 if ( OP(oscan) == CURLYX && data
3106 && !(data->flags & SF_HAS_PAR)
3107 && !(data->flags & SF_HAS_EVAL)
3108 && !deltanext /* atom is fixed width */
3109 && minnext != 0 /* CURLYM can't handle zero width */
3111 /* XXXX How to optimize if data == 0? */
3112 /* Optimize to a simpler form. */
3113 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3117 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3118 && (OP(nxt2) != WHILEM))
3120 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3121 /* Need to optimize away parenths. */
3122 if (data->flags & SF_IN_PAR) {
3123 /* Set the parenth number. */
3124 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3126 if (OP(nxt) != CLOSE)
3127 FAIL("Panic opt close");
3128 oscan->flags = (U8)ARG(nxt);
3129 if (RExC_open_parens) {
3130 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3131 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3133 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3137 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3138 OP(nxt + 1) = OPTIMIZED; /* was count. */
3139 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3140 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3143 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3144 regnode *nnxt = regnext(nxt1);
3147 if (reg_off_by_arg[OP(nxt1)])
3148 ARG_SET(nxt1, nxt2 - nxt1);
3149 else if (nxt2 - nxt1 < U16_MAX)
3150 NEXT_OFF(nxt1) = nxt2 - nxt1;
3152 OP(nxt) = NOTHING; /* Cannot beautify */
3157 /* Optimize again: */
3158 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3159 NULL, stopparen, recursed, NULL, 0,depth+1);
3164 else if ((OP(oscan) == CURLYX)
3165 && (flags & SCF_WHILEM_VISITED_POS)
3166 /* See the comment on a similar expression above.
3167 However, this time it not a subexpression
3168 we care about, but the expression itself. */
3169 && (maxcount == REG_INFTY)
3170 && data && ++data->whilem_c < 16) {
3171 /* This stays as CURLYX, we can put the count/of pair. */
3172 /* Find WHILEM (as in regexec.c) */
3173 regnode *nxt = oscan + NEXT_OFF(oscan);
3175 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3177 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3178 | (RExC_whilem_seen << 4)); /* On WHILEM */
3180 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3182 if (flags & SCF_DO_SUBSTR) {
3183 SV *last_str = NULL;
3184 int counted = mincount != 0;
3186 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3187 #if defined(SPARC64_GCC_WORKAROUND)
3190 const char *s = NULL;
3193 if (pos_before >= data->last_start_min)
3196 b = data->last_start_min;
3199 s = SvPV_const(data->last_found, l);
3200 old = b - data->last_start_min;
3203 I32 b = pos_before >= data->last_start_min
3204 ? pos_before : data->last_start_min;
3206 const char * const s = SvPV_const(data->last_found, l);
3207 I32 old = b - data->last_start_min;
3211 old = utf8_hop((U8*)s, old) - (U8*)s;
3214 /* Get the added string: */
3215 last_str = newSVpvn(s + old, l);
3217 SvUTF8_on(last_str);
3218 if (deltanext == 0 && pos_before == b) {
3219 /* What was added is a constant string */
3221 SvGROW(last_str, (mincount * l) + 1);
3222 repeatcpy(SvPVX(last_str) + l,
3223 SvPVX_const(last_str), l, mincount - 1);
3224 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3225 /* Add additional parts. */
3226 SvCUR_set(data->last_found,
3227 SvCUR(data->last_found) - l);
3228 sv_catsv(data->last_found, last_str);
3230 SV * sv = data->last_found;
3232 SvUTF8(sv) && SvMAGICAL(sv) ?
3233 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3234 if (mg && mg->mg_len >= 0)
3235 mg->mg_len += CHR_SVLEN(last_str);
3237 data->last_end += l * (mincount - 1);
3240 /* start offset must point into the last copy */
3241 data->last_start_min += minnext * (mincount - 1);
3242 data->last_start_max += is_inf ? I32_MAX
3243 : (maxcount - 1) * (minnext + data->pos_delta);
3246 /* It is counted once already... */
3247 data->pos_min += minnext * (mincount - counted);
3248 data->pos_delta += - counted * deltanext +
3249 (minnext + deltanext) * maxcount - minnext * mincount;
3250 if (mincount != maxcount) {
3251 /* Cannot extend fixed substrings found inside
3253 SCAN_COMMIT(pRExC_state,data,minlenp);
3254 if (mincount && last_str) {
3255 SV * const sv = data->last_found;
3256 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3257 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3261 sv_setsv(sv, last_str);
3262 data->last_end = data->pos_min;
3263 data->last_start_min =
3264 data->pos_min - CHR_SVLEN(last_str);
3265 data->last_start_max = is_inf
3267 : data->pos_min + data->pos_delta
3268 - CHR_SVLEN(last_str);
3270 data->longest = &(data->longest_float);
3272 SvREFCNT_dec(last_str);
3274 if (data && (fl & SF_HAS_EVAL))
3275 data->flags |= SF_HAS_EVAL;
3276 optimize_curly_tail:
3277 if (OP(oscan) != CURLYX) {
3278 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3280 NEXT_OFF(oscan) += NEXT_OFF(next);
3283 default: /* REF and CLUMP only? */
3284 if (flags & SCF_DO_SUBSTR) {
3285 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3286 data->longest = &(data->longest_float);
3288 is_inf = is_inf_internal = 1;
3289 if (flags & SCF_DO_STCLASS_OR)
3290 cl_anything(pRExC_state, data->start_class);
3291 flags &= ~SCF_DO_STCLASS;
3295 else if (strchr((const char*)PL_simple,OP(scan))) {
3298 if (flags & SCF_DO_SUBSTR) {
3299 SCAN_COMMIT(pRExC_state,data,minlenp);
3303 if (flags & SCF_DO_STCLASS) {
3304 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3306 /* Some of the logic below assumes that switching
3307 locale on will only add false positives. */
3308 switch (PL_regkind[OP(scan)]) {
3312 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3313 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3314 cl_anything(pRExC_state, data->start_class);
3317 if (OP(scan) == SANY)
3319 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3320 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3321 || (data->start_class->flags & ANYOF_CLASS));
3322 cl_anything(pRExC_state, data->start_class);
3324 if (flags & SCF_DO_STCLASS_AND || !value)
3325 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3328 if (flags & SCF_DO_STCLASS_AND)
3329 cl_and(data->start_class,
3330 (struct regnode_charclass_class*)scan);
3332 cl_or(pRExC_state, data->start_class,
3333 (struct regnode_charclass_class*)scan);
3336 if (flags & SCF_DO_STCLASS_AND) {
3337 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3338 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3339 for (value = 0; value < 256; value++)
3340 if (!isALNUM(value))
3341 ANYOF_BITMAP_CLEAR(data->start_class, value);
3345 if (data->start_class->flags & ANYOF_LOCALE)
3346 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3348 for (value = 0; value < 256; value++)
3350 ANYOF_BITMAP_SET(data->start_class, value);
3355 if (flags & SCF_DO_STCLASS_AND) {
3356 if (data->start_class->flags & ANYOF_LOCALE)
3357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3360 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3361 data->start_class->flags |= ANYOF_LOCALE;
3365 if (flags & SCF_DO_STCLASS_AND) {
3366 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3367 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3368 for (value = 0; value < 256; value++)
3370 ANYOF_BITMAP_CLEAR(data->start_class, value);
3374 if (data->start_class->flags & ANYOF_LOCALE)
3375 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3377 for (value = 0; value < 256; value++)
3378 if (!isALNUM(value))
3379 ANYOF_BITMAP_SET(data->start_class, value);
3384 if (flags & SCF_DO_STCLASS_AND) {
3385 if (data->start_class->flags & ANYOF_LOCALE)
3386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3389 data->start_class->flags |= ANYOF_LOCALE;
3390 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3394 if (flags & SCF_DO_STCLASS_AND) {
3395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3397 for (value = 0; value < 256; value++)
3398 if (!isSPACE(value))
3399 ANYOF_BITMAP_CLEAR(data->start_class, value);
3403 if (data->start_class->flags & ANYOF_LOCALE)
3404 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3406 for (value = 0; value < 256; value++)
3408 ANYOF_BITMAP_SET(data->start_class, value);
3413 if (flags & SCF_DO_STCLASS_AND) {
3414 if (data->start_class->flags & ANYOF_LOCALE)
3415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3418 data->start_class->flags |= ANYOF_LOCALE;
3419 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3423 if (flags & SCF_DO_STCLASS_AND) {
3424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3426 for (value = 0; value < 256; value++)
3428 ANYOF_BITMAP_CLEAR(data->start_class, value);
3432 if (data->start_class->flags & ANYOF_LOCALE)
3433 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3435 for (value = 0; value < 256; value++)
3436 if (!isSPACE(value))
3437 ANYOF_BITMAP_SET(data->start_class, value);
3442 if (flags & SCF_DO_STCLASS_AND) {
3443 if (data->start_class->flags & ANYOF_LOCALE) {
3444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3445 for (value = 0; value < 256; value++)
3446 if (!isSPACE(value))
3447 ANYOF_BITMAP_CLEAR(data->start_class, value);
3451 data->start_class->flags |= ANYOF_LOCALE;
3452 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3456 if (flags & SCF_DO_STCLASS_AND) {
3457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3458 for (value = 0; value < 256; value++)
3459 if (!isDIGIT(value))
3460 ANYOF_BITMAP_CLEAR(data->start_class, value);
3463 if (data->start_class->flags & ANYOF_LOCALE)
3464 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3466 for (value = 0; value < 256; value++)
3468 ANYOF_BITMAP_SET(data->start_class, value);
3473 if (flags & SCF_DO_STCLASS_AND) {
3474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3475 for (value = 0; value < 256; value++)
3477 ANYOF_BITMAP_CLEAR(data->start_class, value);
3480 if (data->start_class->flags & ANYOF_LOCALE)
3481 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3483 for (value = 0; value < 256; value++)
3484 if (!isDIGIT(value))
3485 ANYOF_BITMAP_SET(data->start_class, value);
3490 if (flags & SCF_DO_STCLASS_OR)
3491 cl_and(data->start_class, and_withp);
3492 flags &= ~SCF_DO_STCLASS;
3495 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3496 data->flags |= (OP(scan) == MEOL
3500 else if ( PL_regkind[OP(scan)] == BRANCHJ
3501 /* Lookbehind, or need to calculate parens/evals/stclass: */
3502 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3503 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3504 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3505 || OP(scan) == UNLESSM )
3507 /* Negative Lookahead/lookbehind
3508 In this case we can't do fixed string optimisation.
3511 I32 deltanext, minnext, fake = 0;
3513 struct regnode_charclass_class intrnl;
3516 data_fake.flags = 0;
3518 data_fake.whilem_c = data->whilem_c;
3519 data_fake.last_closep = data->last_closep;
3522 data_fake.last_closep = &fake;
3523 data_fake.pos_delta = delta;
3524 if ( flags & SCF_DO_STCLASS && !scan->flags
3525 && OP(scan) == IFMATCH ) { /* Lookahead */
3526 cl_init(pRExC_state, &intrnl);
3527 data_fake.start_class = &intrnl;
3528 f |= SCF_DO_STCLASS_AND;
3530 if (flags & SCF_WHILEM_VISITED_POS)
3531 f |= SCF_WHILEM_VISITED_POS;
3532 next = regnext(scan);
3533 nscan = NEXTOPER(NEXTOPER(scan));
3534 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3535 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3538 FAIL("Variable length lookbehind not implemented");
3540 else if (minnext > (I32)U8_MAX) {
3541 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3543 scan->flags = (U8)minnext;
3546 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3548 if (data_fake.flags & SF_HAS_EVAL)
3549 data->flags |= SF_HAS_EVAL;
3550 data->whilem_c = data_fake.whilem_c;
3552 if (f & SCF_DO_STCLASS_AND) {
3553 const int was = (data->start_class->flags & ANYOF_EOS);
3555 cl_and(data->start_class, &intrnl);
3557 data->start_class->flags |= ANYOF_EOS;
3560 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3562 /* Positive Lookahead/lookbehind
3563 In this case we can do fixed string optimisation,
3564 but we must be careful about it. Note in the case of
3565 lookbehind the positions will be offset by the minimum
3566 length of the pattern, something we won't know about
3567 until after the recurse.
3569 I32 deltanext, fake = 0;
3571 struct regnode_charclass_class intrnl;
3573 /* We use SAVEFREEPV so that when the full compile
3574 is finished perl will clean up the allocated
3575 minlens when its all done. This was we don't
3576 have to worry about freeing them when we know
3577 they wont be used, which would be a pain.
3580 Newx( minnextp, 1, I32 );
3581 SAVEFREEPV(minnextp);
3584 StructCopy(data, &data_fake, scan_data_t);
3585 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3588 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3589 data_fake.last_found=newSVsv(data->last_found);
3593 data_fake.last_closep = &fake;
3594 data_fake.flags = 0;
3595 data_fake.pos_delta = delta;
3597 data_fake.flags |= SF_IS_INF;
3598 if ( flags & SCF_DO_STCLASS && !scan->flags
3599 && OP(scan) == IFMATCH ) { /* Lookahead */
3600 cl_init(pRExC_state, &intrnl);
3601 data_fake.start_class = &intrnl;
3602 f |= SCF_DO_STCLASS_AND;
3604 if (flags & SCF_WHILEM_VISITED_POS)
3605 f |= SCF_WHILEM_VISITED_POS;
3606 next = regnext(scan);
3607 nscan = NEXTOPER(NEXTOPER(scan));
3609 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3610 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3613 FAIL("Variable length lookbehind not implemented");
3615 else if (*minnextp > (I32)U8_MAX) {
3616 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3618 scan->flags = (U8)*minnextp;
3623 if (f & SCF_DO_STCLASS_AND) {
3624 const int was = (data->start_class->flags & ANYOF_EOS);
3626 cl_and(data->start_class, &intrnl);
3628 data->start_class->flags |= ANYOF_EOS;
3631 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3633 if (data_fake.flags & SF_HAS_EVAL)
3634 data->flags |= SF_HAS_EVAL;
3635 data->whilem_c = data_fake.whilem_c;
3636 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3637 if (RExC_rx->minlen<*minnextp)
3638 RExC_rx->minlen=*minnextp;
3639 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3640 SvREFCNT_dec(data_fake.last_found);
3642 if ( data_fake.minlen_fixed != minlenp )
3644 data->offset_fixed= data_fake.offset_fixed;
3645 data->minlen_fixed= data_fake.minlen_fixed;
3646 data->lookbehind_fixed+= scan->flags;
3648 if ( data_fake.minlen_float != minlenp )
3650 data->minlen_float= data_fake.minlen_float;
3651 data->offset_float_min=data_fake.offset_float_min;
3652 data->offset_float_max=data_fake.offset_float_max;
3653 data->lookbehind_float+= scan->flags;
3662 else if (OP(scan) == OPEN) {
3663 if (stopparen != (I32)ARG(scan))
3666 else if (OP(scan) == CLOSE) {
3667 if (stopparen == (I32)ARG(scan)) {
3670 if ((I32)ARG(scan) == is_par) {
3671 next = regnext(scan);
3673 if ( next && (OP(next) != WHILEM) && next < last)
3674 is_par = 0; /* Disable optimization */
3677 *(data->last_closep) = ARG(scan);
3679 else if (OP(scan) == EVAL) {
3681 data->flags |= SF_HAS_EVAL;
3683 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3684 if (flags & SCF_DO_SUBSTR) {
3685 SCAN_COMMIT(pRExC_state,data,minlenp);
3686 flags &= ~SCF_DO_SUBSTR;
3688 if (data && OP(scan)==ACCEPT) {
3689 data->flags |= SCF_SEEN_ACCEPT;
3694 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3696 if (flags & SCF_DO_SUBSTR) {
3697 SCAN_COMMIT(pRExC_state,data,minlenp);
3698 data->longest = &(data->longest_float);
3700 is_inf = is_inf_internal = 1;
3701 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3702 cl_anything(pRExC_state, data->start_class);
3703 flags &= ~SCF_DO_STCLASS;
3705 else if (OP(scan) == GPOS) {
3706 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3707 !(delta || is_inf || (data && data->pos_delta)))
3709 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3710 RExC_rx->extflags |= RXf_ANCH_GPOS;
3711 if (RExC_rx->gofs < (U32)min)
3712 RExC_rx->gofs = min;
3714 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3718 #ifdef TRIE_STUDY_OPT
3719 #ifdef FULL_TRIE_STUDY
3720 else if (PL_regkind[OP(scan)] == TRIE) {
3721 /* NOTE - There is similar code to this block above for handling
3722 BRANCH nodes on the initial study. If you change stuff here
3724 regnode *trie_node= scan;
3725 regnode *tail= regnext(scan);
3726 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3727 I32 max1 = 0, min1 = I32_MAX;
3728 struct regnode_charclass_class accum;
3730 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3731 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3732 if (flags & SCF_DO_STCLASS)
3733 cl_init_zero(pRExC_state, &accum);
3739 const regnode *nextbranch= NULL;
3742 for ( word=1 ; word <= trie->wordcount ; word++)
3744 I32 deltanext=0, minnext=0, f = 0, fake;
3745 struct regnode_charclass_class this_class;
3747 data_fake.flags = 0;
3749 data_fake.whilem_c = data->whilem_c;
3750 data_fake.last_closep = data->last_closep;
3753 data_fake.last_closep = &fake;
3754 data_fake.pos_delta = delta;
3755 if (flags & SCF_DO_STCLASS) {
3756 cl_init(pRExC_state, &this_class);
3757 data_fake.start_class = &this_class;
3758 f = SCF_DO_STCLASS_AND;
3760 if (flags & SCF_WHILEM_VISITED_POS)
3761 f |= SCF_WHILEM_VISITED_POS;
3763 if (trie->jump[word]) {
3765 nextbranch = trie_node + trie->jump[0];
3766 scan= trie_node + trie->jump[word];
3767 /* We go from the jump point to the branch that follows
3768 it. Note this means we need the vestigal unused branches
3769 even though they arent otherwise used.
3771 minnext = study_chunk(pRExC_state, &scan, minlenp,
3772 &deltanext, (regnode *)nextbranch, &data_fake,
3773 stopparen, recursed, NULL, f,depth+1);
3775 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3776 nextbranch= regnext((regnode*)nextbranch);
3778 if (min1 > (I32)(minnext + trie->minlen))
3779 min1 = minnext + trie->minlen;
3780 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3781 max1 = minnext + deltanext + trie->maxlen;
3782 if (deltanext == I32_MAX)
3783 is_inf = is_inf_internal = 1;
3785 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3787 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3788 if ( stopmin > min + min1)
3789 stopmin = min + min1;
3790 flags &= ~SCF_DO_SUBSTR;
3792 data->flags |= SCF_SEEN_ACCEPT;
3795 if (data_fake.flags & SF_HAS_EVAL)
3796 data->flags |= SF_HAS_EVAL;
3797 data->whilem_c = data_fake.whilem_c;
3799 if (flags & SCF_DO_STCLASS)
3800 cl_or(pRExC_state, &accum, &this_class);
3803 if (flags & SCF_DO_SUBSTR) {
3804 data->pos_min += min1;
3805 data->pos_delta += max1 - min1;
3806 if (max1 != min1 || is_inf)
3807 data->longest = &(data->longest_float);
3810 delta += max1 - min1;
3811 if (flags & SCF_DO_STCLASS_OR) {
3812 cl_or(pRExC_state, data->start_class, &accum);
3814 cl_and(data->start_class, and_withp);
3815 flags &= ~SCF_DO_STCLASS;
3818 else if (flags & SCF_DO_STCLASS_AND) {
3820 cl_and(data->start_class, &accum);
3821 flags &= ~SCF_DO_STCLASS;
3824 /* Switch to OR mode: cache the old value of
3825 * data->start_class */
3827 StructCopy(data->start_class, and_withp,
3828 struct regnode_charclass_class);
3829 flags &= ~SCF_DO_STCLASS_AND;
3830 StructCopy(&accum, data->start_class,
3831 struct regnode_charclass_class);
3832 flags |= SCF_DO_STCLASS_OR;
3833 data->start_class->flags |= ANYOF_EOS;
3840 else if (PL_regkind[OP(scan)] == TRIE) {
3841 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3844 min += trie->minlen;
3845 delta += (trie->maxlen - trie->minlen);
3846 flags &= ~SCF_DO_STCLASS; /* xxx */
3847 if (flags & SCF_DO_SUBSTR) {
3848 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3849 data->pos_min += trie->minlen;
3850 data->pos_delta += (trie->maxlen - trie->minlen);
3851 if (trie->maxlen != trie->minlen)
3852 data->longest = &(data->longest_float);
3854 if (trie->jump) /* no more substrings -- for now /grr*/
3855 flags &= ~SCF_DO_SUBSTR;
3857 #endif /* old or new */
3858 #endif /* TRIE_STUDY_OPT */
3859 /* Else: zero-length, ignore. */
3860 scan = regnext(scan);
3865 stopparen = frame->stop;
3866 frame = frame->prev;
3867 goto fake_study_recurse;
3872 DEBUG_STUDYDATA("pre-fin:",data,depth);
3875 *deltap = is_inf_internal ? I32_MAX : delta;
3876 if (flags & SCF_DO_SUBSTR && is_inf)
3877 data->pos_delta = I32_MAX - data->pos_min;
3878 if (is_par > (I32)U8_MAX)
3880 if (is_par && pars==1 && data) {
3881 data->flags |= SF_IN_PAR;
3882 data->flags &= ~SF_HAS_PAR;
3884 else if (pars && data) {
3885 data->flags |= SF_HAS_PAR;
3886 data->flags &= ~SF_IN_PAR;
3888 if (flags & SCF_DO_STCLASS_OR)
3889 cl_and(data->start_class, and_withp);
3890 if (flags & SCF_TRIE_RESTUDY)
3891 data->flags |= SCF_TRIE_RESTUDY;
3893 DEBUG_STUDYDATA("post-fin:",data,depth);
3895 return min < stopmin ? min : stopmin;
3899 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
3901 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3903 Renewc(RExC_rxi->data,
3904 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3905 char, struct reg_data);
3907 Renew(RExC_rxi->data->what, count + n, U8);
3909 Newx(RExC_rxi->data->what, n, U8);
3910 RExC_rxi->data->count = count + n;
3911 Copy(s, RExC_rxi->data->what + count, n, U8);
3915 /*XXX: todo make this not included in a non debugging perl */
3916 #ifndef PERL_IN_XSUB_RE
3918 Perl_reginitcolors(pTHX)
3921 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3923 char *t = savepv(s);
3927 t = strchr(t, '\t');
3933 PL_colors[i] = t = (char *)"";
3938 PL_colors[i++] = (char *)"";
3945 #ifdef TRIE_STUDY_OPT
3946 #define CHECK_RESTUDY_GOTO \
3948 (data.flags & SCF_TRIE_RESTUDY) \
3952 #define CHECK_RESTUDY_GOTO
3956 - pregcomp - compile a regular expression into internal code
3958 * We can't allocate space until we know how big the compiled form will be,
3959 * but we can't compile it (and thus know how big it is) until we've got a
3960 * place to put the code. So we cheat: we compile it twice, once with code
3961 * generation turned off and size counting turned on, and once "for real".
3962 * This also means that we don't allocate space until we are sure that the
3963 * thing really will compile successfully, and we never have to move the
3964 * code and thus invalidate pointers into it. (Note that it has to be in
3965 * one piece because free() must be able to free it all.) [NB: not true in perl]
3967 * Beware that the optimization-preparation code in here knows about some
3968 * of the structure of the compiled regexp. [I'll say.]
3973 #ifndef PERL_IN_XSUB_RE
3974 #define RE_ENGINE_PTR &PL_core_reg_engine
3976 extern const struct regexp_engine my_reg_engine;
3977 #define RE_ENGINE_PTR &my_reg_engine
3980 #ifndef PERL_IN_XSUB_RE
3982 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3985 HV * const table = GvHV(PL_hintgv);
3986 /* Dispatch a request to compile a regexp to correct
3989 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3990 GET_RE_DEBUG_FLAGS_DECL;
3991 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3992 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3994 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3997 return CALLREGCOMP_ENG(eng, exp, xend, pm);
4000 return Perl_re_compile(aTHX_ exp, xend, pm);
4005 Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4009 register regexp_internal *ri;
4017 RExC_state_t RExC_state;
4018 RExC_state_t * const pRExC_state = &RExC_state;
4019 #ifdef TRIE_STUDY_OPT
4021 RExC_state_t copyRExC_state;
4023 GET_RE_DEBUG_FLAGS_DECL;
4024 DEBUG_r(if (!PL_colorset) reginitcolors());
4027 FAIL("NULL regexp argument");
4029 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
4033 SV *dsv= sv_newmortal();
4034 RE_PV_QUOTED_DECL(s, RExC_utf8,
4035 dsv, RExC_precomp, (xend - exp), 60);
4036 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4037 PL_colors[4],PL_colors[5],s);
4039 RExC_flags = pm->op_pmflags;
4043 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4044 RExC_seen_evals = 0;
4047 /* First pass: determine size, legality. */
4055 RExC_emit = &PL_regdummy;
4056 RExC_whilem_seen = 0;
4057 RExC_charnames = NULL;
4058 RExC_open_parens = NULL;
4059 RExC_close_parens = NULL;
4061 RExC_paren_names = NULL;
4063 RExC_paren_name_list = NULL;
4065 RExC_recurse = NULL;
4066 RExC_recurse_count = 0;
4068 #if 0 /* REGC() is (currently) a NOP at the first pass.
4069 * Clever compilers notice this and complain. --jhi */
4070 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4072 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4073 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4074 RExC_precomp = NULL;
4078 PerlIO_printf(Perl_debug_log,
4079 "Required size %"IVdf" nodes\n"
4080 "Starting second pass (creation)\n",
4083 RExC_lastparse=NULL;
4085 /* Small enough for pointer-storage convention?
4086 If extralen==0, this means that we will not need long jumps. */
4087 if (RExC_size >= 0x10000L && RExC_extralen)
4088 RExC_size += RExC_extralen;
4091 if (RExC_whilem_seen > 15)
4092 RExC_whilem_seen = 15;
4095 /* Make room for a sentinel value at the end of the program */
4099 /* Allocate space and zero-initialize. Note, the two step process
4100 of zeroing when in debug mode, thus anything assigned has to
4101 happen after that */
4102 Newxz(r, 1, regexp);
4103 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4104 char, regexp_internal);
4105 if ( r == NULL || ri == NULL )
4106 FAIL("Regexp out of space");
4108 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4109 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4111 /* bulk initialize base fields with 0. */
4112 Zero(ri, sizeof(regexp_internal), char);
4115 /* non-zero initialization begins here */
4117 r->engine= RE_ENGINE_PTR;
4119 r->prelen = xend - exp;
4120 r->precomp = savepvn(RExC_precomp, r->prelen);
4121 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4123 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4125 if (RExC_seen & REG_SEEN_RECURSE) {
4126 Newxz(RExC_open_parens, RExC_npar,regnode *);
4127 SAVEFREEPV(RExC_open_parens);
4128 Newxz(RExC_close_parens,RExC_npar,regnode *);
4129 SAVEFREEPV(RExC_close_parens);
4132 /* Useful during FAIL. */
4133 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4135 ri->offsets[0] = RExC_size;
4137 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4138 "%s %"UVuf" bytes for offset annotations.\n",
4139 ri->offsets ? "Got" : "Couldn't get",
4140 (UV)((2*RExC_size+1) * sizeof(U32))));
4145 /* Second pass: emit code. */
4146 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4151 RExC_emit_start = ri->program;
4152 RExC_emit = ri->program;
4154 /* put a sentinal on the end of the program so we can check for
4156 ri->program[RExC_size].type = 255;
4158 /* Store the count of eval-groups for security checks: */
4159 RExC_rx->seen_evals = RExC_seen_evals;
4160 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4161 if (reg(pRExC_state, 0, &flags,1) == NULL)
4164 /* XXXX To minimize changes to RE engine we always allocate
4165 3-units-long substrs field. */
4166 Newx(r->substrs, 1, struct reg_substr_data);
4167 if (RExC_recurse_count) {
4168 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4169 SAVEFREEPV(RExC_recurse);
4173 r->minlen = minlen = sawplus = sawopen = 0;
4174 Zero(r->substrs, 1, struct reg_substr_data);
4176 #ifdef TRIE_STUDY_OPT
4179 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4181 RExC_state = copyRExC_state;
4182 if (seen & REG_TOP_LEVEL_BRANCHES)
4183 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4185 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4186 if (data.last_found) {
4187 SvREFCNT_dec(data.longest_fixed);
4188 SvREFCNT_dec(data.longest_float);
4189 SvREFCNT_dec(data.last_found);
4191 StructCopy(&zero_scan_data, &data, scan_data_t);
4193 StructCopy(&zero_scan_data, &data, scan_data_t);
4194 copyRExC_state = RExC_state;
4197 StructCopy(&zero_scan_data, &data, scan_data_t);
4200 /* Dig out information for optimizations. */
4201 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
4202 pm->op_pmflags = RExC_flags;
4204 r->extflags |= RXf_UTF8; /* Unicode in it? */
4205 ri->regstclass = NULL;
4206 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4207 r->intflags |= PREGf_NAUGHTY;
4208 scan = ri->program + 1; /* First BRANCH. */
4210 /* testing for BRANCH here tells us whether there is "must appear"
4211 data in the pattern. If there is then we can use it for optimisations */
4212 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4214 STRLEN longest_float_length, longest_fixed_length;
4215 struct regnode_charclass_class ch_class; /* pointed to by data */
4217 I32 last_close = 0; /* pointed to by data */
4220 /* Skip introductions and multiplicators >= 1. */
4221 while ((OP(first) == OPEN && (sawopen = 1)) ||
4222 /* An OR of *one* alternative - should not happen now. */
4223 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4224 /* for now we can't handle lookbehind IFMATCH*/
4225 (OP(first) == IFMATCH && !first->flags) ||
4226 (OP(first) == PLUS) ||
4227 (OP(first) == MINMOD) ||
4228 /* An {n,m} with n>0 */
4229 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4232 if (OP(first) == PLUS)
4235 first += regarglen[OP(first)];
4236 if (OP(first) == IFMATCH) {
4237 first = NEXTOPER(first);
4238 first += EXTRA_STEP_2ARGS;
4239 } else /* XXX possible optimisation for /(?=)/ */
4240 first = NEXTOPER(first);
4243 /* Starting-point info. */
4245 DEBUG_PEEP("first:",first,0);
4246 /* Ignore EXACT as we deal with it later. */
4247 if (PL_regkind[OP(first)] == EXACT) {
4248 if (OP(first) == EXACT)
4249 NOOP; /* Empty, get anchored substr later. */
4250 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4251 ri->regstclass = first;
4254 else if (PL_regkind[OP(first)] == TRIE &&
4255 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4258 /* this can happen only on restudy */
4259 if ( OP(first) == TRIE ) {
4260 struct regnode_1 *trieop = (struct regnode_1 *)
4261 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4262 StructCopy(first,trieop,struct regnode_1);
4263 trie_op=(regnode *)trieop;
4265 struct regnode_charclass *trieop = (struct regnode_charclass *)
4266 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4267 StructCopy(first,trieop,struct regnode_charclass);
4268 trie_op=(regnode *)trieop;
4271 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4272 ri->regstclass = trie_op;
4275 else if (strchr((const char*)PL_simple,OP(first)))
4276 ri->regstclass = first;
4277 else if (PL_regkind[OP(first)] == BOUND ||
4278 PL_regkind[OP(first)] == NBOUND)
4279 ri->regstclass = first;
4280 else if (PL_regkind[OP(first)] == BOL) {
4281 r->extflags |= (OP(first) == MBOL
4283 : (OP(first) == SBOL
4286 first = NEXTOPER(first);
4289 else if (OP(first) == GPOS) {
4290 r->extflags |= RXf_ANCH_GPOS;
4291 first = NEXTOPER(first);
4294 else if ((!sawopen || !RExC_sawback) &&
4295 (OP(first) == STAR &&
4296 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4297 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4299 /* turn .* into ^.* with an implied $*=1 */
4301 (OP(NEXTOPER(first)) == REG_ANY)
4304 r->extflags |= type;
4305 r->intflags |= PREGf_IMPLICIT;
4306 first = NEXTOPER(first);
4309 if (sawplus && (!sawopen || !RExC_sawback)
4310 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4311 /* x+ must match at the 1st pos of run of x's */
4312 r->intflags |= PREGf_SKIP;
4314 /* Scan is after the zeroth branch, first is atomic matcher. */
4315 #ifdef TRIE_STUDY_OPT
4318 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4319 (IV)(first - scan + 1))
4323 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4324 (IV)(first - scan + 1))
4330 * If there's something expensive in the r.e., find the
4331 * longest literal string that must appear and make it the
4332 * regmust. Resolve ties in favor of later strings, since
4333 * the regstart check works with the beginning of the r.e.
4334 * and avoiding duplication strengthens checking. Not a
4335 * strong reason, but sufficient in the absence of others.
4336 * [Now we resolve ties in favor of the earlier string if
4337 * it happens that c_offset_min has been invalidated, since the
4338 * earlier string may buy us something the later one won't.]
4341 data.longest_fixed = newSVpvs("");
4342 data.longest_float = newSVpvs("");
4343 data.last_found = newSVpvs("");
4344 data.longest = &(data.longest_fixed);
4346 if (!ri->regstclass) {
4347 cl_init(pRExC_state, &ch_class);
4348 data.start_class = &ch_class;
4349 stclass_flag = SCF_DO_STCLASS_AND;
4350 } else /* XXXX Check for BOUND? */
4352 data.last_closep = &last_close;
4354 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4355 &data, -1, NULL, NULL,
4356 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4362 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4363 && data.last_start_min == 0 && data.last_end > 0
4364 && !RExC_seen_zerolen
4365 && !(RExC_seen & REG_SEEN_VERBARG)
4366 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4367 r->extflags |= RXf_CHECK_ALL;
4368 scan_commit(pRExC_state, &data,&minlen,0);
4369 SvREFCNT_dec(data.last_found);
4371 /* Note that code very similar to this but for anchored string
4372 follows immediately below, changes may need to be made to both.
4375 longest_float_length = CHR_SVLEN(data.longest_float);
4376 if (longest_float_length
4377 || (data.flags & SF_FL_BEFORE_EOL
4378 && (!(data.flags & SF_FL_BEFORE_MEOL)
4379 || (RExC_flags & RXf_PMf_MULTILINE))))
4383 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4384 && data.offset_fixed == data.offset_float_min
4385 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4386 goto remove_float; /* As in (a)+. */
4388 /* copy the information about the longest float from the reg_scan_data
4389 over to the program. */
4390 if (SvUTF8(data.longest_float)) {
4391 r->float_utf8 = data.longest_float;
4392 r->float_substr = NULL;
4394 r->float_substr = data.longest_float;
4395 r->float_utf8 = NULL;
4397 /* float_end_shift is how many chars that must be matched that
4398 follow this item. We calculate it ahead of time as once the
4399 lookbehind offset is added in we lose the ability to correctly
4401 ml = data.minlen_float ? *(data.minlen_float)
4402 : (I32)longest_float_length;
4403 r->float_end_shift = ml - data.offset_float_min
4404 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4405 + data.lookbehind_float;
4406 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4407 r->float_max_offset = data.offset_float_max;
4408 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4409 r->float_max_offset -= data.lookbehind_float;
4411 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4412 && (!(data.flags & SF_FL_BEFORE_MEOL)
4413 || (RExC_flags & RXf_PMf_MULTILINE)));
4414 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4418 r->float_substr = r->float_utf8 = NULL;
4419 SvREFCNT_dec(data.longest_float);
4420 longest_float_length = 0;
4423 /* Note that code very similar to this but for floating string
4424 is immediately above, changes may need to be made to both.
4427 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4428 if (longest_fixed_length
4429 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4430 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4431 || (RExC_flags & RXf_PMf_MULTILINE))))
4435 /* copy the information about the longest fixed
4436 from the reg_scan_data over to the program. */
4437 if (SvUTF8(data.longest_fixed)) {
4438 r->anchored_utf8 = data.longest_fixed;
4439 r->anchored_substr = NULL;
4441 r->anchored_substr = data.longest_fixed;
4442 r->anchored_utf8 = NULL;
4444 /* fixed_end_shift is how many chars that must be matched that
4445 follow this item. We calculate it ahead of time as once the
4446 lookbehind offset is added in we lose the ability to correctly
4448 ml = data.minlen_fixed ? *(data.minlen_fixed)
4449 : (I32)longest_fixed_length;
4450 r->anchored_end_shift = ml - data.offset_fixed
4451 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4452 + data.lookbehind_fixed;
4453 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4455 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4456 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4457 || (RExC_flags & RXf_PMf_MULTILINE)));
4458 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4461 r->anchored_substr = r->anchored_utf8 = NULL;
4462 SvREFCNT_dec(data.longest_fixed);
4463 longest_fixed_length = 0;
4466 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4467 ri->regstclass = NULL;
4468 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4470 && !(data.start_class->flags & ANYOF_EOS)
4471 && !cl_is_anything(data.start_class))
4473 const U32 n = add_data(pRExC_state, 1, "f");
4475 Newx(RExC_rxi->data->data[n], 1,
4476 struct regnode_charclass_class);
4477 StructCopy(data.start_class,
4478 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4479 struct regnode_charclass_class);
4480 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4481 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4482 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4483 regprop(r, sv, (regnode*)data.start_class);
4484 PerlIO_printf(Perl_debug_log,
4485 "synthetic stclass \"%s\".\n",
4486 SvPVX_const(sv));});
4489 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4490 if (longest_fixed_length > longest_float_length) {
4491 r->check_end_shift = r->anchored_end_shift;
4492 r->check_substr = r->anchored_substr;
4493 r->check_utf8 = r->anchored_utf8;
4494 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4495 if (r->extflags & RXf_ANCH_SINGLE)
4496 r->extflags |= RXf_NOSCAN;
4499 r->check_end_shift = r->float_end_shift;
4500 r->check_substr = r->float_substr;
4501 r->check_utf8 = r->float_utf8;
4502 r->check_offset_min = r->float_min_offset;
4503 r->check_offset_max = r->float_max_offset;
4505 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4506 This should be changed ASAP! */
4507 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4508 r->extflags |= RXf_USE_INTUIT;
4509 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4510 r->extflags |= RXf_INTUIT_TAIL;
4512 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4513 if ( (STRLEN)minlen < longest_float_length )
4514 minlen= longest_float_length;
4515 if ( (STRLEN)minlen < longest_fixed_length )
4516 minlen= longest_fixed_length;
4520 /* Several toplevels. Best we can is to set minlen. */
4522 struct regnode_charclass_class ch_class;
4525 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4527 scan = ri->program + 1;
4528 cl_init(pRExC_state, &ch_class);
4529 data.start_class = &ch_class;
4530 data.last_closep = &last_close;
4533 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4534 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4538 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4539 = r->float_substr = r->float_utf8 = NULL;
4540 if (!(data.start_class->flags & ANYOF_EOS)
4541 && !cl_is_anything(data.start_class))
4543 const U32 n = add_data(pRExC_state, 1, "f");
4545 Newx(RExC_rxi->data->data[n], 1,
4546 struct regnode_charclass_class);
4547 StructCopy(data.start_class,
4548 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4549 struct regnode_charclass_class);
4550 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4551 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4552 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4553 regprop(r, sv, (regnode*)data.start_class);
4554 PerlIO_printf(Perl_debug_log,
4555 "synthetic stclass \"%s\".\n",
4556 SvPVX_const(sv));});
4560 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4561 the "real" pattern. */
4563 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4564 (IV)minlen, (IV)r->minlen);
4566 r->minlenret = minlen;
4567 if (r->minlen < minlen)
4570 if (RExC_seen & REG_SEEN_GPOS)
4571 r->extflags |= RXf_GPOS_SEEN;
4572 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4573 r->extflags |= RXf_LOOKBEHIND_SEEN;
4574 if (RExC_seen & REG_SEEN_EVAL)
4575 r->extflags |= RXf_EVAL_SEEN;
4576 if (RExC_seen & REG_SEEN_CANY)
4577 r->extflags |= RXf_CANY_SEEN;
4578 if (RExC_seen & REG_SEEN_VERBARG)
4579 r->intflags |= PREGf_VERBARG_SEEN;
4580 if (RExC_seen & REG_SEEN_CUTGROUP)
4581 r->intflags |= PREGf_CUTGROUP_SEEN;
4582 if (RExC_paren_names)
4583 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4585 r->paren_names = NULL;
4586 if (r->prelen == 3 && strEQ("\\s+", r->precomp))
4587 r->extflags |= RXf_WHITE;
4588 else if (r->prelen == 1 && r->precomp[0] == '^')
4589 r->extflags |= RXf_START_ONLY;
4592 if (RExC_paren_names) {
4593 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4594 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4597 ri->name_list_idx = 0;
4599 if (RExC_recurse_count) {
4600 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4601 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4602 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4605 Newxz(r->startp, RExC_npar, I32);
4606 Newxz(r->endp, RExC_npar, I32);
4607 /* assume we don't need to swap parens around before we match */
4610 PerlIO_printf(Perl_debug_log,"Final program:\n");
4613 DEBUG_OFFSETS_r(if (ri->offsets) {
4614 const U32 len = ri->offsets[0];
4616 GET_RE_DEBUG_FLAGS_DECL;
4617 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
4618 for (i = 1; i <= len; i++) {
4619 if (ri->offsets[i*2-1] || ri->offsets[i*2])
4620 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4621 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
4623 PerlIO_printf(Perl_debug_log, "\n");
4628 #undef CORE_ONLY_BLOCK
4629 #undef RE_ENGINE_PTR
4631 #ifndef PERL_IN_XSUB_RE
4633 Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags)
4635 AV *retarray = NULL;
4640 if (from_re || PL_curpm) {
4641 const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm);
4642 if (rx && rx->paren_names) {
4643 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4646 SV* sv_dat=HeVAL(he_str);
4647 I32 *nums=(I32*)SvPVX(sv_dat);
4648 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4649 if ((I32)(rx->lastparen) >= nums[i] &&
4650 rx->endp[nums[i]] != -1)
4652 ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
4656 ret = newSVsv(&PL_sv_undef);
4660 av_push(retarray, ret);
4664 return (SV*)retarray;
4672 Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags)
4677 SV *sv = usesv ? usesv : newSVpvs("");
4678 PERL_UNUSED_ARG(flags);
4681 sv_setsv(sv,&PL_sv_undef);
4685 if (paren == -2 && rx->startp[0] != -1) {
4691 if (paren == -1 && rx->endp[0] != -1) {
4693 s = rx->subbeg + rx->endp[0];
4694 i = rx->sublen - rx->endp[0];
4697 if ( 0 <= paren && paren <= (I32)rx->nparens &&
4698 (s1 = rx->startp[paren]) != -1 &&
4699 (t1 = rx->endp[paren]) != -1)
4703 s = rx->subbeg + s1;
4705 sv_setsv(sv,&PL_sv_undef);
4708 assert(rx->sublen >= (s - rx->subbeg) + i );
4710 const int oldtainted = PL_tainted;
4712 sv_setpvn(sv, s, i);
4713 PL_tainted = oldtainted;
4714 if ( (rx->extflags & RXf_CANY_SEEN)
4715 ? (RX_MATCH_UTF8(rx)
4716 && (!i || is_utf8_string((U8*)s, i)))
4717 : (RX_MATCH_UTF8(rx)) )
4724 if (RX_MATCH_TAINTED(rx)) {
4725 if (SvTYPE(sv) >= SVt_PVMG) {
4726 MAGIC* const mg = SvMAGIC(sv);
4729 SvMAGIC_set(sv, mg->mg_moremagic);
4731 if ((mgt = SvMAGIC(sv))) {
4732 mg->mg_moremagic = mgt;
4733 SvMAGIC_set(sv, mg);
4743 sv_setsv(sv,&PL_sv_undef);
4749 /* Scans the name of a named buffer from the pattern.
4750 * If flags is REG_RSN_RETURN_NULL returns null.
4751 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4752 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4753 * to the parsed name as looked up in the RExC_paren_names hash.
4754 * If there is an error throws a vFAIL().. type exception.
4757 #define REG_RSN_RETURN_NULL 0
4758 #define REG_RSN_RETURN_NAME 1
4759 #define REG_RSN_RETURN_DATA 2
4762 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4763 char *name_start = RExC_parse;
4765 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
4766 /* skip IDFIRST by using do...while */
4769 RExC_parse += UTF8SKIP(RExC_parse);
4770 } while (isALNUM_utf8((U8*)RExC_parse));
4774 } while (isALNUM(*RExC_parse));
4778 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4779 (int)(RExC_parse - name_start)));
4782 if ( flags == REG_RSN_RETURN_NAME)
4784 else if (flags==REG_RSN_RETURN_DATA) {
4787 if ( ! sv_name ) /* should not happen*/
4788 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4789 if (RExC_paren_names)
4790 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4792 sv_dat = HeVAL(he_str);
4794 vFAIL("Reference to nonexistent named group");
4798 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4805 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4806 int rem=(int)(RExC_end - RExC_parse); \
4815 if (RExC_lastparse!=RExC_parse) \
4816 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4819 iscut ? "..." : "<" \
4822 PerlIO_printf(Perl_debug_log,"%16s",""); \
4827 num=REG_NODE_NUM(RExC_emit); \
4828 if (RExC_lastnum!=num) \
4829 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4831 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4832 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4833 (int)((depth*2)), "", \
4837 RExC_lastparse=RExC_parse; \
4842 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4843 DEBUG_PARSE_MSG((funcname)); \
4844 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4846 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4847 DEBUG_PARSE_MSG((funcname)); \
4848 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4851 - reg - regular expression, i.e. main body or parenthesized thing
4853 * Caller must absorb opening parenthesis.
4855 * Combining parenthesis handling with the base level of regular expression
4856 * is a trifle forced, but the need to tie the tails of the branches to what
4857 * follows makes it hard to avoid.
4859 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4861 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4863 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4866 /* this idea is borrowed from STR_WITH_LEN in handy.h */
4867 #define CHECK_WORD(s,v,l) \
4868 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4871 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4872 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4875 register regnode *ret; /* Will be the head of the group. */
4876 register regnode *br;
4877 register regnode *lastbr;
4878 register regnode *ender = NULL;
4879 register I32 parno = 0;
4881 const I32 oregflags = RExC_flags;
4882 bool have_branch = 0;
4885 /* for (?g), (?gc), and (?o) warnings; warning
4886 about (?c) will warn about (?g) -- japhy */
4888 #define WASTED_O 0x01
4889 #define WASTED_G 0x02
4890 #define WASTED_C 0x04
4891 #define WASTED_GC (0x02|0x04)
4892 I32 wastedflags = 0x00;
4894 char * parse_start = RExC_parse; /* MJD */
4895 char * const oregcomp_parse = RExC_parse;
4897 GET_RE_DEBUG_FLAGS_DECL;
4898 DEBUG_PARSE("reg ");
4901 *flagp = 0; /* Tentatively. */
4904 /* Make an OPEN node, if parenthesized. */
4906 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4907 char *start_verb = RExC_parse;
4908 STRLEN verb_len = 0;
4909 char *start_arg = NULL;
4910 unsigned char op = 0;
4912 int internal_argval = 0; /* internal_argval is only useful if !argok */
4913 while ( *RExC_parse && *RExC_parse != ')' ) {
4914 if ( *RExC_parse == ':' ) {
4915 start_arg = RExC_parse + 1;
4921 verb_len = RExC_parse - start_verb;
4924 while ( *RExC_parse && *RExC_parse != ')' )
4926 if ( *RExC_parse != ')' )
4927 vFAIL("Unterminated verb pattern argument");
4928 if ( RExC_parse == start_arg )
4931 if ( *RExC_parse != ')' )
4932 vFAIL("Unterminated verb pattern");
4935 switch ( *start_verb ) {
4936 case 'A': /* (*ACCEPT) */
4937 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4939 internal_argval = RExC_nestroot;
4942 case 'C': /* (*COMMIT) */
4943 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4946 case 'F': /* (*FAIL) */
4947 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4952 case ':': /* (*:NAME) */
4953 case 'M': /* (*MARK:NAME) */
4954 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4959 case 'P': /* (*PRUNE) */
4960 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4963 case 'S': /* (*SKIP) */
4964 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4967 case 'T': /* (*THEN) */
4968 /* [19:06] <TimToady> :: is then */
4969 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4971 RExC_seen |= REG_SEEN_CUTGROUP;
4977 vFAIL3("Unknown verb pattern '%.*s'",
4978 verb_len, start_verb);
4981 if ( start_arg && internal_argval ) {
4982 vFAIL3("Verb pattern '%.*s' may not have an argument",
4983 verb_len, start_verb);
4984 } else if ( argok < 0 && !start_arg ) {
4985 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4986 verb_len, start_verb);
4988 ret = reganode(pRExC_state, op, internal_argval);
4989 if ( ! internal_argval && ! SIZE_ONLY ) {
4991 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4992 ARG(ret) = add_data( pRExC_state, 1, "S" );
4993 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5000 if (!internal_argval)
5001 RExC_seen |= REG_SEEN_VERBARG;
5002 } else if ( start_arg ) {
5003 vFAIL3("Verb pattern '%.*s' may not have an argument",
5004 verb_len, start_verb);
5006 ret = reg_node(pRExC_state, op);
5008 nextchar(pRExC_state);
5011 if (*RExC_parse == '?') { /* (?...) */
5012 bool is_logical = 0;
5013 const char * const seqstart = RExC_parse;
5016 paren = *RExC_parse++;
5017 ret = NULL; /* For look-ahead/behind. */
5020 case 'P': /* (?P...) variants for those used to PCRE/Python */
5021 paren = *RExC_parse++;
5022 if ( paren == '<') /* (?P<...>) named capture */
5024 else if (paren == '>') { /* (?P>name) named recursion */
5025 goto named_recursion;
5027 else if (paren == '=') { /* (?P=...) named backref */
5028 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5029 you change this make sure you change that */
5030 char* name_start = RExC_parse;
5032 SV *sv_dat = reg_scan_name(pRExC_state,
5033 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5034 if (RExC_parse == name_start || *RExC_parse != ')')
5035 vFAIL2("Sequence %.3s... not terminated",parse_start);
5038 num = add_data( pRExC_state, 1, "S" );
5039 RExC_rxi->data->data[num]=(void*)sv_dat;
5040 SvREFCNT_inc(sv_dat);
5043 ret = reganode(pRExC_state,
5044 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5048 Set_Node_Offset(ret, parse_start+1);
5049 Set_Node_Cur_Length(ret); /* MJD */
5051 nextchar(pRExC_state);
5055 case '<': /* (?<...) */
5056 if (*RExC_parse == '!')
5058 else if (*RExC_parse != '=')
5064 case '\'': /* (?'...') */
5065 name_start= RExC_parse;
5066 svname = reg_scan_name(pRExC_state,
5067 SIZE_ONLY ? /* reverse test from the others */
5068 REG_RSN_RETURN_NAME :
5069 REG_RSN_RETURN_NULL);
5070 if (RExC_parse == name_start)
5072 if (*RExC_parse != paren)
5073 vFAIL2("Sequence (?%c... not terminated",
5074 paren=='>' ? '<' : paren);
5078 if (!svname) /* shouldnt happen */
5080 "panic: reg_scan_name returned NULL");
5081 if (!RExC_paren_names) {
5082 RExC_paren_names= newHV();
5083 sv_2mortal((SV*)RExC_paren_names);
5085 RExC_paren_name_list= newAV();
5086 sv_2mortal((SV*)RExC_paren_name_list);
5089 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5091 sv_dat = HeVAL(he_str);
5093 /* croak baby croak */
5095 "panic: paren_name hash element allocation failed");
5096 } else if ( SvPOK(sv_dat) ) {
5097 IV count=SvIV(sv_dat);
5098 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
5099 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
5100 pv[count]=RExC_npar;
5103 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5104 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5109 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5110 SvREFCNT_dec(svname);
5113 /*sv_dump(sv_dat);*/
5115 nextchar(pRExC_state);
5117 goto capturing_parens;
5119 RExC_seen |= REG_SEEN_LOOKBEHIND;
5121 case '=': /* (?=...) */
5122 case '!': /* (?!...) */
5123 RExC_seen_zerolen++;
5124 if (*RExC_parse == ')') {
5125 ret=reg_node(pRExC_state, OPFAIL);
5126 nextchar(pRExC_state);
5129 case ':': /* (?:...) */
5130 case '>': /* (?>...) */
5132 case '$': /* (?$...) */
5133 case '@': /* (?@...) */
5134 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5136 case '#': /* (?#...) */
5137 while (*RExC_parse && *RExC_parse != ')')
5139 if (*RExC_parse != ')')
5140 FAIL("Sequence (?#... not terminated");
5141 nextchar(pRExC_state);
5144 case '0' : /* (?0) */
5145 case 'R' : /* (?R) */
5146 if (*RExC_parse != ')')
5147 FAIL("Sequence (?R) not terminated");
5148 ret = reg_node(pRExC_state, GOSTART);
5149 nextchar(pRExC_state);
5152 { /* named and numeric backreferences */
5154 case '&': /* (?&NAME) */
5155 parse_start = RExC_parse - 1;
5158 SV *sv_dat = reg_scan_name(pRExC_state,
5159 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5160 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5162 goto gen_recurse_regop;
5165 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5167 vFAIL("Illegal pattern");
5169 goto parse_recursion;
5171 case '-': /* (?-1) */
5172 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5173 RExC_parse--; /* rewind to let it be handled later */
5177 case '1': case '2': case '3': case '4': /* (?1) */
5178 case '5': case '6': case '7': case '8': case '9':
5181 num = atoi(RExC_parse);
5182 parse_start = RExC_parse - 1; /* MJD */
5183 if (*RExC_parse == '-')
5185 while (isDIGIT(*RExC_parse))
5187 if (*RExC_parse!=')')
5188 vFAIL("Expecting close bracket");
5191 if ( paren == '-' ) {
5193 Diagram of capture buffer numbering.
5194 Top line is the normal capture buffer numbers
5195 Botton line is the negative indexing as from
5199 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5203 num = RExC_npar + num;
5206 vFAIL("Reference to nonexistent group");
5208 } else if ( paren == '+' ) {
5209 num = RExC_npar + num - 1;
5212 ret = reganode(pRExC_state, GOSUB, num);
5214 if (num > (I32)RExC_rx->nparens) {
5216 vFAIL("Reference to nonexistent group");
5218 ARG2L_SET( ret, RExC_recurse_count++);
5220 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5221 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5225 RExC_seen |= REG_SEEN_RECURSE;
5226 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5227 Set_Node_Offset(ret, parse_start); /* MJD */
5229 nextchar(pRExC_state);
5231 } /* named and numeric backreferences */
5234 case 'p': /* (?p...) */
5235 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5236 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5238 case '?': /* (??...) */
5240 if (*RExC_parse != '{')
5242 paren = *RExC_parse++;
5244 case '{': /* (?{...}) */
5249 char *s = RExC_parse;
5251 RExC_seen_zerolen++;
5252 RExC_seen |= REG_SEEN_EVAL;
5253 while (count && (c = *RExC_parse)) {
5264 if (*RExC_parse != ')') {
5266 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5270 OP_4tree *sop, *rop;
5271 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5274 Perl_save_re_context(aTHX);
5275 rop = sv_compile_2op(sv, &sop, "re", &pad);
5276 sop->op_private |= OPpREFCOUNTED;
5277 /* re_dup will OpREFCNT_inc */
5278 OpREFCNT_set(sop, 1);
5281 n = add_data(pRExC_state, 3, "nop");
5282 RExC_rxi->data->data[n] = (void*)rop;
5283 RExC_rxi->data->data[n+1] = (void*)sop;
5284 RExC_rxi->data->data[n+2] = (void*)pad;
5287 else { /* First pass */
5288 if (PL_reginterp_cnt < ++RExC_seen_evals
5290 /* No compiled RE interpolated, has runtime
5291 components ===> unsafe. */
5292 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5293 if (PL_tainting && PL_tainted)
5294 FAIL("Eval-group in insecure regular expression");
5295 #if PERL_VERSION > 8
5296 if (IN_PERL_COMPILETIME)
5301 nextchar(pRExC_state);
5303 ret = reg_node(pRExC_state, LOGICAL);
5306 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5307 /* deal with the length of this later - MJD */
5310 ret = reganode(pRExC_state, EVAL, n);
5311 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5312 Set_Node_Offset(ret, parse_start);
5315 case '(': /* (?(?{...})...) and (?(?=...)...) */
5318 if (RExC_parse[0] == '?') { /* (?(?...)) */
5319 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5320 || RExC_parse[1] == '<'
5321 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5324 ret = reg_node(pRExC_state, LOGICAL);
5327 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5331 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5332 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5334 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5335 char *name_start= RExC_parse++;
5337 SV *sv_dat=reg_scan_name(pRExC_state,
5338 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5339 if (RExC_parse == name_start || *RExC_parse != ch)
5340 vFAIL2("Sequence (?(%c... not terminated",
5341 (ch == '>' ? '<' : ch));
5344 num = add_data( pRExC_state, 1, "S" );
5345 RExC_rxi->data->data[num]=(void*)sv_dat;
5346 SvREFCNT_inc(sv_dat);
5348 ret = reganode(pRExC_state,NGROUPP,num);
5349 goto insert_if_check_paren;
5351 else if (RExC_parse[0] == 'D' &&
5352 RExC_parse[1] == 'E' &&
5353 RExC_parse[2] == 'F' &&
5354 RExC_parse[3] == 'I' &&
5355 RExC_parse[4] == 'N' &&
5356 RExC_parse[5] == 'E')
5358 ret = reganode(pRExC_state,DEFINEP,0);
5361 goto insert_if_check_paren;
5363 else if (RExC_parse[0] == 'R') {
5366 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5367 parno = atoi(RExC_parse++);
5368 while (isDIGIT(*RExC_parse))
5370 } else if (RExC_parse[0] == '&') {
5373 sv_dat = reg_scan_name(pRExC_state,
5374 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5375 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5377 ret = reganode(pRExC_state,INSUBP,parno);
5378 goto insert_if_check_paren;
5380 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5383 parno = atoi(RExC_parse++);
5385 while (isDIGIT(*RExC_parse))
5387 ret = reganode(pRExC_state, GROUPP, parno);
5389 insert_if_check_paren:
5390 if ((c = *nextchar(pRExC_state)) != ')')
5391 vFAIL("Switch condition not recognized");
5393 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5394 br = regbranch(pRExC_state, &flags, 1,depth+1);
5396 br = reganode(pRExC_state, LONGJMP, 0);
5398 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5399 c = *nextchar(pRExC_state);
5404 vFAIL("(?(DEFINE)....) does not allow branches");
5405 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5406 regbranch(pRExC_state, &flags, 1,depth+1);
5407 REGTAIL(pRExC_state, ret, lastbr);
5410 c = *nextchar(pRExC_state);
5415 vFAIL("Switch (?(condition)... contains too many branches");
5416 ender = reg_node(pRExC_state, TAIL);
5417 REGTAIL(pRExC_state, br, ender);
5419 REGTAIL(pRExC_state, lastbr, ender);
5420 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5423 REGTAIL(pRExC_state, ret, ender);
5427 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5431 RExC_parse--; /* for vFAIL to print correctly */
5432 vFAIL("Sequence (? incomplete");
5436 parse_flags: /* (?i) */
5438 U32 posflags = 0, negflags = 0;
5439 U32 *flagsp = &posflags;
5441 while (*RExC_parse) {
5442 /* && strchr("iogcmsx", *RExC_parse) */
5443 /* (?g), (?gc) and (?o) are useless here
5444 and must be globally applied -- japhy */
5445 switch (*RExC_parse) {
5446 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5450 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5451 if (! (wastedflags & wflagbit) ) {
5452 wastedflags |= wflagbit;
5455 "Useless (%s%c) - %suse /%c modifier",
5456 flagsp == &negflags ? "?-" : "?",
5458 flagsp == &negflags ? "don't " : "",
5466 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5467 if (! (wastedflags & WASTED_C) ) {
5468 wastedflags |= WASTED_GC;
5471 "Useless (%sc) - %suse /gc modifier",
5472 flagsp == &negflags ? "?-" : "?",
5473 flagsp == &negflags ? "don't " : ""
5479 if (flagsp == &negflags) {
5480 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5481 vWARN(RExC_parse + 1,"Useless use of (?-k)");
5483 *flagsp |= RXf_PMf_KEEPCOPY;
5487 if (flagsp == &negflags)
5490 wastedflags = 0; /* reset so (?g-c) warns twice */
5496 RExC_flags |= posflags;
5497 RExC_flags &= ~negflags;
5498 nextchar(pRExC_state);
5510 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5515 }} /* one for the default block, one for the switch */
5522 ret = reganode(pRExC_state, OPEN, parno);
5525 RExC_nestroot = parno;
5526 if (RExC_seen & REG_SEEN_RECURSE) {
5527 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5528 "Setting open paren #%"IVdf" to %d\n",
5529 (IV)parno, REG_NODE_NUM(ret)));
5530 RExC_open_parens[parno-1]= ret;
5533 Set_Node_Length(ret, 1); /* MJD */
5534 Set_Node_Offset(ret, RExC_parse); /* MJD */
5542 /* Pick up the branches, linking them together. */
5543 parse_start = RExC_parse; /* MJD */
5544 br = regbranch(pRExC_state, &flags, 1,depth+1);
5545 /* branch_len = (paren != 0); */
5549 if (*RExC_parse == '|') {
5550 if (!SIZE_ONLY && RExC_extralen) {
5551 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5554 reginsert(pRExC_state, BRANCH, br, depth+1);
5555 Set_Node_Length(br, paren != 0);
5556 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5560 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5562 else if (paren == ':') {
5563 *flagp |= flags&SIMPLE;
5565 if (is_open) { /* Starts with OPEN. */
5566 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5568 else if (paren != '?') /* Not Conditional */
5570 *flagp |= flags & (SPSTART | HASWIDTH);
5572 while (*RExC_parse == '|') {
5573 if (!SIZE_ONLY && RExC_extralen) {
5574 ender = reganode(pRExC_state, LONGJMP,0);
5575 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5578 RExC_extralen += 2; /* Account for LONGJMP. */
5579 nextchar(pRExC_state);
5580 br = regbranch(pRExC_state, &flags, 0, depth+1);
5584 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5588 *flagp |= flags&SPSTART;
5591 if (have_branch || paren != ':') {
5592 /* Make a closing node, and hook it on the end. */
5595 ender = reg_node(pRExC_state, TAIL);
5598 ender = reganode(pRExC_state, CLOSE, parno);
5599 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5600 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5601 "Setting close paren #%"IVdf" to %d\n",
5602 (IV)parno, REG_NODE_NUM(ender)));
5603 RExC_close_parens[parno-1]= ender;
5604 if (RExC_nestroot == parno)
5607 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5608 Set_Node_Length(ender,1); /* MJD */
5614 *flagp &= ~HASWIDTH;
5617 ender = reg_node(pRExC_state, SUCCEED);
5620 ender = reg_node(pRExC_state, END);
5622 assert(!RExC_opend); /* there can only be one! */
5627 REGTAIL(pRExC_state, lastbr, ender);
5629 if (have_branch && !SIZE_ONLY) {
5631 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5633 /* Hook the tails of the branches to the closing node. */
5634 for (br = ret; br; br = regnext(br)) {
5635 const U8 op = PL_regkind[OP(br)];
5637 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5639 else if (op == BRANCHJ) {
5640 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5648 static const char parens[] = "=!<,>";
5650 if (paren && (p = strchr(parens, paren))) {
5651 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5652 int flag = (p - parens) > 1;
5655 node = SUSPEND, flag = 0;
5656 reginsert(pRExC_state, node,ret, depth+1);
5657 Set_Node_Cur_Length(ret);
5658 Set_Node_Offset(ret, parse_start + 1);
5660 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5664 /* Check for proper termination. */
5666 RExC_flags = oregflags;
5667 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5668 RExC_parse = oregcomp_parse;
5669 vFAIL("Unmatched (");
5672 else if (!paren && RExC_parse < RExC_end) {
5673 if (*RExC_parse == ')') {
5675 vFAIL("Unmatched )");
5678 FAIL("Junk on end of regexp"); /* "Can't happen". */
5686 - regbranch - one alternative of an | operator
5688 * Implements the concatenation operator.
5691 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5694 register regnode *ret;
5695 register regnode *chain = NULL;
5696 register regnode *latest;
5697 I32 flags = 0, c = 0;
5698 GET_RE_DEBUG_FLAGS_DECL;
5699 DEBUG_PARSE("brnc");
5703 if (!SIZE_ONLY && RExC_extralen)
5704 ret = reganode(pRExC_state, BRANCHJ,0);
5706 ret = reg_node(pRExC_state, BRANCH);
5707 Set_Node_Length(ret, 1);
5711 if (!first && SIZE_ONLY)
5712 RExC_extralen += 1; /* BRANCHJ */
5714 *flagp = WORST; /* Tentatively. */
5717 nextchar(pRExC_state);
5718 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5720 latest = regpiece(pRExC_state, &flags,depth+1);
5721 if (latest == NULL) {
5722 if (flags & TRYAGAIN)
5726 else if (ret == NULL)
5728 *flagp |= flags&HASWIDTH;
5729 if (chain == NULL) /* First piece. */
5730 *flagp |= flags&SPSTART;
5733 REGTAIL(pRExC_state, chain, latest);
5738 if (chain == NULL) { /* Loop ran zero times. */
5739 chain = reg_node(pRExC_state, NOTHING);
5744 *flagp |= flags&SIMPLE;
5751 - regpiece - something followed by possible [*+?]
5753 * Note that the branching code sequences used for ? and the general cases
5754 * of * and + are somewhat optimized: they use the same NOTHING node as
5755 * both the endmarker for their branch list and the body of the last branch.
5756 * It might seem that this node could be dispensed with entirely, but the
5757 * endmarker role is not redundant.
5760 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5763 register regnode *ret;
5765 register char *next;
5767 const char * const origparse = RExC_parse;
5769 I32 max = REG_INFTY;
5771 const char *maxpos = NULL;
5772 GET_RE_DEBUG_FLAGS_DECL;
5773 DEBUG_PARSE("piec");
5775 ret = regatom(pRExC_state, &flags,depth+1);
5777 if (flags & TRYAGAIN)
5784 if (op == '{' && regcurly(RExC_parse)) {
5786 parse_start = RExC_parse; /* MJD */
5787 next = RExC_parse + 1;
5788 while (isDIGIT(*next) || *next == ',') {
5797 if (*next == '}') { /* got one */
5801 min = atoi(RExC_parse);
5805 maxpos = RExC_parse;
5807 if (!max && *maxpos != '0')
5808 max = REG_INFTY; /* meaning "infinity" */
5809 else if (max >= REG_INFTY)
5810 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5812 nextchar(pRExC_state);
5815 if ((flags&SIMPLE)) {
5816 RExC_naughty += 2 + RExC_naughty / 2;
5817 reginsert(pRExC_state, CURLY, ret, depth+1);
5818 Set_Node_Offset(ret, parse_start+1); /* MJD */
5819 Set_Node_Cur_Length(ret);
5822 regnode * const w = reg_node(pRExC_state, WHILEM);
5825 REGTAIL(pRExC_state, ret, w);
5826 if (!SIZE_ONLY && RExC_extralen) {
5827 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5828 reginsert(pRExC_state, NOTHING,ret, depth+1);
5829 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5831 reginsert(pRExC_state, CURLYX,ret, depth+1);
5833 Set_Node_Offset(ret, parse_start+1);
5834 Set_Node_Length(ret,
5835 op == '{' ? (RExC_parse - parse_start) : 1);
5837 if (!SIZE_ONLY && RExC_extralen)
5838 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5839 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5841 RExC_whilem_seen++, RExC_extralen += 3;
5842 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5850 if (max && max < min)
5851 vFAIL("Can't do {n,m} with n > m");
5853 ARG1_SET(ret, (U16)min);
5854 ARG2_SET(ret, (U16)max);
5866 #if 0 /* Now runtime fix should be reliable. */
5868 /* if this is reinstated, don't forget to put this back into perldiag:
5870 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5872 (F) The part of the regexp subject to either the * or + quantifier
5873 could match an empty string. The {#} shows in the regular
5874 expression about where the problem was discovered.
5878 if (!(flags&HASWIDTH) && op != '?')
5879 vFAIL("Regexp *+ operand could be empty");
5882 parse_start = RExC_parse;
5883 nextchar(pRExC_state);
5885 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5887 if (op == '*' && (flags&SIMPLE)) {
5888 reginsert(pRExC_state, STAR, ret, depth+1);
5892 else if (op == '*') {
5896 else if (op == '+' && (flags&SIMPLE)) {
5897 reginsert(pRExC_state, PLUS, ret, depth+1);
5901 else if (op == '+') {
5905 else if (op == '?') {
5910 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5912 "%.*s matches null string many times",
5913 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5917 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5918 nextchar(pRExC_state);
5919 reginsert(pRExC_state, MINMOD, ret, depth+1);
5920 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5922 #ifndef REG_ALLOW_MINMOD_SUSPEND
5925 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5927 nextchar(pRExC_state);
5928 ender = reg_node(pRExC_state, SUCCEED);
5929 REGTAIL(pRExC_state, ret, ender);
5930 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5932 ender = reg_node(pRExC_state, TAIL);
5933 REGTAIL(pRExC_state, ret, ender);
5937 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5939 vFAIL("Nested quantifiers");
5946 /* reg_namedseq(pRExC_state,UVp)
5948 This is expected to be called by a parser routine that has
5949 recognized'\N' and needs to handle the rest. RExC_parse is
5950 expected to point at the first char following the N at the time
5953 If valuep is non-null then it is assumed that we are parsing inside
5954 of a charclass definition and the first codepoint in the resolved
5955 string is returned via *valuep and the routine will return NULL.
5956 In this mode if a multichar string is returned from the charnames
5957 handler a warning will be issued, and only the first char in the
5958 sequence will be examined. If the string returned is zero length
5959 then the value of *valuep is undefined and NON-NULL will
5960 be returned to indicate failure. (This will NOT be a valid pointer
5963 If value is null then it is assumed that we are parsing normal text
5964 and inserts a new EXACT node into the program containing the resolved
5965 string and returns a pointer to the new node. If the string is
5966 zerolength a NOTHING node is emitted.
5968 On success RExC_parse is set to the char following the endbrace.
5969 Parsing failures will generate a fatal errorvia vFAIL(...)
5971 NOTE: We cache all results from the charnames handler locally in
5972 the RExC_charnames hash (created on first use) to prevent a charnames
5973 handler from playing silly-buggers and returning a short string and
5974 then a long string for a given pattern. Since the regexp program
5975 size is calculated during an initial parse this would result
5976 in a buffer overrun so we cache to prevent the charname result from
5977 changing during the course of the parse.
5981 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5983 char * name; /* start of the content of the name */
5984 char * endbrace; /* endbrace following the name */
5987 STRLEN len; /* this has various purposes throughout the code */
5988 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5989 regnode *ret = NULL;
5991 if (*RExC_parse != '{') {
5992 vFAIL("Missing braces on \\N{}");
5994 name = RExC_parse+1;
5995 endbrace = strchr(RExC_parse, '}');
5998 vFAIL("Missing right brace on \\N{}");
6000 RExC_parse = endbrace + 1;
6003 /* RExC_parse points at the beginning brace,
6004 endbrace points at the last */
6005 if ( name[0]=='U' && name[1]=='+' ) {
6006 /* its a "unicode hex" notation {U+89AB} */
6007 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6008 | PERL_SCAN_DISALLOW_PREFIX
6009 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6011 len = (STRLEN)(endbrace - name - 2);
6012 cp = grok_hex(name + 2, &len, &fl, NULL);
6013 if ( len != (STRLEN)(endbrace - name - 2) ) {
6022 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
6024 /* fetch the charnames handler for this scope */
6025 HV * const table = GvHV(PL_hintgv);
6027 hv_fetchs(table, "charnames", FALSE) :
6029 SV *cv= cvp ? *cvp : NULL;
6032 /* create an SV with the name as argument */
6033 sv_name = newSVpvn(name, endbrace - name);
6035 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6036 vFAIL2("Constant(\\N{%s}) unknown: "
6037 "(possibly a missing \"use charnames ...\")",
6040 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6041 vFAIL2("Constant(\\N{%s}): "
6042 "$^H{charnames} is not defined",SvPVX(sv_name));
6047 if (!RExC_charnames) {
6048 /* make sure our cache is allocated */
6049 RExC_charnames = newHV();
6050 sv_2mortal((SV*)RExC_charnames);
6052 /* see if we have looked this one up before */
6053 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6055 sv_str = HeVAL(he_str);
6068 count= call_sv(cv, G_SCALAR);
6070 if (count == 1) { /* XXXX is this right? dmq */
6072 SvREFCNT_inc_simple_void(sv_str);
6080 if ( !sv_str || !SvOK(sv_str) ) {
6081 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6082 "did not return a defined value",SvPVX(sv_name));
6084 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6089 char *p = SvPV(sv_str, len);
6092 if ( SvUTF8(sv_str) ) {
6093 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6097 We have to turn on utf8 for high bit chars otherwise
6098 we get failures with
6100 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6101 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6103 This is different from what \x{} would do with the same
6104 codepoint, where the condition is > 0xFF.
6111 /* warn if we havent used the whole string? */
6113 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6115 "Ignoring excess chars from \\N{%s} in character class",
6119 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6121 "Ignoring zero length \\N{%s} in character class",
6126 SvREFCNT_dec(sv_name);
6128 SvREFCNT_dec(sv_str);
6129 return len ? NULL : (regnode *)&len;
6130 } else if(SvCUR(sv_str)) {
6135 char * parse_start = name-3; /* needed for the offsets */
6136 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6138 ret = reg_node(pRExC_state,
6139 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6142 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6143 sv_utf8_upgrade(sv_str);
6144 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6148 p = SvPV(sv_str, len);
6150 /* len is the length written, charlen is the size the char read */
6151 for ( len = 0; p < pend; p += charlen ) {
6153 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6155 STRLEN foldlen,numlen;
6156 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6157 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6158 /* Emit all the Unicode characters. */
6160 for (foldbuf = tmpbuf;
6164 uvc = utf8_to_uvchr(foldbuf, &numlen);
6166 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6169 /* In EBCDIC the numlen
6170 * and unilen can differ. */
6172 if (numlen >= foldlen)
6176 break; /* "Can't happen." */
6179 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6191 RExC_size += STR_SZ(len);
6194 RExC_emit += STR_SZ(len);
6196 Set_Node_Cur_Length(ret); /* MJD */
6198 nextchar(pRExC_state);
6200 ret = reg_node(pRExC_state,NOTHING);
6203 SvREFCNT_dec(sv_str);
6206 SvREFCNT_dec(sv_name);
6216 * It returns the code point in utf8 for the value in *encp.
6217 * value: a code value in the source encoding
6218 * encp: a pointer to an Encode object
6220 * If the result from Encode is not a single character,
6221 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6224 S_reg_recode(pTHX_ const char value, SV **encp)
6227 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6228 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6230 const STRLEN newlen = SvCUR(sv);
6231 UV uv = UNICODE_REPLACEMENT;
6235 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6238 if (!newlen || numlen != newlen) {
6239 uv = UNICODE_REPLACEMENT;
6248 - regatom - the lowest level
6250 Try to identify anything special at the start of the pattern. If there
6251 is, then handle it as required. This may involve generating a single regop,
6252 such as for an assertion; or it may involve recursing, such as to
6253 handle a () structure.
6255 If the string doesn't start with something special then we gobble up
6256 as much literal text as we can.
6258 Once we have been able to handle whatever type of thing started the
6259 sequence, we return.
6261 Note: we have to be careful with escapes, as they can be both literal
6262 and special, and in the case of \10 and friends can either, depending
6263 on context. Specifically there are two seperate switches for handling
6264 escape sequences, with the one for handling literal escapes requiring
6265 a dummy entry for all of the special escapes that are actually handled
6270 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6273 register regnode *ret = NULL;
6275 char *parse_start = RExC_parse;
6276 GET_RE_DEBUG_FLAGS_DECL;
6277 DEBUG_PARSE("atom");
6278 *flagp = WORST; /* Tentatively. */
6282 switch (*RExC_parse) {
6284 RExC_seen_zerolen++;
6285 nextchar(pRExC_state);
6286 if (RExC_flags & RXf_PMf_MULTILINE)
6287 ret = reg_node(pRExC_state, MBOL);
6288 else if (RExC_flags & RXf_PMf_SINGLELINE)
6289 ret = reg_node(pRExC_state, SBOL);
6291 ret = reg_node(pRExC_state, BOL);
6292 Set_Node_Length(ret, 1); /* MJD */
6295 nextchar(pRExC_state);
6297 RExC_seen_zerolen++;
6298 if (RExC_flags & RXf_PMf_MULTILINE)
6299 ret = reg_node(pRExC_state, MEOL);
6300 else if (RExC_flags & RXf_PMf_SINGLELINE)
6301 ret = reg_node(pRExC_state, SEOL);
6303 ret = reg_node(pRExC_state, EOL);
6304 Set_Node_Length(ret, 1); /* MJD */
6307 nextchar(pRExC_state);
6308 if (RExC_flags & RXf_PMf_SINGLELINE)
6309 ret = reg_node(pRExC_state, SANY);
6311 ret = reg_node(pRExC_state, REG_ANY);
6312 *flagp |= HASWIDTH|SIMPLE;
6314 Set_Node_Length(ret, 1); /* MJD */
6318 char * const oregcomp_parse = ++RExC_parse;
6319 ret = regclass(pRExC_state,depth+1);
6320 if (*RExC_parse != ']') {
6321 RExC_parse = oregcomp_parse;
6322 vFAIL("Unmatched [");
6324 nextchar(pRExC_state);
6325 *flagp |= HASWIDTH|SIMPLE;
6326 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6330 nextchar(pRExC_state);
6331 ret = reg(pRExC_state, 1, &flags,depth+1);
6333 if (flags & TRYAGAIN) {
6334 if (RExC_parse == RExC_end) {
6335 /* Make parent create an empty node if needed. */
6343 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6347 if (flags & TRYAGAIN) {
6351 vFAIL("Internal urp");
6352 /* Supposed to be caught earlier. */
6355 if (!regcurly(RExC_parse)) {
6364 vFAIL("Quantifier follows nothing");
6369 This switch handles escape sequences that resolve to some kind
6370 of special regop and not to literal text. Escape sequnces that
6371 resolve to literal text are handled below in the switch marked
6374 Every entry in this switch *must* have a corresponding entry
6375 in the literal escape switch. However, the opposite is not
6376 required, as the default for this switch is to jump to the
6377 literal text handling code.
6379 switch (*++RExC_parse) {
6380 /* Special Escapes */
6382 RExC_seen_zerolen++;
6383 ret = reg_node(pRExC_state, SBOL);
6385 goto finish_meta_pat;
6387 ret = reg_node(pRExC_state, GPOS);
6388 RExC_seen |= REG_SEEN_GPOS;
6390 goto finish_meta_pat;
6392 RExC_seen_zerolen++;
6393 ret = reg_node(pRExC_state, KEEPS);
6395 goto finish_meta_pat;
6397 ret = reg_node(pRExC_state, SEOL);
6399 RExC_seen_zerolen++; /* Do not optimize RE away */
6400 goto finish_meta_pat;
6402 ret = reg_node(pRExC_state, EOS);
6404 RExC_seen_zerolen++; /* Do not optimize RE away */
6405 goto finish_meta_pat;
6407 ret = reg_node(pRExC_state, CANY);
6408 RExC_seen |= REG_SEEN_CANY;
6409 *flagp |= HASWIDTH|SIMPLE;
6410 goto finish_meta_pat;
6412 ret = reg_node(pRExC_state, CLUMP);
6414 goto finish_meta_pat;
6416 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6417 *flagp |= HASWIDTH|SIMPLE;
6418 goto finish_meta_pat;
6420 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6421 *flagp |= HASWIDTH|SIMPLE;
6422 goto finish_meta_pat;
6424 RExC_seen_zerolen++;
6425 RExC_seen |= REG_SEEN_LOOKBEHIND;
6426 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6428 goto finish_meta_pat;
6430 RExC_seen_zerolen++;
6431 RExC_seen |= REG_SEEN_LOOKBEHIND;
6432 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6434 goto finish_meta_pat;
6436 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6437 *flagp |= HASWIDTH|SIMPLE;
6438 goto finish_meta_pat;
6440 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6441 *flagp |= HASWIDTH|SIMPLE;
6442 goto finish_meta_pat;
6444 ret = reg_node(pRExC_state, DIGIT);
6445 *flagp |= HASWIDTH|SIMPLE;
6446 goto finish_meta_pat;
6448 ret = reg_node(pRExC_state, NDIGIT);
6449 *flagp |= HASWIDTH|SIMPLE;
6450 goto finish_meta_pat;
6452 ret = reganode(pRExC_state, PRUNE, 0);
6455 goto finish_meta_pat;
6457 ret = reganode(pRExC_state, SKIP, 0);
6461 nextchar(pRExC_state);
6462 Set_Node_Length(ret, 2); /* MJD */
6467 char* const oldregxend = RExC_end;
6468 char* parse_start = RExC_parse - 2;
6470 if (RExC_parse[1] == '{') {
6471 /* a lovely hack--pretend we saw [\pX] instead */
6472 RExC_end = strchr(RExC_parse, '}');
6474 const U8 c = (U8)*RExC_parse;
6476 RExC_end = oldregxend;
6477 vFAIL2("Missing right brace on \\%c{}", c);
6482 RExC_end = RExC_parse + 2;
6483 if (RExC_end > oldregxend)
6484 RExC_end = oldregxend;
6488 ret = regclass(pRExC_state,depth+1);
6490 RExC_end = oldregxend;
6493 Set_Node_Offset(ret, parse_start + 2);
6494 Set_Node_Cur_Length(ret);
6495 nextchar(pRExC_state);
6496 *flagp |= HASWIDTH|SIMPLE;
6500 /* Handle \N{NAME} here and not below because it can be
6501 multicharacter. join_exact() will join them up later on.
6502 Also this makes sure that things like /\N{BLAH}+/ and
6503 \N{BLAH} being multi char Just Happen. dmq*/
6505 ret= reg_namedseq(pRExC_state, NULL);
6507 case 'k': /* Handle \k<NAME> and \k'NAME' */
6510 char ch= RExC_parse[1];
6511 if (ch != '<' && ch != '\'' && ch != '{') {
6513 vFAIL2("Sequence %.2s... not terminated",parse_start);
6515 /* this pretty much dupes the code for (?P=...) in reg(), if
6516 you change this make sure you change that */
6517 char* name_start = (RExC_parse += 2);
6519 SV *sv_dat = reg_scan_name(pRExC_state,
6520 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6521 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
6522 if (RExC_parse == name_start || *RExC_parse != ch)
6523 vFAIL2("Sequence %.3s... not terminated",parse_start);
6526 num = add_data( pRExC_state, 1, "S" );
6527 RExC_rxi->data->data[num]=(void*)sv_dat;
6528 SvREFCNT_inc(sv_dat);
6532 ret = reganode(pRExC_state,
6533 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6537 /* override incorrect value set in reganode MJD */
6538 Set_Node_Offset(ret, parse_start+1);
6539 Set_Node_Cur_Length(ret); /* MJD */
6540 nextchar(pRExC_state);
6546 case '1': case '2': case '3': case '4':
6547 case '5': case '6': case '7': case '8': case '9':
6550 bool isg = *RExC_parse == 'g';
6555 if (*RExC_parse == '{') {
6559 if (*RExC_parse == '-') {
6563 if (hasbrace && !isDIGIT(*RExC_parse)) {
6564 if (isrel) RExC_parse--;
6566 goto parse_named_seq;
6568 num = atoi(RExC_parse);
6570 num = RExC_npar - num;
6572 vFAIL("Reference to nonexistent or unclosed group");
6574 if (!isg && num > 9 && num >= RExC_npar)
6577 char * const parse_start = RExC_parse - 1; /* MJD */
6578 while (isDIGIT(*RExC_parse))
6580 if (parse_start == RExC_parse - 1)
6581 vFAIL("Unterminated \\g... pattern");
6583 if (*RExC_parse != '}')
6584 vFAIL("Unterminated \\g{...} pattern");
6588 if (num > (I32)RExC_rx->nparens)
6589 vFAIL("Reference to nonexistent group");
6592 ret = reganode(pRExC_state,
6593 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6597 /* override incorrect value set in reganode MJD */
6598 Set_Node_Offset(ret, parse_start+1);
6599 Set_Node_Cur_Length(ret); /* MJD */
6601 nextchar(pRExC_state);
6606 if (RExC_parse >= RExC_end)
6607 FAIL("Trailing \\");
6610 /* Do not generate "unrecognized" warnings here, we fall
6611 back into the quick-grab loop below */
6618 if (RExC_flags & RXf_PMf_EXTENDED) {
6619 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6621 if (RExC_parse < RExC_end)
6627 register STRLEN len;
6632 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6634 parse_start = RExC_parse - 1;
6640 ret = reg_node(pRExC_state,
6641 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6643 for (len = 0, p = RExC_parse - 1;
6644 len < 127 && p < RExC_end;
6647 char * const oldp = p;
6649 if (RExC_flags & RXf_PMf_EXTENDED)
6650 p = regwhite(p, RExC_end);
6661 /* Literal Escapes Switch
6663 This switch is meant to handle escape sequences that
6664 resolve to a literal character.
6666 Every escape sequence that represents something
6667 else, like an assertion or a char class, is handled
6668 in the switch marked 'Special Escapes' above in this
6669 routine, but also has an entry here as anything that
6670 isn't explicitly mentioned here will be treated as
6671 an unescaped equivalent literal.
6675 /* These are all the special escapes. */
6676 case 'A': /* Start assertion */
6677 case 'b': case 'B': /* Word-boundary assertion*/
6678 case 'C': /* Single char !DANGEROUS! */
6679 case 'd': case 'D': /* digit class */
6680 case 'g': case 'G': /* generic-backref, pos assertion */
6681 case 'k': case 'K': /* named backref, keep marker */
6682 case 'N': /* named char sequence */
6683 case 'p': case 'P': /* unicode property */
6684 case 's': case 'S': /* space class */
6685 case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
6686 case 'w': case 'W': /* word class */
6687 case 'X': /* eXtended Unicode "combining character sequence" */
6688 case 'z': case 'Z': /* End of line/string assertion */
6692 /* Anything after here is an escape that resolves to a
6693 literal. (Except digits, which may or may not)
6712 ender = ASCII_TO_NATIVE('\033');
6716 ender = ASCII_TO_NATIVE('\007');
6721 char* const e = strchr(p, '}');
6725 vFAIL("Missing right brace on \\x{}");
6728 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6729 | PERL_SCAN_DISALLOW_PREFIX;
6730 STRLEN numlen = e - p - 1;
6731 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6738 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6740 ender = grok_hex(p, &numlen, &flags, NULL);
6743 if (PL_encoding && ender < 0x100)
6744 goto recode_encoding;
6748 ender = UCHARAT(p++);
6749 ender = toCTRL(ender);
6751 case '0': case '1': case '2': case '3':case '4':
6752 case '5': case '6': case '7': case '8':case '9':
6754 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6757 ender = grok_oct(p, &numlen, &flags, NULL);
6764 if (PL_encoding && ender < 0x100)
6765 goto recode_encoding;
6769 SV* enc = PL_encoding;
6770 ender = reg_recode((const char)(U8)ender, &enc);
6771 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6772 vWARN(p, "Invalid escape in the specified encoding");
6778 FAIL("Trailing \\");
6781 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6782 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6783 goto normal_default;
6788 if (UTF8_IS_START(*p) && UTF) {
6790 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6791 &numlen, UTF8_ALLOW_DEFAULT);
6798 if (RExC_flags & RXf_PMf_EXTENDED)
6799 p = regwhite(p, RExC_end);
6801 /* Prime the casefolded buffer. */
6802 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6804 if (ISMULT2(p)) { /* Back off on ?+*. */
6809 /* Emit all the Unicode characters. */
6811 for (foldbuf = tmpbuf;
6813 foldlen -= numlen) {
6814 ender = utf8_to_uvchr(foldbuf, &numlen);
6816 const STRLEN unilen = reguni(pRExC_state, ender, s);
6819 /* In EBCDIC the numlen
6820 * and unilen can differ. */
6822 if (numlen >= foldlen)
6826 break; /* "Can't happen." */
6830 const STRLEN unilen = reguni(pRExC_state, ender, s);
6839 REGC((char)ender, s++);
6845 /* Emit all the Unicode characters. */
6847 for (foldbuf = tmpbuf;
6849 foldlen -= numlen) {
6850 ender = utf8_to_uvchr(foldbuf, &numlen);
6852 const STRLEN unilen = reguni(pRExC_state, ender, s);
6855 /* In EBCDIC the numlen
6856 * and unilen can differ. */
6858 if (numlen >= foldlen)
6866 const STRLEN unilen = reguni(pRExC_state, ender, s);
6875 REGC((char)ender, s++);
6879 Set_Node_Cur_Length(ret); /* MJD */
6880 nextchar(pRExC_state);
6882 /* len is STRLEN which is unsigned, need to copy to signed */
6885 vFAIL("Internal disaster");
6889 if (len == 1 && UNI_IS_INVARIANT(ender))
6893 RExC_size += STR_SZ(len);
6896 RExC_emit += STR_SZ(len);
6906 S_regwhite(char *p, const char *e)
6911 else if (*p == '#') {
6914 } while (p < e && *p != '\n');
6922 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6923 Character classes ([:foo:]) can also be negated ([:^foo:]).
6924 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6925 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6926 but trigger failures because they are currently unimplemented. */
6928 #define POSIXCC_DONE(c) ((c) == ':')
6929 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6930 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6933 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6936 I32 namedclass = OOB_NAMEDCLASS;
6938 if (value == '[' && RExC_parse + 1 < RExC_end &&
6939 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6940 POSIXCC(UCHARAT(RExC_parse))) {
6941 const char c = UCHARAT(RExC_parse);
6942 char* const s = RExC_parse++;
6944 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6946 if (RExC_parse == RExC_end)
6947 /* Grandfather lone [:, [=, [. */
6950 const char* const t = RExC_parse++; /* skip over the c */
6953 if (UCHARAT(RExC_parse) == ']') {
6954 const char *posixcc = s + 1;
6955 RExC_parse++; /* skip over the ending ] */
6958 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6959 const I32 skip = t - posixcc;
6961 /* Initially switch on the length of the name. */
6964 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6965 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6968 /* Names all of length 5. */
6969 /* alnum alpha ascii blank cntrl digit graph lower
6970 print punct space upper */
6971 /* Offset 4 gives the best switch position. */
6972 switch (posixcc[4]) {
6974 if (memEQ(posixcc, "alph", 4)) /* alpha */
6975 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6978 if (memEQ(posixcc, "spac", 4)) /* space */
6979 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6982 if (memEQ(posixcc, "grap", 4)) /* graph */
6983 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6986 if (memEQ(posixcc, "asci", 4)) /* ascii */
6987 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6990 if (memEQ(posixcc, "blan", 4)) /* blank */
6991 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6994 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6995 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6998 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6999 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7002 if (memEQ(posixcc, "lowe", 4)) /* lower */
7003 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7004 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7005 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7008 if (memEQ(posixcc, "digi", 4)) /* digit */
7009 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7010 else if (memEQ(posixcc, "prin", 4)) /* print */
7011 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7012 else if (memEQ(posixcc, "punc", 4)) /* punct */
7013 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7018 if (memEQ(posixcc, "xdigit", 6))
7019 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7023 if (namedclass == OOB_NAMEDCLASS)
7024 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7026 assert (posixcc[skip] == ':');
7027 assert (posixcc[skip+1] == ']');
7028 } else if (!SIZE_ONLY) {
7029 /* [[=foo=]] and [[.foo.]] are still future. */
7031 /* adjust RExC_parse so the warning shows after
7033 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7035 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7038 /* Maternal grandfather:
7039 * "[:" ending in ":" but not in ":]" */
7049 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7052 if (POSIXCC(UCHARAT(RExC_parse))) {
7053 const char *s = RExC_parse;
7054 const char c = *s++;
7058 if (*s && c == *s && s[1] == ']') {
7059 if (ckWARN(WARN_REGEXP))
7061 "POSIX syntax [%c %c] belongs inside character classes",
7064 /* [[=foo=]] and [[.foo.]] are still future. */
7065 if (POSIXCC_NOTYET(c)) {
7066 /* adjust RExC_parse so the error shows after
7068 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7070 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7077 #define _C_C_T_(NAME,TEST,WORD) \
7080 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7082 for (value = 0; value < 256; value++) \
7084 ANYOF_BITMAP_SET(ret, value); \
7089 case ANYOF_N##NAME: \
7091 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7093 for (value = 0; value < 256; value++) \
7095 ANYOF_BITMAP_SET(ret, value); \
7103 parse a class specification and produce either an ANYOF node that
7104 matches the pattern or if the pattern matches a single char only and
7105 that char is < 256 and we are case insensitive then we produce an
7110 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7113 register UV value = 0;
7114 register UV nextvalue;
7115 register IV prevvalue = OOB_UNICODE;
7116 register IV range = 0;
7117 register regnode *ret;
7120 char *rangebegin = NULL;
7121 bool need_class = 0;
7124 bool optimize_invert = TRUE;
7125 AV* unicode_alternate = NULL;
7127 UV literal_endpoint = 0;
7129 UV stored = 0; /* number of chars stored in the class */
7131 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7132 case we need to change the emitted regop to an EXACT. */
7133 const char * orig_parse = RExC_parse;
7134 GET_RE_DEBUG_FLAGS_DECL;
7136 PERL_UNUSED_ARG(depth);
7139 DEBUG_PARSE("clas");
7141 /* Assume we are going to generate an ANYOF node. */
7142 ret = reganode(pRExC_state, ANYOF, 0);
7145 ANYOF_FLAGS(ret) = 0;
7147 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7151 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7155 RExC_size += ANYOF_SKIP;
7156 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7159 RExC_emit += ANYOF_SKIP;
7161 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7163 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7164 ANYOF_BITMAP_ZERO(ret);
7165 listsv = newSVpvs("# comment\n");
7168 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7170 if (!SIZE_ONLY && POSIXCC(nextvalue))
7171 checkposixcc(pRExC_state);
7173 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7174 if (UCHARAT(RExC_parse) == ']')
7178 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7182 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7185 rangebegin = RExC_parse;
7187 value = utf8n_to_uvchr((U8*)RExC_parse,
7188 RExC_end - RExC_parse,
7189 &numlen, UTF8_ALLOW_DEFAULT);
7190 RExC_parse += numlen;
7193 value = UCHARAT(RExC_parse++);
7195 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7196 if (value == '[' && POSIXCC(nextvalue))
7197 namedclass = regpposixcc(pRExC_state, value);
7198 else if (value == '\\') {
7200 value = utf8n_to_uvchr((U8*)RExC_parse,
7201 RExC_end - RExC_parse,
7202 &numlen, UTF8_ALLOW_DEFAULT);
7203 RExC_parse += numlen;
7206 value = UCHARAT(RExC_parse++);
7207 /* Some compilers cannot handle switching on 64-bit integer
7208 * values, therefore value cannot be an UV. Yes, this will
7209 * be a problem later if we want switch on Unicode.
7210 * A similar issue a little bit later when switching on
7211 * namedclass. --jhi */
7212 switch ((I32)value) {
7213 case 'w': namedclass = ANYOF_ALNUM; break;
7214 case 'W': namedclass = ANYOF_NALNUM; break;
7215 case 's': namedclass = ANYOF_SPACE; break;
7216 case 'S': namedclass = ANYOF_NSPACE; break;
7217 case 'd': namedclass = ANYOF_DIGIT; break;
7218 case 'D': namedclass = ANYOF_NDIGIT; break;
7219 case 'N': /* Handle \N{NAME} in class */
7221 /* We only pay attention to the first char of
7222 multichar strings being returned. I kinda wonder
7223 if this makes sense as it does change the behaviour
7224 from earlier versions, OTOH that behaviour was broken
7226 UV v; /* value is register so we cant & it /grrr */
7227 if (reg_namedseq(pRExC_state, &v)) {
7237 if (RExC_parse >= RExC_end)
7238 vFAIL2("Empty \\%c{}", (U8)value);
7239 if (*RExC_parse == '{') {
7240 const U8 c = (U8)value;
7241 e = strchr(RExC_parse++, '}');
7243 vFAIL2("Missing right brace on \\%c{}", c);
7244 while (isSPACE(UCHARAT(RExC_parse)))
7246 if (e == RExC_parse)
7247 vFAIL2("Empty \\%c{}", c);
7249 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7257 if (UCHARAT(RExC_parse) == '^') {
7260 value = value == 'p' ? 'P' : 'p'; /* toggle */
7261 while (isSPACE(UCHARAT(RExC_parse))) {
7266 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7267 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7270 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7271 namedclass = ANYOF_MAX; /* no official name, but it's named */
7274 case 'n': value = '\n'; break;
7275 case 'r': value = '\r'; break;
7276 case 't': value = '\t'; break;
7277 case 'f': value = '\f'; break;
7278 case 'b': value = '\b'; break;
7279 case 'e': value = ASCII_TO_NATIVE('\033');break;
7280 case 'a': value = ASCII_TO_NATIVE('\007');break;
7282 if (*RExC_parse == '{') {
7283 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7284 | PERL_SCAN_DISALLOW_PREFIX;
7285 char * const e = strchr(RExC_parse++, '}');
7287 vFAIL("Missing right brace on \\x{}");
7289 numlen = e - RExC_parse;
7290 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7294 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7296 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7297 RExC_parse += numlen;
7299 if (PL_encoding && value < 0x100)
7300 goto recode_encoding;
7303 value = UCHARAT(RExC_parse++);
7304 value = toCTRL(value);
7306 case '0': case '1': case '2': case '3': case '4':
7307 case '5': case '6': case '7': case '8': case '9':
7311 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7312 RExC_parse += numlen;
7313 if (PL_encoding && value < 0x100)
7314 goto recode_encoding;
7319 SV* enc = PL_encoding;
7320 value = reg_recode((const char)(U8)value, &enc);
7321 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7323 "Invalid escape in the specified encoding");
7327 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7329 "Unrecognized escape \\%c in character class passed through",
7333 } /* end of \blah */
7339 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7341 if (!SIZE_ONLY && !need_class)
7342 ANYOF_CLASS_ZERO(ret);
7346 /* a bad range like a-\d, a-[:digit:] ? */
7349 if (ckWARN(WARN_REGEXP)) {
7351 RExC_parse >= rangebegin ?
7352 RExC_parse - rangebegin : 0;
7354 "False [] range \"%*.*s\"",
7357 if (prevvalue < 256) {
7358 ANYOF_BITMAP_SET(ret, prevvalue);
7359 ANYOF_BITMAP_SET(ret, '-');
7362 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7363 Perl_sv_catpvf(aTHX_ listsv,
7364 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7368 range = 0; /* this was not a true range */
7374 const char *what = NULL;
7377 if (namedclass > OOB_NAMEDCLASS)
7378 optimize_invert = FALSE;
7379 /* Possible truncation here but in some 64-bit environments
7380 * the compiler gets heartburn about switch on 64-bit values.
7381 * A similar issue a little earlier when switching on value.
7383 switch ((I32)namedclass) {
7384 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7385 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7386 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7387 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7388 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7389 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7390 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7391 case _C_C_T_(PRINT, isPRINT(value), "Print");
7392 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7393 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7394 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7395 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7396 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
7399 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7402 for (value = 0; value < 128; value++)
7403 ANYOF_BITMAP_SET(ret, value);
7405 for (value = 0; value < 256; value++) {
7407 ANYOF_BITMAP_SET(ret, value);
7416 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7419 for (value = 128; value < 256; value++)
7420 ANYOF_BITMAP_SET(ret, value);
7422 for (value = 0; value < 256; value++) {
7423 if (!isASCII(value))
7424 ANYOF_BITMAP_SET(ret, value);
7433 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7435 /* consecutive digits assumed */
7436 for (value = '0'; value <= '9'; value++)
7437 ANYOF_BITMAP_SET(ret, value);
7444 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7446 /* consecutive digits assumed */
7447 for (value = 0; value < '0'; value++)
7448 ANYOF_BITMAP_SET(ret, value);
7449 for (value = '9' + 1; value < 256; value++)
7450 ANYOF_BITMAP_SET(ret, value);
7456 /* this is to handle \p and \P */
7459 vFAIL("Invalid [::] class");
7463 /* Strings such as "+utf8::isWord\n" */
7464 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7467 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7470 } /* end of namedclass \blah */
7473 if (prevvalue > (IV)value) /* b-a */ {
7474 const int w = RExC_parse - rangebegin;
7475 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7476 range = 0; /* not a valid range */
7480 prevvalue = value; /* save the beginning of the range */
7481 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7482 RExC_parse[1] != ']') {
7485 /* a bad range like \w-, [:word:]- ? */
7486 if (namedclass > OOB_NAMEDCLASS) {
7487 if (ckWARN(WARN_REGEXP)) {
7489 RExC_parse >= rangebegin ?
7490 RExC_parse - rangebegin : 0;
7492 "False [] range \"%*.*s\"",
7496 ANYOF_BITMAP_SET(ret, '-');
7498 range = 1; /* yeah, it's a range! */
7499 continue; /* but do it the next time */
7503 /* now is the next time */
7504 /*stored += (value - prevvalue + 1);*/
7506 if (prevvalue < 256) {
7507 const IV ceilvalue = value < 256 ? value : 255;
7510 /* In EBCDIC [\x89-\x91] should include
7511 * the \x8e but [i-j] should not. */
7512 if (literal_endpoint == 2 &&
7513 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7514 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7516 if (isLOWER(prevvalue)) {
7517 for (i = prevvalue; i <= ceilvalue; i++)
7519 ANYOF_BITMAP_SET(ret, i);
7521 for (i = prevvalue; i <= ceilvalue; i++)
7523 ANYOF_BITMAP_SET(ret, i);
7528 for (i = prevvalue; i <= ceilvalue; i++) {
7529 if (!ANYOF_BITMAP_TEST(ret,i)) {
7531 ANYOF_BITMAP_SET(ret, i);
7535 if (value > 255 || UTF) {
7536 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7537 const UV natvalue = NATIVE_TO_UNI(value);
7538 stored+=2; /* can't optimize this class */
7539 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7540 if (prevnatvalue < natvalue) { /* what about > ? */
7541 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7542 prevnatvalue, natvalue);
7544 else if (prevnatvalue == natvalue) {
7545 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7547 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7549 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7551 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7552 if (RExC_precomp[0] == ':' &&
7553 RExC_precomp[1] == '[' &&
7554 (f == 0xDF || f == 0x92)) {
7555 f = NATIVE_TO_UNI(f);
7558 /* If folding and foldable and a single
7559 * character, insert also the folded version
7560 * to the charclass. */
7562 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7563 if ((RExC_precomp[0] == ':' &&
7564 RExC_precomp[1] == '[' &&
7566 (value == 0xFB05 || value == 0xFB06))) ?
7567 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7568 foldlen == (STRLEN)UNISKIP(f) )
7570 if (foldlen == (STRLEN)UNISKIP(f))
7572 Perl_sv_catpvf(aTHX_ listsv,
7575 /* Any multicharacter foldings
7576 * require the following transform:
7577 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7578 * where E folds into "pq" and F folds
7579 * into "rst", all other characters
7580 * fold to single characters. We save
7581 * away these multicharacter foldings,
7582 * to be later saved as part of the
7583 * additional "s" data. */
7586 if (!unicode_alternate)
7587 unicode_alternate = newAV();
7588 sv = newSVpvn((char*)foldbuf, foldlen);
7590 av_push(unicode_alternate, sv);
7594 /* If folding and the value is one of the Greek
7595 * sigmas insert a few more sigmas to make the
7596 * folding rules of the sigmas to work right.
7597 * Note that not all the possible combinations
7598 * are handled here: some of them are handled
7599 * by the standard folding rules, and some of
7600 * them (literal or EXACTF cases) are handled
7601 * during runtime in regexec.c:S_find_byclass(). */
7602 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7603 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7604 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7605 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7606 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7608 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7609 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7610 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7615 literal_endpoint = 0;
7619 range = 0; /* this range (if it was one) is done now */
7623 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7625 RExC_size += ANYOF_CLASS_ADD_SKIP;
7627 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7633 /****** !SIZE_ONLY AFTER HERE *********/
7635 if( stored == 1 && value < 256
7636 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7638 /* optimize single char class to an EXACT node
7639 but *only* when its not a UTF/high char */
7640 const char * cur_parse= RExC_parse;
7641 RExC_emit = (regnode *)orig_emit;
7642 RExC_parse = (char *)orig_parse;
7643 ret = reg_node(pRExC_state,
7644 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7645 RExC_parse = (char *)cur_parse;
7646 *STRING(ret)= (char)value;
7648 RExC_emit += STR_SZ(1);
7651 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7652 if ( /* If the only flag is folding (plus possibly inversion). */
7653 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7655 for (value = 0; value < 256; ++value) {
7656 if (ANYOF_BITMAP_TEST(ret, value)) {
7657 UV fold = PL_fold[value];
7660 ANYOF_BITMAP_SET(ret, fold);
7663 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7666 /* optimize inverted simple patterns (e.g. [^a-z]) */
7667 if (optimize_invert &&
7668 /* If the only flag is inversion. */
7669 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7670 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7671 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7672 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7675 AV * const av = newAV();
7677 /* The 0th element stores the character class description
7678 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7679 * to initialize the appropriate swash (which gets stored in
7680 * the 1st element), and also useful for dumping the regnode.
7681 * The 2nd element stores the multicharacter foldings,
7682 * used later (regexec.c:S_reginclass()). */
7683 av_store(av, 0, listsv);
7684 av_store(av, 1, NULL);
7685 av_store(av, 2, (SV*)unicode_alternate);
7686 rv = newRV_noinc((SV*)av);
7687 n = add_data(pRExC_state, 1, "s");
7688 RExC_rxi->data->data[n] = (void*)rv;
7697 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7699 char* const retval = RExC_parse++;
7702 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7703 RExC_parse[2] == '#') {
7704 while (*RExC_parse != ')') {
7705 if (RExC_parse == RExC_end)
7706 FAIL("Sequence (?#... not terminated");
7712 if (RExC_flags & RXf_PMf_EXTENDED) {
7713 if (isSPACE(*RExC_parse)) {
7717 else if (*RExC_parse == '#') {
7718 while (RExC_parse < RExC_end)
7719 if (*RExC_parse++ == '\n') break;
7728 - reg_node - emit a node
7730 STATIC regnode * /* Location. */
7731 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7734 register regnode *ptr;
7735 regnode * const ret = RExC_emit;
7736 GET_RE_DEBUG_FLAGS_DECL;
7739 SIZE_ALIGN(RExC_size);
7744 if (OP(RExC_emit) == 255)
7745 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7746 reg_name[op], OP(RExC_emit));
7748 NODE_ALIGN_FILL(ret);
7750 FILL_ADVANCE_NODE(ptr, op);
7751 if (RExC_offsets) { /* MJD */
7752 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7753 "reg_node", __LINE__,
7755 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7756 ? "Overwriting end of array!\n" : "OK",
7757 (UV)(RExC_emit - RExC_emit_start),
7758 (UV)(RExC_parse - RExC_start),
7759 (UV)RExC_offsets[0]));
7760 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7768 - reganode - emit a node with an argument
7770 STATIC regnode * /* Location. */
7771 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7774 register regnode *ptr;
7775 regnode * const ret = RExC_emit;
7776 GET_RE_DEBUG_FLAGS_DECL;
7779 SIZE_ALIGN(RExC_size);
7784 assert(2==regarglen[op]+1);
7786 Anything larger than this has to allocate the extra amount.
7787 If we changed this to be:
7789 RExC_size += (1 + regarglen[op]);
7791 then it wouldn't matter. Its not clear what side effect
7792 might come from that so its not done so far.
7798 if (OP(RExC_emit) == 255)
7799 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7801 NODE_ALIGN_FILL(ret);
7803 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7804 if (RExC_offsets) { /* MJD */
7805 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7809 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7810 "Overwriting end of array!\n" : "OK",
7811 (UV)(RExC_emit - RExC_emit_start),
7812 (UV)(RExC_parse - RExC_start),
7813 (UV)RExC_offsets[0]));
7814 Set_Cur_Node_Offset;
7822 - reguni - emit (if appropriate) a Unicode character
7825 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7828 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7832 - reginsert - insert an operator in front of already-emitted operand
7834 * Means relocating the operand.
7837 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7840 register regnode *src;
7841 register regnode *dst;
7842 register regnode *place;
7843 const int offset = regarglen[(U8)op];
7844 const int size = NODE_STEP_REGNODE + offset;
7845 GET_RE_DEBUG_FLAGS_DECL;
7846 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7847 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7856 if (RExC_open_parens) {
7858 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7859 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7860 if ( RExC_open_parens[paren] >= opnd ) {
7861 DEBUG_PARSE_FMT("open"," - %d",size);
7862 RExC_open_parens[paren] += size;
7864 DEBUG_PARSE_FMT("open"," - %s","ok");
7866 if ( RExC_close_parens[paren] >= opnd ) {
7867 DEBUG_PARSE_FMT("close"," - %d",size);
7868 RExC_close_parens[paren] += size;
7870 DEBUG_PARSE_FMT("close"," - %s","ok");
7875 while (src > opnd) {
7876 StructCopy(--src, --dst, regnode);
7877 if (RExC_offsets) { /* MJD 20010112 */
7878 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7882 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7883 ? "Overwriting end of array!\n" : "OK",
7884 (UV)(src - RExC_emit_start),
7885 (UV)(dst - RExC_emit_start),
7886 (UV)RExC_offsets[0]));
7887 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7888 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7893 place = opnd; /* Op node, where operand used to be. */
7894 if (RExC_offsets) { /* MJD */
7895 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7899 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7900 ? "Overwriting end of array!\n" : "OK",
7901 (UV)(place - RExC_emit_start),
7902 (UV)(RExC_parse - RExC_start),
7903 (UV)RExC_offsets[0]));
7904 Set_Node_Offset(place, RExC_parse);
7905 Set_Node_Length(place, 1);
7907 src = NEXTOPER(place);
7908 FILL_ADVANCE_NODE(place, op);
7909 Zero(src, offset, regnode);
7913 - regtail - set the next-pointer at the end of a node chain of p to val.
7914 - SEE ALSO: regtail_study
7916 /* TODO: All three parms should be const */
7918 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7921 register regnode *scan;
7922 GET_RE_DEBUG_FLAGS_DECL;
7924 PERL_UNUSED_ARG(depth);
7930 /* Find last node. */
7933 regnode * const temp = regnext(scan);
7935 SV * const mysv=sv_newmortal();
7936 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7937 regprop(RExC_rx, mysv, scan);
7938 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7939 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7940 (temp == NULL ? "->" : ""),
7941 (temp == NULL ? reg_name[OP(val)] : "")
7949 if (reg_off_by_arg[OP(scan)]) {
7950 ARG_SET(scan, val - scan);
7953 NEXT_OFF(scan) = val - scan;
7959 - regtail_study - set the next-pointer at the end of a node chain of p to val.
7960 - Look for optimizable sequences at the same time.
7961 - currently only looks for EXACT chains.
7963 This is expermental code. The idea is to use this routine to perform
7964 in place optimizations on branches and groups as they are constructed,
7965 with the long term intention of removing optimization from study_chunk so
7966 that it is purely analytical.
7968 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7969 to control which is which.
7972 /* TODO: All four parms should be const */
7975 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7978 register regnode *scan;
7980 #ifdef EXPERIMENTAL_INPLACESCAN
7984 GET_RE_DEBUG_FLAGS_DECL;
7990 /* Find last node. */
7994 regnode * const temp = regnext(scan);
7995 #ifdef EXPERIMENTAL_INPLACESCAN
7996 if (PL_regkind[OP(scan)] == EXACT)
7997 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8005 if( exact == PSEUDO )
8007 else if ( exact != OP(scan) )
8016 SV * const mysv=sv_newmortal();
8017 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8018 regprop(RExC_rx, mysv, scan);
8019 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8020 SvPV_nolen_const(mysv),
8029 SV * const mysv_val=sv_newmortal();
8030 DEBUG_PARSE_MSG("");
8031 regprop(RExC_rx, mysv_val, val);
8032 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8033 SvPV_nolen_const(mysv_val),
8034 (IV)REG_NODE_NUM(val),
8038 if (reg_off_by_arg[OP(scan)]) {
8039 ARG_SET(scan, val - scan);
8042 NEXT_OFF(scan) = val - scan;
8050 - regcurly - a little FSA that accepts {\d+,?\d*}
8053 S_regcurly(register const char *s)
8072 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8075 Perl_regdump(pTHX_ const regexp *r)
8079 SV * const sv = sv_newmortal();
8080 SV *dsv= sv_newmortal();
8083 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8085 /* Header fields of interest. */
8086 if (r->anchored_substr) {
8087 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8088 RE_SV_DUMPLEN(r->anchored_substr), 30);
8089 PerlIO_printf(Perl_debug_log,
8090 "anchored %s%s at %"IVdf" ",
8091 s, RE_SV_TAIL(r->anchored_substr),
8092 (IV)r->anchored_offset);
8093 } else if (r->anchored_utf8) {
8094 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8095 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8096 PerlIO_printf(Perl_debug_log,
8097 "anchored utf8 %s%s at %"IVdf" ",
8098 s, RE_SV_TAIL(r->anchored_utf8),
8099 (IV)r->anchored_offset);
8101 if (r->float_substr) {
8102 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8103 RE_SV_DUMPLEN(r->float_substr), 30);
8104 PerlIO_printf(Perl_debug_log,
8105 "floating %s%s at %"IVdf"..%"UVuf" ",
8106 s, RE_SV_TAIL(r->float_substr),
8107 (IV)r->float_min_offset, (UV)r->float_max_offset);
8108 } else if (r->float_utf8) {
8109 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8110 RE_SV_DUMPLEN(r->float_utf8), 30);
8111 PerlIO_printf(Perl_debug_log,
8112 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8113 s, RE_SV_TAIL(r->float_utf8),
8114 (IV)r->float_min_offset, (UV)r->float_max_offset);
8116 if (r->check_substr || r->check_utf8)
8117 PerlIO_printf(Perl_debug_log,
8119 (r->check_substr == r->float_substr
8120 && r->check_utf8 == r->float_utf8
8121 ? "(checking floating" : "(checking anchored"));
8122 if (r->extflags & RXf_NOSCAN)
8123 PerlIO_printf(Perl_debug_log, " noscan");
8124 if (r->extflags & RXf_CHECK_ALL)
8125 PerlIO_printf(Perl_debug_log, " isall");
8126 if (r->check_substr || r->check_utf8)
8127 PerlIO_printf(Perl_debug_log, ") ");
8129 if (ri->regstclass) {
8130 regprop(r, sv, ri->regstclass);
8131 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8133 if (r->extflags & RXf_ANCH) {
8134 PerlIO_printf(Perl_debug_log, "anchored");
8135 if (r->extflags & RXf_ANCH_BOL)
8136 PerlIO_printf(Perl_debug_log, "(BOL)");
8137 if (r->extflags & RXf_ANCH_MBOL)
8138 PerlIO_printf(Perl_debug_log, "(MBOL)");
8139 if (r->extflags & RXf_ANCH_SBOL)
8140 PerlIO_printf(Perl_debug_log, "(SBOL)");
8141 if (r->extflags & RXf_ANCH_GPOS)
8142 PerlIO_printf(Perl_debug_log, "(GPOS)");
8143 PerlIO_putc(Perl_debug_log, ' ');
8145 if (r->extflags & RXf_GPOS_SEEN)
8146 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8147 if (r->intflags & PREGf_SKIP)
8148 PerlIO_printf(Perl_debug_log, "plus ");
8149 if (r->intflags & PREGf_IMPLICIT)
8150 PerlIO_printf(Perl_debug_log, "implicit ");
8151 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8152 if (r->extflags & RXf_EVAL_SEEN)
8153 PerlIO_printf(Perl_debug_log, "with eval ");
8154 PerlIO_printf(Perl_debug_log, "\n");
8156 PERL_UNUSED_CONTEXT;
8158 #endif /* DEBUGGING */
8162 - regprop - printable representation of opcode
8165 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8170 RXi_GET_DECL(prog,progi);
8171 GET_RE_DEBUG_FLAGS_DECL;
8174 sv_setpvn(sv, "", 0);
8176 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8177 /* It would be nice to FAIL() here, but this may be called from
8178 regexec.c, and it would be hard to supply pRExC_state. */
8179 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8180 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8182 k = PL_regkind[OP(o)];
8185 SV * const dsv = sv_2mortal(newSVpvs(""));
8186 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8187 * is a crude hack but it may be the best for now since
8188 * we have no flag "this EXACTish node was UTF-8"
8190 const char * const s =
8191 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8192 PL_colors[0], PL_colors[1],
8193 PERL_PV_ESCAPE_UNI_DETECT |
8194 PERL_PV_PRETTY_ELIPSES |
8197 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8198 } else if (k == TRIE) {
8199 /* print the details of the trie in dumpuntil instead, as
8200 * progi->data isn't available here */
8201 const char op = OP(o);
8202 const U32 n = ARG(o);
8203 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8204 (reg_ac_data *)progi->data->data[n] :
8206 const reg_trie_data * const trie
8207 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8209 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8210 DEBUG_TRIE_COMPILE_r(
8211 Perl_sv_catpvf(aTHX_ sv,
8212 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8213 (UV)trie->startstate,
8214 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8215 (UV)trie->wordcount,
8218 (UV)TRIE_CHARCOUNT(trie),
8219 (UV)trie->uniquecharcount
8222 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8224 int rangestart = -1;
8225 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8226 Perl_sv_catpvf(aTHX_ sv, "[");
8227 for (i = 0; i <= 256; i++) {
8228 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8229 if (rangestart == -1)
8231 } else if (rangestart != -1) {
8232 if (i <= rangestart + 3)
8233 for (; rangestart < i; rangestart++)
8234 put_byte(sv, rangestart);
8236 put_byte(sv, rangestart);
8238 put_byte(sv, i - 1);
8243 Perl_sv_catpvf(aTHX_ sv, "]");
8246 } else if (k == CURLY) {
8247 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8248 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8249 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8251 else if (k == WHILEM && o->flags) /* Ordinal/of */
8252 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8253 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
8254 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8255 if ( prog->paren_names ) {
8256 if ( k != REF || OP(o) < NREF) {
8257 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8258 SV **name= av_fetch(list, ARG(o), 0 );
8260 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8263 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8264 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8265 I32 *nums=(I32*)SvPVX(sv_dat);
8266 SV **name= av_fetch(list, nums[0], 0 );
8269 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8270 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8271 (n ? "," : ""), (IV)nums[n]);
8273 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8277 } else if (k == GOSUB)
8278 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8279 else if (k == VERB) {
8281 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8282 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
8283 } else if (k == LOGICAL)
8284 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8285 else if (k == ANYOF) {
8286 int i, rangestart = -1;
8287 const U8 flags = ANYOF_FLAGS(o);
8289 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8290 static const char * const anyofs[] = {
8323 if (flags & ANYOF_LOCALE)
8324 sv_catpvs(sv, "{loc}");
8325 if (flags & ANYOF_FOLD)
8326 sv_catpvs(sv, "{i}");
8327 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8328 if (flags & ANYOF_INVERT)
8330 for (i = 0; i <= 256; i++) {
8331 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8332 if (rangestart == -1)
8334 } else if (rangestart != -1) {
8335 if (i <= rangestart + 3)
8336 for (; rangestart < i; rangestart++)
8337 put_byte(sv, rangestart);
8339 put_byte(sv, rangestart);
8341 put_byte(sv, i - 1);
8347 if (o->flags & ANYOF_CLASS)
8348 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8349 if (ANYOF_CLASS_TEST(o,i))
8350 sv_catpv(sv, anyofs[i]);
8352 if (flags & ANYOF_UNICODE)
8353 sv_catpvs(sv, "{unicode}");
8354 else if (flags & ANYOF_UNICODE_ALL)
8355 sv_catpvs(sv, "{unicode_all}");
8359 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8363 U8 s[UTF8_MAXBYTES_CASE+1];
8365 for (i = 0; i <= 256; i++) { /* just the first 256 */
8366 uvchr_to_utf8(s, i);
8368 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8369 if (rangestart == -1)
8371 } else if (rangestart != -1) {
8372 if (i <= rangestart + 3)
8373 for (; rangestart < i; rangestart++) {
8374 const U8 * const e = uvchr_to_utf8(s,rangestart);
8376 for(p = s; p < e; p++)
8380 const U8 *e = uvchr_to_utf8(s,rangestart);
8382 for (p = s; p < e; p++)
8385 e = uvchr_to_utf8(s, i-1);
8386 for (p = s; p < e; p++)
8393 sv_catpvs(sv, "..."); /* et cetera */
8397 char *s = savesvpv(lv);
8398 char * const origs = s;
8400 while (*s && *s != '\n')
8404 const char * const t = ++s;
8422 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8424 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8425 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8427 PERL_UNUSED_CONTEXT;
8428 PERL_UNUSED_ARG(sv);
8430 PERL_UNUSED_ARG(prog);
8431 #endif /* DEBUGGING */
8435 Perl_re_intuit_string(pTHX_ regexp *prog)
8436 { /* Assume that RE_INTUIT is set */
8438 GET_RE_DEBUG_FLAGS_DECL;
8439 PERL_UNUSED_CONTEXT;
8443 const char * const s = SvPV_nolen_const(prog->check_substr
8444 ? prog->check_substr : prog->check_utf8);
8446 if (!PL_colorset) reginitcolors();
8447 PerlIO_printf(Perl_debug_log,
8448 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8450 prog->check_substr ? "" : "utf8 ",
8451 PL_colors[5],PL_colors[0],
8454 (strlen(s) > 60 ? "..." : ""));
8457 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8463 handles refcounting and freeing the perl core regexp structure. When
8464 it is necessary to actually free the structure the first thing it
8465 does is call the 'free' method of the regexp_engine associated to to
8466 the regexp, allowing the handling of the void *pprivate; member
8467 first. (This routine is not overridable by extensions, which is why
8468 the extensions free is called first.)
8470 See regdupe and regdupe_internal if you change anything here.
8472 #ifndef PERL_IN_XSUB_RE
8474 Perl_pregfree(pTHX_ struct regexp *r)
8477 GET_RE_DEBUG_FLAGS_DECL;
8479 if (!r || (--r->refcnt > 0))
8482 CALLREGFREE_PVT(r); /* free the private data */
8484 /* gcov results gave these as non-null 100% of the time, so there's no
8485 optimisation in checking them before calling Safefree */
8486 Safefree(r->precomp);
8487 RX_MATCH_COPY_FREE(r);
8488 #ifdef PERL_OLD_COPY_ON_WRITE
8490 SvREFCNT_dec(r->saved_copy);
8493 if (r->anchored_substr)
8494 SvREFCNT_dec(r->anchored_substr);
8495 if (r->anchored_utf8)
8496 SvREFCNT_dec(r->anchored_utf8);
8497 if (r->float_substr)
8498 SvREFCNT_dec(r->float_substr);
8500 SvREFCNT_dec(r->float_utf8);
8501 Safefree(r->substrs);
8504 SvREFCNT_dec(r->paren_names);
8506 Safefree(r->startp);
8512 /* regfree_internal()
8514 Free the private data in a regexp. This is overloadable by
8515 extensions. Perl takes care of the regexp structure in pregfree(),
8516 this covers the *pprivate pointer which technically perldoesnt
8517 know about, however of course we have to handle the
8518 regexp_internal structure when no extension is in use.
8520 Note this is called before freeing anything in the regexp
8525 Perl_regfree_internal(pTHX_ struct regexp *r)
8529 GET_RE_DEBUG_FLAGS_DECL;
8535 SV *dsv= sv_newmortal();
8536 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8537 dsv, r->precomp, r->prelen, 60);
8538 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8539 PL_colors[4],PL_colors[5],s);
8543 Safefree(ri->offsets); /* 20010421 MJD */
8545 int n = ri->data->count;
8546 PAD* new_comppad = NULL;
8551 /* If you add a ->what type here, update the comment in regcomp.h */
8552 switch (ri->data->what[n]) {
8556 SvREFCNT_dec((SV*)ri->data->data[n]);
8559 Safefree(ri->data->data[n]);
8562 new_comppad = (AV*)ri->data->data[n];
8565 if (new_comppad == NULL)
8566 Perl_croak(aTHX_ "panic: pregfree comppad");
8567 PAD_SAVE_LOCAL(old_comppad,
8568 /* Watch out for global destruction's random ordering. */
8569 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8572 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
8575 op_free((OP_4tree*)ri->data->data[n]);
8577 PAD_RESTORE_LOCAL(old_comppad);
8578 SvREFCNT_dec((SV*)new_comppad);
8584 { /* Aho Corasick add-on structure for a trie node.
8585 Used in stclass optimization only */
8587 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
8589 refcount = --aho->refcount;
8592 PerlMemShared_free(aho->states);
8593 PerlMemShared_free(aho->fail);
8594 /* do this last!!!! */
8595 PerlMemShared_free(ri->data->data[n]);
8596 PerlMemShared_free(ri->regstclass);
8602 /* trie structure. */
8604 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
8606 refcount = --trie->refcount;
8609 PerlMemShared_free(trie->charmap);
8610 PerlMemShared_free(trie->states);
8611 PerlMemShared_free(trie->trans);
8613 PerlMemShared_free(trie->bitmap);
8615 PerlMemShared_free(trie->wordlen);
8617 PerlMemShared_free(trie->jump);
8619 PerlMemShared_free(trie->nextword);
8620 /* do this last!!!! */
8621 PerlMemShared_free(ri->data->data[n]);
8626 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
8629 Safefree(ri->data->what);
8633 Safefree(ri->swap->startp);
8634 Safefree(ri->swap->endp);
8640 #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8641 #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8642 #define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8643 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8646 regdupe - duplicate a regexp.
8648 This routine is called by sv.c's re_dup and is expected to clone a
8649 given regexp structure. It is a no-op when not under USE_ITHREADS.
8650 (Originally this *was* re_dup() for change history see sv.c)
8652 After all of the core data stored in struct regexp is duplicated
8653 the regexp_engine.dupe method is used to copy any private data
8654 stored in the *pprivate pointer. This allows extensions to handle
8655 any duplication it needs to do.
8657 See pregfree() and regfree_internal() if you change anything here.
8659 #if defined(USE_ITHREADS)
8660 #ifndef PERL_IN_XSUB_RE
8662 Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
8667 struct reg_substr_datum *s;
8670 return (REGEXP *)NULL;
8672 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8676 npar = r->nparens+1;
8677 Newxz(ret, 1, regexp);
8678 Newx(ret->startp, npar, I32);
8679 Copy(r->startp, ret->startp, npar, I32);
8680 Newx(ret->endp, npar, I32);
8681 Copy(r->endp, ret->endp, npar, I32);
8684 Newx(ret->substrs, 1, struct reg_substr_data);
8685 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8686 s->min_offset = r->substrs->data[i].min_offset;
8687 s->max_offset = r->substrs->data[i].max_offset;
8688 s->end_shift = r->substrs->data[i].end_shift;
8689 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8690 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8693 ret->substrs = NULL;
8695 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8696 ret->refcnt = r->refcnt;
8697 ret->minlen = r->minlen;
8698 ret->minlenret = r->minlenret;
8699 ret->prelen = r->prelen;
8700 ret->nparens = r->nparens;
8701 ret->lastparen = r->lastparen;
8702 ret->lastcloseparen = r->lastcloseparen;
8703 ret->intflags = r->intflags;
8704 ret->extflags = r->extflags;
8706 ret->sublen = r->sublen;
8708 ret->engine = r->engine;
8710 ret->paren_names = hv_dup_inc(r->paren_names, param);
8712 if (RX_MATCH_COPIED(ret))
8713 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8716 #ifdef PERL_OLD_COPY_ON_WRITE
8717 ret->saved_copy = NULL;
8720 ret->pprivate = r->pprivate;
8722 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
8724 ptr_table_store(PL_ptr_table, r, ret);
8727 #endif /* PERL_IN_XSUB_RE */
8732 This is the internal complement to regdupe() which is used to copy
8733 the structure pointed to by the *pprivate pointer in the regexp.
8734 This is the core version of the extension overridable cloning hook.
8735 The regexp structure being duplicated will be copied by perl prior
8736 to this and will be provided as the regexp *r argument, however
8737 with the /old/ structures pprivate pointer value. Thus this routine
8738 may override any copying normally done by perl.
8740 It returns a pointer to the new regexp_internal structure.
8744 Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8747 regexp_internal *reti;
8751 npar = r->nparens+1;
8752 len = ri->offsets[0];
8754 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8755 Copy(ri->program, reti->program, len+1, regnode);
8758 Newx(reti->swap, 1, regexp_paren_ofs);
8759 /* no need to copy these */
8760 Newx(reti->swap->startp, npar, I32);
8761 Newx(reti->swap->endp, npar, I32);
8767 reti->regstclass = NULL;
8770 const int count = ri->data->count;
8773 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8774 char, struct reg_data);
8775 Newx(d->what, count, U8);
8778 for (i = 0; i < count; i++) {
8779 d->what[i] = ri->data->what[i];
8780 switch (d->what[i]) {
8781 /* legal options are one of: sSfpontTu
8782 see also regcomp.h and pregfree() */
8785 case 'p': /* actually an AV, but the dup function is identical. */
8786 case 'u': /* actually an HV, but the dup function is identical. */
8787 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
8790 /* This is cheating. */
8791 Newx(d->data[i], 1, struct regnode_charclass_class);
8792 StructCopy(ri->data->data[i], d->data[i],
8793 struct regnode_charclass_class);
8794 reti->regstclass = (regnode*)d->data[i];
8797 /* Compiled op trees are readonly and in shared memory,
8798 and can thus be shared without duplication. */
8800 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
8804 /* Trie stclasses are readonly and can thus be shared
8805 * without duplication. We free the stclass in pregfree
8806 * when the corresponding reg_ac_data struct is freed.
8808 reti->regstclass= ri->regstclass;
8812 ((reg_trie_data*)ri->data->data[i])->refcount++;
8816 d->data[i] = ri->data->data[i];
8819 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
8828 reti->name_list_idx = ri->name_list_idx;
8830 Newx(reti->offsets, 2*len+1, U32);
8831 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
8836 #endif /* USE_ITHREADS */
8841 converts a regexp embedded in a MAGIC struct to its stringified form,
8842 caching the converted form in the struct and returns the cached
8845 If lp is nonnull then it is used to return the length of the
8848 If flags is nonnull and the returned string contains UTF8 then
8849 (*flags & 1) will be true.
8851 If haseval is nonnull then it is used to return whether the pattern
8854 Normally called via macro:
8856 CALLREG_STRINGIFY(mg,&len,&utf8);
8860 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
8862 See sv_2pv_flags() in sv.c for an example of internal usage.
8865 #ifndef PERL_IN_XSUB_RE
8867 Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8869 const regexp * const re = (regexp *)mg->mg_obj;
8872 const char *fptr = "msix";
8875 bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
8876 bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
8877 U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12);
8878 bool need_newline = 0;
8880 int right = 4 + hask;
8882 reflags[left++]='k';
8883 while((ch = *fptr++)) {
8885 reflags[left++] = ch;
8888 reflags[right--] = ch;
8893 reflags[left] = '-';
8896 /* printf("[%*.7s]\n",left,reflags); */
8897 mg->mg_len = re->prelen + 4 + left;
8899 * If /x was used, we have to worry about a regex ending with a
8900 * comment later being embedded within another regex. If so, we don't
8901 * want this regex's "commentization" to leak out to the right part of
8902 * the enclosing regex, we must cap it with a newline.
8904 * So, if /x was used, we scan backwards from the end of the regex. If
8905 * we find a '#' before we find a newline, we need to add a newline
8906 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8907 * we don't need to add anything. -jfriedl
8909 if (PMf_EXTENDED & re->extflags) {
8910 const char *endptr = re->precomp + re->prelen;
8911 while (endptr >= re->precomp) {
8912 const char c = *(endptr--);
8914 break; /* don't need another */
8916 /* we end while in a comment, so we need a newline */
8917 mg->mg_len++; /* save space for it */
8918 need_newline = 1; /* note to add it */
8924 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8925 mg->mg_ptr[0] = '(';
8926 mg->mg_ptr[1] = '?';
8927 Copy(reflags, mg->mg_ptr+2, left, char);
8928 *(mg->mg_ptr+left+2) = ':';
8929 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8931 mg->mg_ptr[mg->mg_len - 2] = '\n';
8932 mg->mg_ptr[mg->mg_len - 1] = ')';
8933 mg->mg_ptr[mg->mg_len] = 0;
8936 *haseval = re->seen_evals;
8938 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
8946 - regnext - dig the "next" pointer out of a node
8949 Perl_regnext(pTHX_ register regnode *p)
8952 register I32 offset;
8957 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8966 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8969 STRLEN l1 = strlen(pat1);
8970 STRLEN l2 = strlen(pat2);
8973 const char *message;
8979 Copy(pat1, buf, l1 , char);
8980 Copy(pat2, buf + l1, l2 , char);
8981 buf[l1 + l2] = '\n';
8982 buf[l1 + l2 + 1] = '\0';
8984 /* ANSI variant takes additional second argument */
8985 va_start(args, pat2);
8989 msv = vmess(buf, &args);
8991 message = SvPV_const(msv,l1);
8994 Copy(message, buf, l1 , char);
8995 buf[l1-1] = '\0'; /* Overwrite \n */
8996 Perl_croak(aTHX_ "%s", buf);
8999 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9001 #ifndef PERL_IN_XSUB_RE
9003 Perl_save_re_context(pTHX)
9007 struct re_save_state *state;
9009 SAVEVPTR(PL_curcop);
9010 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9012 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9013 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9014 SSPUSHINT(SAVEt_RE_STATE);
9016 Copy(&PL_reg_state, state, 1, struct re_save_state);
9018 PL_reg_start_tmp = 0;
9019 PL_reg_start_tmpl = 0;
9020 PL_reg_oldsaved = NULL;
9021 PL_reg_oldsavedlen = 0;
9023 PL_reg_leftiter = 0;
9024 PL_reg_poscache = NULL;
9025 PL_reg_poscache_size = 0;
9026 #ifdef PERL_OLD_COPY_ON_WRITE
9030 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9032 const REGEXP * const rx = PM_GETRE(PL_curpm);
9035 for (i = 1; i <= rx->nparens; i++) {
9036 char digits[TYPE_CHARS(long)];
9037 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9038 GV *const *const gvp
9039 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9042 GV * const gv = *gvp;
9043 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9053 clear_re(pTHX_ void *r)
9056 ReREFCNT_dec((regexp *)r);
9062 S_put_byte(pTHX_ SV *sv, int c)
9064 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9065 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9066 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9067 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9069 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9073 #define CLEAR_OPTSTART \
9074 if (optstart) STMT_START { \
9075 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9079 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9081 STATIC const regnode *
9082 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9083 const regnode *last, const regnode *plast,
9084 SV* sv, I32 indent, U32 depth)
9087 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9088 register const regnode *next;
9089 const regnode *optstart= NULL;
9092 GET_RE_DEBUG_FLAGS_DECL;
9094 #ifdef DEBUG_DUMPUNTIL
9095 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9096 last ? last-start : 0,plast ? plast-start : 0);
9099 if (plast && plast < last)
9102 while (PL_regkind[op] != END && (!last || node < last)) {
9103 /* While that wasn't END last time... */
9106 if (op == CLOSE || op == WHILEM)
9108 next = regnext((regnode *)node);
9111 if (OP(node) == OPTIMIZED) {
9112 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9119 regprop(r, sv, node);
9120 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9121 (int)(2*indent + 1), "", SvPVX_const(sv));
9123 if (OP(node) != OPTIMIZED) {
9124 if (next == NULL) /* Next ptr. */
9125 PerlIO_printf(Perl_debug_log, " (0)");
9126 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9127 PerlIO_printf(Perl_debug_log, " (FAIL)");
9129 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9130 (void)PerlIO_putc(Perl_debug_log, '\n');
9134 if (PL_regkind[(U8)op] == BRANCHJ) {
9137 register const regnode *nnode = (OP(next) == LONGJMP
9138 ? regnext((regnode *)next)
9140 if (last && nnode > last)
9142 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9145 else if (PL_regkind[(U8)op] == BRANCH) {
9147 DUMPUNTIL(NEXTOPER(node), next);
9149 else if ( PL_regkind[(U8)op] == TRIE ) {
9150 const regnode *this_trie = node;
9151 const char op = OP(node);
9152 const U32 n = ARG(node);
9153 const reg_ac_data * const ac = op>=AHOCORASICK ?
9154 (reg_ac_data *)ri->data->data[n] :
9156 const reg_trie_data * const trie =
9157 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9159 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9161 const regnode *nextbranch= NULL;
9163 sv_setpvn(sv, "", 0);
9164 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9165 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9167 PerlIO_printf(Perl_debug_log, "%*s%s ",
9168 (int)(2*(indent+3)), "",
9169 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9170 PL_colors[0], PL_colors[1],
9171 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9172 PERL_PV_PRETTY_ELIPSES |
9178 U16 dist= trie->jump[word_idx+1];
9179 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9180 (UV)((dist ? this_trie + dist : next) - start));
9183 nextbranch= this_trie + trie->jump[0];
9184 DUMPUNTIL(this_trie + dist, nextbranch);
9186 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9187 nextbranch= regnext((regnode *)nextbranch);
9189 PerlIO_printf(Perl_debug_log, "\n");
9192 if (last && next > last)
9197 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9198 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9199 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9201 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9203 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9205 else if ( op == PLUS || op == STAR) {
9206 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9208 else if (op == ANYOF) {
9209 /* arglen 1 + class block */
9210 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9211 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9212 node = NEXTOPER(node);
9214 else if (PL_regkind[(U8)op] == EXACT) {
9215 /* Literal string, where present. */
9216 node += NODE_SZ_STR(node) - 1;
9217 node = NEXTOPER(node);
9220 node = NEXTOPER(node);
9221 node += regarglen[(U8)op];
9223 if (op == CURLYX || op == OPEN)
9227 #ifdef DEBUG_DUMPUNTIL
9228 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9233 #endif /* DEBUGGING */
9237 * c-indentation-style: bsd
9239 * indent-tabs-mode: t
9242 * ex: set ts=8 sts=4 sw=4 noet: